diff options
author | Ingo Molnar <mingo@kernel.org> | 2023-09-18 22:17:15 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2023-09-18 22:17:15 +0200 |
commit | 6f23fc47c1b2ac226704fb7294f43ed3b0965e51 (patch) | |
tree | 079303a4d4ed8489537ab300f129581ec9ca2594 /kernel | |
parent | e35a6cf1cc343d720ad235f678f1cd2a9876b777 (diff) | |
parent | ce9ecca0238b140b88f43859b211c9fdfd8e5b70 (diff) | |
download | linux-stable-6f23fc47c1b2ac226704fb7294f43ed3b0965e51.tar.gz linux-stable-6f23fc47c1b2ac226704fb7294f43ed3b0965e51.tar.bz2 linux-stable-6f23fc47c1b2ac226704fb7294f43ed3b0965e51.zip |
Merge tag 'v6.6-rc2' into locking/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
166 files changed, 9281 insertions, 4083 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec new file mode 100644 index 000000000000..9bfe68fe9676 --- /dev/null +++ b/kernel/Kconfig.kexec @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Kexec and crash features" + +config CRASH_CORE + bool + +config KEXEC_CORE + select CRASH_CORE + bool + +config KEXEC_ELF + bool + +config HAVE_IMA_KEXEC + bool + +config KEXEC + bool "Enable kexec system call" + depends on ARCH_SUPPORTS_KEXEC + select KEXEC_CORE + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. As of this writing the exact hardware + interface is strongly in flux, so no good recommendation can be + made. + +config KEXEC_FILE + bool "Enable kexec file based system call" + depends on ARCH_SUPPORTS_KEXEC_FILE + select KEXEC_CORE + help + This is new version of kexec system call. This system call is + file based and takes file descriptors as system call argument + for kernel and initramfs as opposed to list of segments as + accepted by kexec system call. + +config KEXEC_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on ARCH_SUPPORTS_KEXEC_SIG + depends on KEXEC_FILE + help + This option makes the kexec_file_load() syscall check for a valid + signature of the kernel image. The image can still be loaded without + a valid signature unless you also enable KEXEC_SIG_FORCE, though if + there's a signature that we can check, then it must be valid. 
+ + In addition to this option, you need to enable signature + verification for the corresponding kernel image type being + loaded in order for this to work. + +config KEXEC_SIG_FORCE + bool "Require a valid signature in kexec_file_load() syscall" + depends on ARCH_SUPPORTS_KEXEC_SIG_FORCE + depends on KEXEC_SIG + help + This option makes kernel signature verification mandatory for + the kexec_file_load() syscall. + +config KEXEC_IMAGE_VERIFY_SIG + bool "Enable Image signature verification support (ARM)" + default ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG + depends on ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG + depends on KEXEC_SIG + depends on EFI && SIGNED_PE_FILE_VERIFICATION + help + Enable Image signature verification support. + +config KEXEC_BZIMAGE_VERIFY_SIG + bool "Enable bzImage signature verification support" + depends on ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG + depends on KEXEC_SIG + depends on SIGNED_PE_FILE_VERIFICATION + select SYSTEM_TRUSTED_KEYRING + help + Enable bzImage signature verification support. + +config KEXEC_JUMP + bool "kexec jump" + depends on ARCH_SUPPORTS_KEXEC_JUMP + depends on KEXEC && HIBERNATION + help + Jump between original kernel and kexeced kernel and invoke + code in physical address mode via KEXEC + +config CRASH_DUMP + bool "kernel crash dumps" + depends on ARCH_SUPPORTS_CRASH_DUMP + depends on ARCH_SUPPORTS_KEXEC + select CRASH_CORE + select KEXEC_CORE + select KEXEC + help + Generate crash dump after being started by kexec. + This should be normally only set in special crash dump kernels + which are loaded in the main kernel with kexec-tools into + a specially reserved region and then later executed after + a crash by kdump/kexec. The crash dump kernel must be compiled + to a memory address not used by the main kernel or BIOS using + PHYSICAL_START, or it must be built as a relocatable image + (CONFIG_RELOCATABLE=y). + For more details see Documentation/admin-guide/kdump/kdump.rst + + For s390, this option also enables zfcpdump. 
+ See also <file:Documentation/s390/zfcpdump.rst> + +config CRASH_HOTPLUG + bool "Update the crash elfcorehdr on system configuration changes" + default y + depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG) + depends on ARCH_SUPPORTS_CRASH_HOTPLUG + help + Enable direct update to the crash elfcorehdr (which contains + the list of CPUs and memory regions to be dumped upon a crash) + in response to hot plug/unplug or online/offline of CPUs or + memory. This is a much more advanced approach than userspace + attempting that. + + If unsure, say Y. + +config CRASH_MAX_MEMORY_RANGES + int "Specify the maximum number of memory regions for the elfcorehdr" + default 8192 + depends on CRASH_HOTPLUG + help + For the kexec_file_load() syscall path, specify the maximum number of + memory regions that the elfcorehdr buffer/segment can accommodate. + These regions are obtained via walk_system_ram_res(); eg. the + 'System RAM' entries in /proc/iomem. + This value is combined with NR_CPUS_DEFAULT and multiplied by + sizeof(Elf64_Phdr) to determine the final elfcorehdr memory buffer/ + segment size. + The value 8192, for example, covers a (sparsely populated) 1TiB system + consisting of 128MiB memblocks, while resulting in an elfcorehdr + memory buffer/segment size under 1MiB. This represents a sane choice + to accommodate both baremetal and virtual machine configurations. + + For the kexec_load() syscall path, CRASH_MAX_MEMORY_RANGES is part of + the computation behind the value provided through the + /sys/kernel/crash_elfcorehdr_size attribute. 
+ +endmenu diff --git a/kernel/acct.c b/kernel/acct.c index 010667ce6080..1a9f929fe629 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -445,7 +445,7 @@ static void fill_ac(acct_t *ac) memset(ac, 0, sizeof(acct_t)); ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); + strscpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); /* calculate run_time in nsec*/ run_time = ktime_get_ns(); @@ -470,7 +470,7 @@ static void fill_ac(acct_t *ac) do_div(elapsed, AHZ); btime = ktime_get_real_seconds() - elapsed; ac->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); -#if ACCT_VERSION==2 +#if ACCT_VERSION == 2 ac->ac_ahz = AHZ; #endif diff --git a/kernel/audit.c b/kernel/audit.c index 9bc0b0301198..16205dd29843 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -53,9 +53,7 @@ #include <net/sock.h> #include <net/netlink.h> #include <linux/skbuff.h> -#ifdef CONFIG_SECURITY #include <linux/security.h> -#endif #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <net/netns/generic.h> @@ -323,7 +321,8 @@ static inline int audit_rate_check(void) unsigned long now; int retval = 0; - if (!audit_rate_limit) return 1; + if (!audit_rate_limit) + return 1; spin_lock_irqsave(&lock, flags); if (++messages < audit_rate_limit) { diff --git a/kernel/audit.h b/kernel/audit.h index 94738bce40b2..a60d2840559e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -334,7 +334,7 @@ static inline int audit_signal_info_syscall(struct task_struct *t) return 0; } -#define audit_filter_inodes(t, c) AUDIT_STATE_DISABLED +#define audit_filter_inodes(t, c) do { } while (0) #endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 42d99896e7a6..8317a37dea0b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -221,7 +221,7 @@ static int audit_match_signal(struct audit_entry *entry) entry->rule.mask)); } - 
switch(audit_classify_arch(arch->val)) { + switch (audit_classify_arch(arch->val)) { case 0: /* native */ return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, entry->rule.mask)); @@ -243,7 +243,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * err = -EINVAL; listnr = rule->flags & ~AUDIT_FILTER_PREPEND; - switch(listnr) { + switch (listnr) { default: goto exit_err; #ifdef CONFIG_AUDITSYSCALL @@ -344,7 +344,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) switch (entry->rule.listnr) { case AUDIT_FILTER_FS: - switch(f->type) { + switch (f->type) { case AUDIT_FSTYPE: case AUDIT_FILTERKEY: break; @@ -651,7 +651,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->fields[i] = f->type; data->fieldflags[i] = audit_ops[f->op]; - switch(f->type) { + switch (f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -694,7 +694,8 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->values[i] = f->val; } } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + data->mask[i] = krule->mask[i]; return data; } @@ -717,7 +718,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) a->fields[i].op != b->fields[i].op) return 1; - switch(a->fields[i].type) { + switch (a->fields[i].type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -946,7 +947,7 @@ static inline int audit_add_rule(struct audit_entry *entry) int dont_count = 0; /* If any of these, don't count towards total */ - switch(entry->rule.listnr) { + switch (entry->rule.listnr) { case AUDIT_FILTER_USER: case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: @@ -1029,7 +1030,7 @@ int audit_del_rule(struct audit_entry *entry) int dont_count = 0; /* If any of these, don't count towards total */ - switch(entry->rule.listnr) { + switch (entry->rule.listnr) { case 
AUDIT_FILTER_USER: case AUDIT_FILTER_EXCLUDE: case AUDIT_FILTER_FS: @@ -1083,7 +1084,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q) /* This is a blocking read, so use audit_filter_mutex instead of rcu * iterator to sync with list writers. */ - for (i=0; i<AUDIT_NR_FILTERS; i++) { + for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry(r, &audit_rules_list[i], list) { struct audit_rule_data *data; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index addeed3df15d..21d2fa815e78 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -143,6 +143,8 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = { { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" }, { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" }, { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" }, + { AUDIT_NFT_OP_SETELEM_RESET, "nft_reset_setelem" }, + { AUDIT_NFT_OP_RULE_RESET, "nft_reset_rule" }, { AUDIT_NFT_OP_INVALID, "nft_invalid" }, }; @@ -880,7 +882,8 @@ static void audit_filter_syscall(struct task_struct *tsk, */ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, - struct audit_context *ctx) { + struct audit_context *ctx) +{ int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; @@ -1064,7 +1067,8 @@ int audit_alloc(struct task_struct *tsk) return 0; } - if (!(context = audit_alloc_context(state))) { + context = audit_alloc_context(state); + if (!context) { kfree(key); audit_log_lost("out of memory in audit_alloc"); return -ENOMEM; @@ -2124,7 +2128,7 @@ retry: d = dentry; rcu_read_lock(); seq = read_seqbegin(&rename_lock); - for(;;) { + for (;;) { struct inode *inode = d_backing_inode(d); if (inode && unlikely(inode->i_fsnotify_marks)) { @@ -2456,6 +2460,8 @@ void __audit_inode_child(struct inode *parent, } } + cond_resched(); + /* is there a matching child entry? 
*/ list_for_each_entry(n, &context->names_list, list) { /* can only match entries that have a name */ diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 2dfe1079f772..6a906ff93006 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -31,6 +31,7 @@ config BPF_SYSCALL select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET + select NET_XGRESS if NET select PAGE_POOL if NET default n help diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 1d3892168d32..f526b7573e97 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o -obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o obj-$(CONFIG_BPF_JIT) += dispatcher.o @@ -21,6 +21,7 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o +obj-$(CONFIG_BPF_SYSCALL) += tcx.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b5149cfce7d4..146824cc9689 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -553,7 +553,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem = NULL; + struct bpf_local_storage_elem *alloc_selem, *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -607,11 +607,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map 
*smap, } } - if (gfp_flags == GFP_KERNEL) { - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); - if (!selem) - return ERR_PTR(-ENOMEM); - } + /* A lookup has just been done before and concluded a new selem is + * needed. The chance of an unnecessary alloc is unlikely. + */ + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!alloc_selem) + return ERR_PTR(-ENOMEM); raw_spin_lock_irqsave(&local_storage->lock, flags); @@ -623,13 +624,13 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, * simple. */ err = -EAGAIN; - goto unlock_err; + goto unlock; } old_sdata = bpf_local_storage_lookup(local_storage, smap, false); err = check_flags(old_sdata, map_flags); if (err) - goto unlock_err; + goto unlock; if (old_sdata && (map_flags & BPF_F_LOCK)) { copy_map_value_locked(&smap->map, old_sdata->data, value, @@ -638,23 +639,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - if (gfp_flags != GFP_KERNEL) { - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). 
- */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); - if (!selem) { - err = -ENOMEM; - goto unlock_err; - } - } - + alloc_selem = NULL; /* First, link the new selem to the map */ bpf_selem_link_map(smap, selem); @@ -665,20 +650,16 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false, false); + true, false); } unlock: raw_spin_unlock_irqrestore(&local_storage->lock, flags); - return SDATA(selem); - -unlock_err: - raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (selem) { + if (alloc_selem) { mem_uncharge(smap, owner, smap->elem_size); - bpf_selem_free(selem, smap, true); + bpf_selem_free(alloc_selem, smap, true); } - return ERR_PTR(err); + return err ? ERR_PTR(err) : SDATA(selem); } static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) @@ -779,7 +760,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. 
*/ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, true); + local_storage, selem, true, true); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 8f3c8b2b4490..cbd8d3720c2b 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -75,6 +75,5 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, void bpf_lru_destroy(struct bpf_lru *lru); struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); -void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); #endif diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 116a0ce378ec..fdc3e8705a3c 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -374,9 +374,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_member *member; const struct btf_type *t = st_ops->type; - struct bpf_tramp_links *tlinks = NULL; + struct bpf_tramp_links *tlinks; void *udata, *kdata; - int prog_fd, err = 0; + int prog_fd, err; void *image, *image_end; u32 i; @@ -509,9 +509,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, } if (st_map->map.map_flags & BPF_F_LINK) { - err = st_ops->validate(kdata); - if (err) - goto reset_unlock; + err = 0; + if (st_ops->validate) { + err = st_ops->validate(kdata); + if (err) + goto reset_unlock; + } set_memory_rox((long)st_map->image, 1); /* Let bpf_link handle registration & unregistration. 
* @@ -663,9 +666,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) if (attr->value_size != vt->size) return ERR_PTR(-EINVAL); - if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update)) - return ERR_PTR(-EOPNOTSUPP); - t = st_ops->type; st_map_size = sizeof(*st_map) + @@ -815,7 +815,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map struct bpf_struct_ops_map *st_map, *old_st_map; struct bpf_map *old_map; struct bpf_struct_ops_link *st_link; - int err = 0; + int err; st_link = container_of(link, struct bpf_struct_ops_link, link); st_map = container_of(new_map, struct bpf_struct_ops_map, map); @@ -823,6 +823,9 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map if (!bpf_struct_ops_valid_to_reg(new_map)) return -EINVAL; + if (!st_map->st_ops->update) + return -EOPNOTSUPP; + mutex_lock(&update_mutex); old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 817204d53372..1095bbe29859 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -29,6 +29,7 @@ #include <net/netfilter/nf_bpf_link.h> #include <net/sock.h> +#include <net/xdp.h> #include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes @@ -552,7 +553,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) return -ENOENT; } -static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) +s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) { struct btf *btf; s32 ret; @@ -6133,8 +6134,9 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; - *flag = 0; again: + if (btf_type_is_modifier(t)) + t = btf_type_skip_modifiers(btf, t->type, NULL); tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is 
not a struct\n", tname); @@ -6142,6 +6144,14 @@ again: } vlen = btf_type_vlen(t); + if (BTF_INFO_KIND(t->info) == BTF_KIND_UNION && vlen != 1 && !(*flag & PTR_UNTRUSTED)) + /* + * walking unions yields untrusted pointers + * with exception of __bpf_md_ptr and other + * unions with a single member + */ + *flag |= PTR_UNTRUSTED; + if (off + size > t->size) { /* If the last element is a variable size array, we may * need to relax the rule. @@ -6302,15 +6312,6 @@ error: * of this field or inside of this struct */ if (btf_type_is_struct(mtype)) { - if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION && - btf_type_vlen(mtype) != 1) - /* - * walking unions yields untrusted pointers - * with exception of __bpf_md_ptr and other - * unions with a single member - */ - *flag |= PTR_UNTRUSTED; - /* our field must be inside that union or struct */ t = mtype; @@ -6368,7 +6369,7 @@ error: * that also allows using an array of int as a scratch * space. e.g. skb->cb[]. */ - if (off + size > mtrue_end) { + if (off + size > mtrue_end && !(*flag & PTR_UNTRUSTED)) { bpf_log(log, "access beyond the end of member %s (mend:%u) in struct %s with off %u size %u\n", mname, mtrue_end, tname, off, size); @@ -6476,7 +6477,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, bool strict) { const struct btf_type *type; - enum bpf_type_flag flag; + enum bpf_type_flag flag = 0; int err; /* Are we already done? 
*/ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dc85240a0134..4e3ce0542e31 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -61,6 +61,7 @@ #define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] +#define OFF insn->off #define IMM insn->imm struct bpf_mem_alloc bpf_global_ma; @@ -372,7 +373,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, { const s32 off_min = S16_MIN, off_max = S16_MAX; s32 delta = end_new - end_old; - s32 off = insn->off; + s32 off; + + if (insn->code == (BPF_JMP32 | BPF_JA)) + off = insn->imm; + else + off = insn->off; if (curr < pos && curr + off + 1 >= end_old) off += delta; @@ -380,8 +386,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, off -= delta; if (off < off_min || off > off_max) return -ERANGE; - if (!probe_pass) - insn->off = off; + if (!probe_pass) { + if (insn->code == (BPF_JMP32 | BPF_JA)) + insn->imm = off; + else + insn->off = off; + } return 0; } @@ -860,7 +870,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins GFP_KERNEL); if (!pack) return NULL; - pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); + pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); if (!pack->ptr) { kfree(pack); return NULL; @@ -884,7 +894,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); - ptr = module_alloc(size); + ptr = bpf_jit_alloc_exec(size); if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); @@ -922,7 +932,7 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr) mutex_lock(&pack_mutex); if (hdr->size > BPF_PROG_PACK_SIZE) { - module_memfree(hdr); + bpf_jit_free_exec(hdr); goto out; } @@ -946,7 +956,7 @@ void bpf_prog_pack_free(struct bpf_binary_header *hdr) if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) 
{ list_del(&pack->list); - module_memfree(pack->ptr); + bpf_jit_free_exec(pack->ptr); kfree(pack); } out: @@ -1271,7 +1281,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_ALU | BPF_MOD | BPF_K: *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_ALU64 | BPF_ADD | BPF_K: @@ -1285,7 +1295,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_ALU64 | BPF_MOD | BPF_K: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX); + *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_JMP | BPF_JEQ | BPF_K: @@ -1523,6 +1533,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(ALU64, DIV, X), \ INSN_3(ALU64, MOD, X), \ INSN_2(ALU64, NEG), \ + INSN_3(ALU64, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU64, ADD, K), \ INSN_3(ALU64, SUB, K), \ @@ -1591,6 +1602,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSLE, K), \ INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ + INSN_2(JMP32, JA), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ @@ -1610,6 +1622,9 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(LDX, MEM, H), \ INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ + INSN_3(LDX, MEMSX, B), \ + INSN_3(LDX, MEMSX, H), \ + INSN_3(LDX, MEMSX, W), \ /* Immediate based. 
*/ \ INSN_3(LD, IMM, DW) @@ -1635,12 +1650,6 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON -u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) -{ - memset(dst, 0, size); - return -EFAULT; -} - /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1666,6 +1675,9 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, + [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B, + [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H, + [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL @@ -1733,13 +1745,36 @@ select_insn: DST = -DST; CONT; ALU_MOV_X: - DST = (u32) SRC; + switch (OFF) { + case 0: + DST = (u32) SRC; + break; + case 8: + DST = (u32)(s8) SRC; + break; + case 16: + DST = (u32)(s16) SRC; + break; + } CONT; ALU_MOV_K: DST = (u32) IMM; CONT; ALU64_MOV_X: - DST = SRC; + switch (OFF) { + case 0: + DST = SRC; + break; + case 8: + DST = (s8) SRC; + break; + case 16: + DST = (s16) SRC; + break; + case 32: + DST = (s32) SRC; + break; + } CONT; ALU64_MOV_K: DST = IMM; @@ -1761,36 +1796,114 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &AX); - DST = AX; + switch (OFF) { + case 0: + div64_u64_rem(DST, SRC, &AX); + DST = AX; + break; + case 1: + AX = div64_s64(DST, SRC); + DST = DST - AX * SRC; + break; + } CONT; ALU_MOD_X: - AX = (u32) DST; - DST = do_div(AX, (u32) SRC); + switch (OFF) { + case 0: + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); + break; + case 1: + AX = abs((s32)DST); + AX = do_div(AX, abs((s32)SRC)); + if ((s32)DST < 0) + DST = (u32)-AX; + else + DST = (u32)AX; + break; + } CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &AX); - DST = AX; + switch 
(OFF) { + case 0: + div64_u64_rem(DST, IMM, &AX); + DST = AX; + break; + case 1: + AX = div64_s64(DST, IMM); + DST = DST - AX * IMM; + break; + } CONT; ALU_MOD_K: - AX = (u32) DST; - DST = do_div(AX, (u32) IMM); + switch (OFF) { + case 0: + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); + break; + case 1: + AX = abs((s32)DST); + AX = do_div(AX, abs((s32)IMM)); + if ((s32)DST < 0) + DST = (u32)-AX; + else + DST = (u32)AX; + break; + } CONT; ALU64_DIV_X: - DST = div64_u64(DST, SRC); + switch (OFF) { + case 0: + DST = div64_u64(DST, SRC); + break; + case 1: + DST = div64_s64(DST, SRC); + break; + } CONT; ALU_DIV_X: - AX = (u32) DST; - do_div(AX, (u32) SRC); - DST = (u32) AX; + switch (OFF) { + case 0: + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; + break; + case 1: + AX = abs((s32)DST); + do_div(AX, abs((s32)SRC)); + if (((s32)DST < 0) == ((s32)SRC < 0)) + DST = (u32)AX; + else + DST = (u32)-AX; + break; + } CONT; ALU64_DIV_K: - DST = div64_u64(DST, IMM); + switch (OFF) { + case 0: + DST = div64_u64(DST, IMM); + break; + case 1: + DST = div64_s64(DST, IMM); + break; + } CONT; ALU_DIV_K: - AX = (u32) DST; - do_div(AX, (u32) IMM); - DST = (u32) AX; + switch (OFF) { + case 0: + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; + break; + case 1: + AX = abs((s32)DST); + do_div(AX, abs((s32)IMM)); + if (((s32)DST < 0) == ((s32)IMM < 0)) + DST = (u32)AX; + else + DST = (u32)-AX; + break; + } CONT; ALU_END_TO_BE: switch (IMM) { @@ -1818,6 +1931,19 @@ select_insn: break; } CONT; + ALU64_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) __swab16(DST); + break; + case 32: + DST = (__force u32) __swab32(DST); + break; + case 64: + DST = (__force u64) __swab64(DST); + break; + } + CONT; /* CALL */ JMP_CALL: @@ -1867,6 +1993,9 @@ out: JMP_JA: insn += insn->off; CONT; + JMP32_JA: + insn += insn->imm; + CONT; JMP_EXIT: return BPF_R0; /* JMP */ @@ -1931,8 +2060,8 @@ out: DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ 
LDX_PROBE_MEM_##SIZEOP: \ - bpf_probe_read_kernel(&DST, sizeof(SIZE), \ - (const void *)(long) (SRC + insn->off)); \ + bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ + (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; @@ -1942,6 +2071,21 @@ out: LDST(DW, u64) #undef LDST +#define LDSX(SIZEOP, SIZE) \ + LDX_MEMSX_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; \ + LDX_PROBE_MEMSX_##SIZEOP: \ + bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ + (const void *)(long) (SRC + insn->off)); \ + DST = *((SIZE *)&DST); \ + CONT; + + LDSX(B, s8) + LDSX(H, s16) + LDSX(W, s32) +#undef LDSX + #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ if (BPF_SIZE(insn->code) == BPF_W) \ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 6ae02be7a48e..e42a1bdb7f53 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -28,6 +28,7 @@ #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/kthread.h> +#include <linux/completion.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> @@ -60,8 +61,6 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; - struct bpf_cpu_map *cmap; - /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -69,10 +68,8 @@ struct bpf_cpu_map_entry { struct bpf_cpumap_val value; struct bpf_prog *prog; - atomic_t refcnt; /* Control when this struct can be free'ed */ - struct rcu_head rcu; - - struct work_struct kthread_stop_wq; + struct completion kthread_running; + struct rcu_work free_work; }; struct bpf_cpu_map { @@ -117,11 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; } -static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - atomic_inc(&rcpu->refcnt); -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that 
queue is @@ -129,47 +121,16 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ - struct xdp_frame *xdpf; - - while ((xdpf = ptr_ring_consume(ring))) - if (WARN_ON_ONCE(xdpf)) - xdp_return_frame(xdpf); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - if (rcpu->prog) - bpf_prog_put(rcpu->prog); - /* The queue should be empty at this point */ - __cpu_map_ring_cleanup(rcpu->queue); - ptr_ring_cleanup(rcpu->queue, NULL); - kfree(rcpu->queue); - kfree(rcpu); - } -} - -/* called from workqueue, to workaround syscall using preempt_disable */ -static void cpu_map_kthread_stop(struct work_struct *work) -{ - struct bpf_cpu_map_entry *rcpu; - int err; - - rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + void *ptr; - /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, - * as it waits until all in-flight call_rcu() callbacks complete. - */ - rcu_barrier(); - - /* kthread_stop will wake_up_process and wait for it to complete */ - err = kthread_stop(rcpu->kthread); - if (err) { - /* kthread_stop may be called before cpu_map_kthread_run - * is executed, so we need to release the memory related - * to rcpu. 
- */ - put_cpu_map_entry(rcpu); + while ((ptr = ptr_ring_consume(ring))) { + WARN_ON_ONCE(1); + if (unlikely(__ptr_test_bit(0, &ptr))) { + __ptr_clear_bit(0, &ptr); + kfree_skb(ptr); + continue; + } + xdp_return_frame(ptr); } } @@ -298,11 +259,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, return nframes; } - static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; + complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); /* When kthread gives stop order, then rcpu have been disconnected @@ -397,7 +358,6 @@ static int cpu_map_kthread_run(void *data) } __set_current_state(TASK_RUNNING); - put_cpu_map_entry(rcpu); return 0; } @@ -467,19 +427,23 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, goto free_ptr_ring; /* Setup kthread */ + init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); if (IS_ERR(rcpu->kthread)) goto free_prog; - get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ - get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ - /* Make sure kthread runs on a single CPU */ kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); + /* Make sure kthread has been running, so kthread_stop() will not + * stop the kthread prematurely and all pending frames or skbs + * will be handled by the kthread before kthread_stop() returns. + */ + wait_for_completion(&rcpu->kthread_running); + return rcpu; free_prog: @@ -496,40 +460,40 @@ free_rcu: return NULL; } -static void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct work_struct *work) { struct bpf_cpu_map_entry *rcpu; /* This cpu_map_entry have been disconnected from map and one - * RCU grace-period have elapsed. Thus, XDP cannot queue any + * RCU grace-period have elapsed. 
Thus, XDP cannot queue any * new packets and cannot change/set flush_needed that can * find this entry. */ - rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work); + + /* kthread_stop will wake_up_process and wait for it to complete. + * cpu_map_kthread_run() makes sure the pointer ring is empty + * before exiting. + */ + kthread_stop(rcpu->kthread); + if (rcpu->prog) + bpf_prog_put(rcpu->prog); + /* The queue should be empty at this point */ + __cpu_map_ring_cleanup(rcpu->queue); + ptr_ring_cleanup(rcpu->queue, NULL); + kfree(rcpu->queue); free_percpu(rcpu->bulkq); - /* Cannot kthread_stop() here, last put free rcpu resources */ - put_cpu_map_entry(rcpu); + kfree(rcpu); } -/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to - * ensure any driver rcu critical sections have completed, but this - * does not guarantee a flush has happened yet. Because driver side - * rcu_read_lock/unlock only protects the running XDP program. The - * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a - * pending flush op doesn't fail. - * - * The bpf_cpu_map_entry is still used by the kthread, and there can - * still be pending packets (in queue and percpu bulkq). A refcnt - * makes sure to last user (kthread_stop vs. call_rcu) free memory - * resources. - * - * The rcu callback __cpu_map_entry_free flush remaining packets in - * percpu bulkq to queue. Due to caller map_delete_elem() disable - * preemption, cannot call kthread_stop() to make sure queue is empty. - * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU grace period before - * stopping kthread, emptying the queue. +/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old + * entry is no longer in use before freeing. We use queue_rcu_work() to call + * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace + * period. 
This means that (a) all pending enqueue and flush operations have + * completed (because of the RCU callback), and (b) we are in a workqueue + * context where we can stop the kthread and wait for it to exit before freeing + * everything. */ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, u32 key_cpu, struct bpf_cpu_map_entry *rcpu) @@ -538,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu))); if (old_rcpu) { - call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); - INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); - schedule_work(&old_rcpu->kthread_stop_wq); + INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free); + queue_rcu_work(system_wq, &old_rcpu->free_work); } } @@ -552,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key) if (key_cpu >= map->max_entries) return -EINVAL; - /* notice caller map_delete_elem() use preempt_disable() */ + /* notice caller map_delete_elem() uses rcu_read_lock() */ __cpu_map_entry_replace(cmap, key_cpu, NULL); return 0; } @@ -588,7 +551,6 @@ static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); if (!rcpu) return -ENOMEM; - rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -604,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map) /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the bpf programs (can be more than one that used this map) were * disconnected from events. Wait for outstanding critical sections in - * these programs to complete. The rcu critical section only guarantees - * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. - * It does __not__ ensure pending flush operations (if any) are - * complete. + * these programs to complete. 
synchronize_rcu() below not only + * guarantees no further "XDP/bpf-side" reads against + * bpf_cpu_map->cpu_map, but also ensure pending flush operations + * (if any) are completed. */ - synchronize_rcu(); - /* For cpu_map the remote CPUs can still be using the entries - * (struct bpf_cpu_map_entry). + /* The only possible user of bpf_cpu_map_entry is + * cpu_map_kthread_run(). */ for (i = 0; i < cmap->map.max_entries; i++) { struct bpf_cpu_map_entry *rcpu; @@ -622,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map) if (!rcpu) continue; - /* bq flush and cleanup happens after RCU grace-period */ - __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ + /* Stop kthread and cleanup entry directly */ + __cpu_map_entry_free(&rcpu->free_work.work); } bpf_map_area_free(cmap->cpu_map); bpf_map_area_free(cmap); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 938a60ff4295..6983af8e093c 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -9,7 +9,6 @@ /** * struct bpf_cpumask - refcounted BPF cpumask wrapper structure * @cpumask: The actual cpumask embedded in the struct. - * @rcu: The RCU head used to free the cpumask with RCU safety. * @usage: Object reference counter. When the refcount goes to 0, the * memory is released back to the BPF allocator, which provides * RCU safety. @@ -25,7 +24,6 @@ */ struct bpf_cpumask { cpumask_t cpumask; - struct rcu_head rcu; refcount_t usage; }; @@ -82,16 +80,6 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) return cpumask; } -static void cpumask_free_cb(struct rcu_head *head) -{ - struct bpf_cpumask *cpumask; - - cpumask = container_of(head, struct bpf_cpumask, rcu); - migrate_disable(); - bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); - migrate_enable(); -} - /** * bpf_cpumask_release() - Release a previously acquired BPF cpumask. * @cpumask: The cpumask being released. 
@@ -102,8 +90,12 @@ static void cpumask_free_cb(struct rcu_head *head) */ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) { - if (refcount_dec_and_test(&cpumask->usage)) - call_rcu(&cpumask->rcu, cpumask_free_cb); + if (!refcount_dec_and_test(&cpumask->usage)) + return; + + migrate_disable(); + bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask); + migrate_enable(); } /** diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 49cc0b5671c6..4d42f6ed6c11 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -65,7 +65,6 @@ struct xdp_dev_bulk_queue { struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; - struct bpf_dtab *dtab; struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; @@ -874,7 +873,6 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, } dev->idx = idx; - dev->dtab = dtab; if (prog) { dev->xdp_prog = prog; dev->val.bpf_prog.id = prog->aux->id; diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 7b4afb7d96db..49940c26a227 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -87,6 +87,17 @@ const char *const bpf_alu_string[16] = { [BPF_END >> 4] = "endian", }; +static const char *const bpf_alu_sign_string[16] = { + [BPF_DIV >> 4] = "s/=", + [BPF_MOD >> 4] = "s%=", +}; + +static const char *const bpf_movsx_string[4] = { + [0] = "(s8)", + [1] = "(s16)", + [3] = "(s32)", +}; + static const char *const bpf_atomic_alu_string[16] = { [BPF_ADD >> 4] = "add", [BPF_AND >> 4] = "and", @@ -101,6 +112,12 @@ static const char *const bpf_ldst_string[] = { [BPF_DW >> 3] = "u64", }; +static const char *const bpf_ldsx_string[] = { + [BPF_W >> 3] = "s32", + [BPF_H >> 3] = "s16", + [BPF_B >> 3] = "s8", +}; + static const char *const bpf_jmp_string[16] = { [BPF_JA >> 4] = "jmp", [BPF_JEQ >> 4] = "==", @@ -128,6 +145,27 @@ static void print_bpf_end_insn(bpf_insn_print_t verbose, insn->imm, insn->dst_reg); } +static void 
print_bpf_bswap_insn(bpf_insn_print_t verbose, + void *private_data, + const struct bpf_insn *insn) +{ + verbose(private_data, "(%02x) r%d = bswap%d r%d\n", + insn->code, insn->dst_reg, + insn->imm, insn->dst_reg); +} + +static bool is_sdiv_smod(const struct bpf_insn *insn) +{ + return (BPF_OP(insn->code) == BPF_DIV || BPF_OP(insn->code) == BPF_MOD) && + insn->off == 1; +} + +static bool is_movsx(const struct bpf_insn *insn) +{ + return BPF_OP(insn->code) == BPF_MOV && + (insn->off == 8 || insn->off == 16 || insn->off == 32); +} + void print_bpf_insn(const struct bpf_insn_cbs *cbs, const struct bpf_insn *insn, bool allow_ptr_leaks) @@ -138,7 +176,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); + print_bpf_bswap_insn(verbose, cbs->private_data, insn); else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { @@ -147,17 +185,20 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", + verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], + is_sdiv_smod(insn) ? bpf_alu_sign_string[BPF_OP(insn->code) >> 4] + : bpf_alu_string[BPF_OP(insn->code) >> 4], + is_movsx(insn) ? bpf_movsx_string[(insn->off >> 3) - 1] : "", class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { verbose(cbs->private_data, "(%02x) %c%d %s %d\n", insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], + is_sdiv_smod(insn) ? 
bpf_alu_sign_string[BPF_OP(insn->code) >> 4] + : bpf_alu_string[BPF_OP(insn->code) >> 4], insn->imm); } } else if (class == BPF_STX) { @@ -218,13 +259,15 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); } } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { + if (BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + BPF_MODE(insn->code) == BPF_MEM ? + bpf_ldst_string[BPF_SIZE(insn->code) >> 3] : + bpf_ldsx_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { @@ -279,6 +322,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); + } else if (insn->code == (BPF_JMP32 | BPF_JA)) { + verbose(cbs->private_data, "(%02x) gotol pc%+d\n", + insn->code, insn->imm); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 56d3da7d0bc6..a8c7e1c5abfa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -302,6 +302,7 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, struct htab_elem *l; if (node) { + bpf_map_inc_elem_count(&htab->map); l = container_of(node, struct htab_elem, lru_node); memcpy(l->key, key, htab->map.key_size); return l; @@ -510,12 +511,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; + err = bpf_map_init_elem_count(&htab->map); + if (err) + goto free_htab; + err = -ENOMEM; htab->buckets = 
bpf_map_area_alloc(htab->n_buckets * sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_htab; + goto free_elem_count; for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, @@ -593,6 +598,8 @@ free_map_locked: bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); +free_elem_count: + bpf_map_free_elem_count(&htab->map); free_htab: lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); @@ -804,6 +811,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); check_and_free_fields(htab, l); + bpf_map_dec_elem_count(&htab->map); break; } @@ -900,6 +908,8 @@ static bool is_map_full(struct bpf_htab *htab) static void inc_elem_count(struct bpf_htab *htab) { + bpf_map_inc_elem_count(&htab->map); + if (htab->use_percpu_counter) percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); else @@ -908,6 +918,8 @@ static void inc_elem_count(struct bpf_htab *htab) static void dec_elem_count(struct bpf_htab *htab) { + bpf_map_dec_elem_count(&htab->map); + if (htab->use_percpu_counter) percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); else @@ -920,6 +932,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) htab_put_fd_value(htab, l); if (htab_is_prealloc(htab)) { + bpf_map_dec_elem_count(&htab->map); check_and_free_fields(htab, l); __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { @@ -1000,6 +1013,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, if (!l) return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); + bpf_map_inc_elem_count(&htab->map); } } else { if (is_map_full(htab)) @@ -1168,6 +1182,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { check_and_free_fields(htab, elem); + bpf_map_dec_elem_count(&htab->map); 
bpf_lru_push_free(&htab->lru, &elem->lru_node); } @@ -1357,8 +1372,10 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, err: htab_unlock_bucket(htab, b, hash, flags); err_lock_bucket: - if (l_new) + if (l_new) { + bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &l_new->lru_node); + } return ret; } @@ -1523,6 +1540,7 @@ static void htab_map_free(struct bpf_map *map) prealloc_destroy(htab); } + bpf_map_free_elem_count(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9e80efa59a5d..8bd3812fb8df 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -286,6 +286,7 @@ static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); + preempt_disable(); arch_spin_lock(l); } @@ -294,6 +295,7 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) arch_spinlock_t *l = (void *)lock; arch_spin_unlock(l); + preempt_enable(); } #else @@ -1913,7 +1915,11 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) if (rec) bpf_obj_free_fields(rec, p); - bpf_mem_free(&bpf_global_ma, p); + + if (rec && rec->refcount_off >= 0) + bpf_mem_free_rcu(&bpf_global_ma, p); + else + bpf_mem_free(&bpf_global_ma, p); } __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) @@ -1942,23 +1948,29 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta return (void *)p__refcounted_kptr; } -static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, +static int __bpf_list_add(struct bpf_list_node_kern *node, + struct bpf_list_head *head, bool tail, struct btf_record *rec, u64 off) { - struct list_head *n = (void *)node, *h = (void *)head; + struct list_head *n = &node->list_head, *h = 
(void *)head; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ if (unlikely(!h->next)) INIT_LIST_HEAD(h); - if (!list_empty(n)) { + + /* node->owner != NULL implies !list_empty(n), no need to separately + * check the latter + */ + if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec); return -EINVAL; } tail ? list_add_tail(n, h) : list_add(n, h); + WRITE_ONCE(node->owner, head); return 0; } @@ -1967,25 +1979,26 @@ __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { + struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; - return __bpf_list_add(node, head, false, - meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { + struct bpf_list_node_kern *n = (void *)node; struct btf_struct_meta *meta = meta__ign; - return __bpf_list_add(node, head, true, - meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) { struct list_head *n, *h = (void *)head; + struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here @@ -1994,8 +2007,14 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai INIT_LIST_HEAD(h); if (list_empty(h)) return NULL; + n = tail ? 
h->prev : h->next; + node = container_of(n, struct bpf_list_node_kern, list_head); + if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) + return NULL; + list_del_init(n); + WRITE_ONCE(node->owner, NULL); return (struct bpf_list_node *)n; } @@ -2012,29 +2031,38 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { + struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node; struct rb_root_cached *r = (struct rb_root_cached *)root; - struct rb_node *n = (struct rb_node *)node; + struct rb_node *n = &node_internal->rb_node; - if (RB_EMPTY_NODE(n)) + /* node_internal->owner != root implies either RB_EMPTY_NODE(n) or + * n is owned by some other tree. No need to check RB_EMPTY_NODE(n) + */ + if (READ_ONCE(node_internal->owner) != root) return NULL; rb_erase_cached(n, r); RB_CLEAR_NODE(n); + WRITE_ONCE(node_internal->owner, NULL); return (struct bpf_rb_node *)n; } /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF * program */ -static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, +static int __bpf_rbtree_add(struct bpf_rb_root *root, + struct bpf_rb_node_kern *node, void *less, struct btf_record *rec, u64 off) { struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; - struct rb_node *parent = NULL, *n = (struct rb_node *)node; + struct rb_node *parent = NULL, *n = &node->rb_node; bpf_callback_t cb = (bpf_callback_t)less; bool leftmost = true; - if (!RB_EMPTY_NODE(n)) { + /* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately + * check the latter + */ + if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { /* Only called from BPF prog, no need to migrate_disable */ __bpf_obj_drop_impl((void *)n - off, rec); return -EINVAL; @@ -2052,6 +2080,7 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, rb_link_node(n, 
parent, link); rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost); + WRITE_ONCE(node->owner, root); return 0; } @@ -2060,8 +2089,9 @@ __bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node void *meta__ign, u64 off) { struct btf_struct_meta *meta = meta__ign; + struct bpf_rb_node_kern *n = (void *)node; - return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off); + return __bpf_rbtree_add(root, n, (void *)less, meta ? meta->record : NULL, off); } __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) @@ -2239,11 +2269,14 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset case BPF_DYNPTR_TYPE_RINGBUF: return ptr->data + ptr->offset + offset; case BPF_DYNPTR_TYPE_SKB: - return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); + if (buffer__opt) + return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer__opt); + else + return skb_pointer_if_linear(ptr->data, ptr->offset + offset, len); case BPF_DYNPTR_TYPE_XDP: { void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len); - if (xdp_ptr) + if (!IS_ERR_OR_NULL(xdp_ptr)) return xdp_ptr; if (!buffer__opt) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4174f76133df..99d0625b6c82 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -118,9 +118,8 @@ static struct inode *bpf_get_inode(struct super_block *sb, return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); - inode->i_atime = current_time(inode); + inode->i_atime = inode_set_ctime_current(inode); inode->i_mtime = inode->i_atime; - inode->i_ctime = inode->i_atime; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); @@ -148,8 +147,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = current_time(dir); - dir->i_ctime = dir->i_mtime; + dir->i_mtime = inode_set_ctime_current(dir); } static int 
bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index b0fa190b0979..6fc9dae9edc8 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -78,8 +78,7 @@ static const struct seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; -BTF_ID_LIST(btf_bpf_map_id) -BTF_ID(struct, bpf_map) +BTF_ID_LIST_GLOBAL_SINGLE(btf_bpf_map_id, struct, bpf_map) static const struct bpf_iter_seq_info bpf_map_seq_info = { .seq_ops = &bpf_map_seq_ops, @@ -93,7 +92,7 @@ static struct bpf_iter_reg bpf_map_reg_info = { .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map, map), - PTR_TO_BTF_ID_OR_NULL }, + PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &bpf_map_seq_info, }; @@ -193,3 +192,40 @@ static int __init bpf_map_iter_init(void) } late_initcall(bpf_map_iter_init); + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +__bpf_kfunc s64 bpf_map_sum_elem_count(const struct bpf_map *map) +{ + s64 *pcount; + s64 ret = 0; + int cpu; + + if (!map || !map->elem_count) + return 0; + + for_each_possible_cpu(cpu) { + pcount = per_cpu_ptr(map->elem_count, cpu); + ret += READ_ONCE(*pcount); + } + return ret; +} + +__diag_pop(); + +BTF_SET8_START(bpf_map_iter_kfunc_ids) +BTF_ID_FLAGS(func, bpf_map_sum_elem_count, KF_TRUSTED_ARGS) +BTF_SET8_END(bpf_map_iter_kfunc_ids) + +static const struct btf_kfunc_id_set bpf_map_iter_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_map_iter_kfunc_ids, +}; + +static int init_subsystem(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_map_iter_kfunc_set); +} +late_initcall(init_subsystem); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 0668bcd7c926..9c49ae53deaf 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -98,11 +98,23 @@ struct bpf_mem_cache { int free_cnt; int low_watermark, high_watermark, batch; int percpu_size; + 
bool draining; + struct bpf_mem_cache *tgt; - struct rcu_head rcu; + /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; + struct llist_node *free_by_rcu_tail; struct llist_head waiting_for_gp; + struct llist_node *waiting_for_gp_tail; + struct rcu_head rcu; atomic_t call_rcu_in_progress; + struct llist_head free_llist_extra_rcu; + + /* list of objects to be freed after RCU tasks trace GP */ + struct llist_head free_by_rcu_ttrace; + struct llist_head waiting_for_gp_ttrace; + struct rcu_head rcu_ttrace; + atomic_t call_rcu_ttrace_in_progress; }; struct bpf_mem_caches { @@ -153,59 +165,87 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) #endif } +static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + /* In RT irq_work runs in per-cpu kthread, so disable + * interrupts to avoid preemption and interrupts and + * reduce the chance of bpf prog executing on this cpu + * when active counter is busy. + */ + local_irq_save(*flags); + /* alloc_bulk runs from irq_work which will not preempt a bpf + * program that does unit_alloc/unit_free since IRQs are + * disabled there. There is no race to increment 'active' + * counter. It protects free_llist from corruption in case NMI + * bpf prog preempted this loop. + */ + WARN_ON_ONCE(local_inc_return(&c->active) != 1); +} + +static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) +{ + local_dec(&c->active); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_restore(*flags); +} + +static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) +{ + unsigned long flags; + + inc_active(c, &flags); + __llist_add(obj, &c->free_llist); + c->free_cnt++; + dec_active(c, &flags); +} + /* Mostly runs from irq_work except __init phase. 
*/ -static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) { struct mem_cgroup *memcg = NULL, *old_memcg; - unsigned long flags; + gfp_t gfp; void *obj; int i; - memcg = get_memcg(c); - old_memcg = set_active_memcg(memcg); + gfp = __GFP_NOWARN | __GFP_ACCOUNT; + gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL; + for (i = 0; i < cnt; i++) { /* - * free_by_rcu is only manipulated by irq work refill_work(). - * IRQ works on the same CPU are called sequentially, so it is - * safe to use __llist_del_first() here. If alloc_bulk() is - * invoked by the initial prefill, there will be no running - * refill_work(), so __llist_del_first() is fine as well. - * - * In most cases, objects on free_by_rcu are from the same CPU. - * If some objects come from other CPUs, it doesn't incur any - * harm because NUMA_NO_NODE means the preference for current - * numa node and it is not a guarantee. + * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is + * done only by one CPU == current CPU. Other CPUs might + * llist_add() and llist_del_all() in parallel. */ - obj = __llist_del_first(&c->free_by_rcu); - if (!obj) { - /* Allocate, but don't deplete atomic reserves that typical - * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc - * will allocate from the current numa node which is what we - * want here. - */ - obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT); - if (!obj) - break; - } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - /* In RT irq_work runs in per-cpu kthread, so disable - * interrupts to avoid preemption and interrupts and - * reduce the chance of bpf prog executing on this cpu - * when active counter is busy. - */ - local_irq_save(flags); - /* alloc_bulk runs from irq_work which will not preempt a bpf - * program that does unit_alloc/unit_free since IRQs are - * disabled there. There is no race to increment 'active' - * counter. 
It protects free_llist from corruption in case NMI - * bpf prog preempted this loop. + obj = llist_del_first(&c->free_by_rcu_ttrace); + if (!obj) + break; + add_obj_to_free_list(c, obj); + } + if (i >= cnt) + return; + + for (; i < cnt; i++) { + obj = llist_del_first(&c->waiting_for_gp_ttrace); + if (!obj) + break; + add_obj_to_free_list(c, obj); + } + if (i >= cnt) + return; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + for (; i < cnt; i++) { + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. */ - WARN_ON_ONCE(local_inc_return(&c->active) != 1); - __llist_add(obj, &c->free_llist); - c->free_cnt++; - local_dec(&c->active); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(flags); + obj = __alloc(c, node, gfp); + if (!obj) + break; + add_obj_to_free_list(c, obj); } set_active_memcg(old_memcg); mem_cgroup_put(memcg); @@ -222,20 +262,24 @@ static void free_one(void *obj, bool percpu) kfree(obj); } -static void free_all(struct llist_node *llnode, bool percpu) +static int free_all(struct llist_node *llnode, bool percpu) { struct llist_node *pos, *t; + int cnt = 0; - llist_for_each_safe(pos, t, llnode) + llist_for_each_safe(pos, t, llnode) { free_one(pos, percpu); + cnt++; + } + return cnt; } static void __free_rcu(struct rcu_head *head) { - struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); - free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); - atomic_set(&c->call_rcu_in_progress, 0); + free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); + atomic_set(&c->call_rcu_ttrace_in_progress, 0); } static void __free_rcu_tasks_trace(struct rcu_head *head) @@ -254,60 +298,128 @@ static void enque_to_free(struct bpf_mem_cache *c, void *obj) struct llist_node *llnode = obj; 
/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. - * Nothing races to add to free_by_rcu list. + * Nothing races to add to free_by_rcu_ttrace list. */ - __llist_add(llnode, &c->free_by_rcu); + llist_add(llnode, &c->free_by_rcu_ttrace); } -static void do_call_rcu(struct bpf_mem_cache *c) +static void do_call_rcu_ttrace(struct bpf_mem_cache *c) { struct llist_node *llnode, *t; - if (atomic_xchg(&c->call_rcu_in_progress, 1)) + if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { + if (unlikely(READ_ONCE(c->draining))) { + llnode = llist_del_all(&c->free_by_rcu_ttrace); + free_all(llnode, !!c->percpu_size); + } return; + } + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace)) + llist_add(llnode, &c->waiting_for_gp_ttrace); + + if (unlikely(READ_ONCE(c->draining))) { + __free_rcu(&c->rcu_ttrace); + return; + } - WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); - llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu)) - /* There is no concurrent __llist_add(waiting_for_gp) access. - * It doesn't race with llist_del_all either. - * But there could be two concurrent llist_del_all(waiting_for_gp): - * from __free_rcu() and from drain_mem_cache(). - */ - __llist_add(llnode, &c->waiting_for_gp); /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. * If RCU Tasks Trace grace period implies RCU grace period, free * these elements directly, else use call_rcu() to wait for normal * progs to finish and finally do free_one() on each element. 
*/ - call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace); + call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace); } static void free_bulk(struct bpf_mem_cache *c) { + struct bpf_mem_cache *tgt = c->tgt; struct llist_node *llnode, *t; unsigned long flags; int cnt; + WARN_ON_ONCE(tgt->unit_size != c->unit_size); + do { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(flags); - WARN_ON_ONCE(local_inc_return(&c->active) != 1); + inc_active(c, &flags); llnode = __llist_del_first(&c->free_llist); if (llnode) cnt = --c->free_cnt; else cnt = 0; - local_dec(&c->active); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(flags); + dec_active(c, &flags); if (llnode) - enque_to_free(c, llnode); + enque_to_free(tgt, llnode); } while (cnt > (c->high_watermark + c->low_watermark) / 2); /* and drain free_llist_extra */ llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) - enque_to_free(c, llnode); - do_call_rcu(c); + enque_to_free(tgt, llnode); + do_call_rcu_ttrace(tgt); +} + +static void __free_by_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct bpf_mem_cache *tgt = c->tgt; + struct llist_node *llnode; + + llnode = llist_del_all(&c->waiting_for_gp); + if (!llnode) + goto out; + + llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace); + + /* Objects went through regular RCU GP. 
Send them to RCU tasks trace */ + do_call_rcu_ttrace(tgt); +out: + atomic_set(&c->call_rcu_in_progress, 0); +} + +static void check_free_by_rcu(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + unsigned long flags; + + /* drain free_llist_extra_rcu */ + if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) { + inc_active(c, &flags); + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) + if (__llist_add(llnode, &c->free_by_rcu)) + c->free_by_rcu_tail = llnode; + dec_active(c, &flags); + } + + if (llist_empty(&c->free_by_rcu)) + return; + + if (atomic_xchg(&c->call_rcu_in_progress, 1)) { + /* + * Instead of kmalloc-ing new rcu_head and triggering 10k + * call_rcu() to hit rcutree.qhimark and force RCU to notice + * the overload just ask RCU to hurry up. There could be many + * objects in free_by_rcu list. + * This hint reduces memory consumption for an artificial + * benchmark from 2 Gbyte to 150 Mbyte. + */ + rcu_request_urgent_qs_task(current); + return; + } + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); + + inc_active(c, &flags); + WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); + c->waiting_for_gp_tail = c->free_by_rcu_tail; + dec_active(c, &flags); + + if (unlikely(READ_ONCE(c->draining))) { + free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); + atomic_set(&c->call_rcu_in_progress, 0); + } else { + call_rcu_hurry(&c->rcu, __free_by_rcu); + } } static void bpf_mem_refill(struct irq_work *work) @@ -321,9 +433,11 @@ static void bpf_mem_refill(struct irq_work *work) /* irq_work runs on this cpu and kmalloc will allocate * from the current numa node which is what we want here. 
*/ - alloc_bulk(c, c->batch, NUMA_NO_NODE); + alloc_bulk(c, c->batch, NUMA_NO_NODE, true); else if (cnt > c->high_watermark) free_bulk(c); + + check_free_by_rcu(c); } static void notrace irq_work_raise(struct bpf_mem_cache *c) @@ -367,7 +481,7 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) * prog won't be doing more than 4 map_update_elem from * irq disabled region */ - alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu)); + alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false); } /* When size != 0 bpf_mem_cache for each cpu. @@ -406,6 +520,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c->unit_size = unit_size; c->objcg = objcg; c->percpu_size = percpu_size; + c->tgt = c; prefill_mem_cache(c, cpu); } ma->cache = pc; @@ -428,6 +543,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; + c->tgt = c; prefill_mem_cache(c, cpu); } } @@ -441,19 +557,57 @@ static void drain_mem_cache(struct bpf_mem_cache *c) /* No progs are using this bpf_mem_cache, but htab_map_free() called * bpf_mem_cache_free() for all remaining elements and they can be in - * free_by_rcu or in waiting_for_gp lists, so drain those lists now. + * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now. * - * Except for waiting_for_gp list, there are no concurrent operations + * Except for waiting_for_gp_ttrace list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). 
*/ - free_all(__llist_del_all(&c->free_by_rcu), percpu); - free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); + free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); free_all(__llist_del_all(&c->free_llist), percpu); free_all(__llist_del_all(&c->free_llist_extra), percpu); + free_all(__llist_del_all(&c->free_by_rcu), percpu); + free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); + free_all(llist_del_all(&c->waiting_for_gp), percpu); +} + +static void check_mem_cache(struct bpf_mem_cache *c) +{ + WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace)); + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); + WARN_ON_ONCE(!llist_empty(&c->free_llist)); + WARN_ON_ONCE(!llist_empty(&c->free_llist_extra)); + WARN_ON_ONCE(!llist_empty(&c->free_by_rcu)); + WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu)); + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); +} + +static void check_leaked_objs(struct bpf_mem_alloc *ma) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + check_mem_cache(c); + } + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + check_mem_cache(c); + } + } + } } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { + check_leaked_objs(ma); free_percpu(ma->cache); free_percpu(ma->caches); ma->cache = NULL; @@ -462,8 +616,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) static void free_mem_alloc(struct bpf_mem_alloc *ma) { - /* waiting_for_gp lists was drained, but __free_rcu might - * still execute. Wait for it now before we freeing percpu caches. + /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks + * might still execute. Wait for them. 
* * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used @@ -472,7 +626,8 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by * using rcu_trace_implies_rcu_gp() as well. */ - rcu_barrier_tasks_trace(); + rcu_barrier(); /* wait for __free_by_rcu */ + rcu_barrier_tasks_trace(); /* wait for __free_rcu */ if (!rcu_trace_implies_rcu_gp()) rcu_barrier(); free_mem_alloc_no_barrier(ma); @@ -498,7 +653,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) return; } - copy = kmalloc(sizeof(*ma), GFP_KERNEL); + copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL); if (!copy) { /* Slow path with inline barrier-s */ free_mem_alloc(ma); @@ -506,10 +661,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) } /* Defer barriers into worker to let the rest of map memory to be freed */ - copy->cache = ma->cache; - ma->cache = NULL; - copy->caches = ma->caches; - ma->caches = NULL; + memset(ma, 0, sizeof(*ma)); INIT_WORK(©->work, free_mem_alloc_deferred); queue_work(system_unbound_wq, ©->work); } @@ -524,17 +676,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) rcu_in_progress = 0; for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); - /* - * refill_work may be unfinished for PREEMPT_RT kernel - * in which irq work is invoked in a per-CPU RT thread. - * It is also possible for kernel with - * arch_irq_work_has_interrupt() being false and irq - * work is invoked in timer interrupt. So waiting for - * the completion of irq work to ease the handling of - * concurrency. 
- */ + WRITE_ONCE(c->draining, true); irq_work_sync(&c->refill_work); drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } /* objcg is the same across cpus */ @@ -548,8 +693,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) { c = &cc->cache[i]; + WRITE_ONCE(c->draining, true); irq_work_sync(&c->refill_work); drain_mem_cache(c); + rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } @@ -581,8 +728,10 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) local_irq_save(flags); if (local_inc_return(&c->active) == 1) { llnode = __llist_del_first(&c->free_llist); - if (llnode) + if (llnode) { cnt = --c->free_cnt; + *(struct bpf_mem_cache **)llnode = c; + } } local_dec(&c->active); local_irq_restore(flags); @@ -606,6 +755,12 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) BUILD_BUG_ON(LLIST_NODE_SZ > 8); + /* + * Remember bpf_mem_cache that allocated this object. + * The hint is not accurate. 
+ */ + c->tgt = *(struct bpf_mem_cache **)llnode; + local_irq_save(flags); if (local_inc_return(&c->active) == 1) { __llist_add(llnode, &c->free_llist); @@ -627,6 +782,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) irq_work_raise(c); } +static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + + c->tgt = *(struct bpf_mem_cache **)llnode; + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + if (__llist_add(llnode, &c->free_by_rcu)) + c->free_by_rcu_tail = llnode; + } else { + llist_add(llnode, &c->free_llist_extra_rcu); + } + local_dec(&c->active); + local_irq_restore(flags); + + if (!atomic_read(&c->call_rcu_in_progress)) + irq_work_raise(c); +} + /* Called from BPF program or from sys_bpf syscall. * In both cases migration is disabled. */ @@ -660,6 +836,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); } +void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + if (idx < 0) + return; + + unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) { void *ret; @@ -676,6 +866,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->cache), ptr); } +void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free_rcu(this_cpu_ptr(ma->cache), ptr); +} + /* Directly does a kfree() without putting 'ptr' back to the free_llist * for reuse and without waiting for a rcu_tasks_trace gp. 
* The caller must first go through the rcu_tasks_trace gp for 'ptr' diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c new file mode 100644 index 000000000000..32d2c4829eb8 --- /dev/null +++ b/kernel/bpf/mprog.c @@ -0,0 +1,447 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Isovalent */ + +#include <linux/bpf.h> +#include <linux/bpf_mprog.h> + +static int bpf_mprog_link(struct bpf_tuple *tuple, + u32 id_or_fd, u32 flags, + enum bpf_prog_type type) +{ + struct bpf_link *link = ERR_PTR(-EINVAL); + bool id = flags & BPF_F_ID; + + if (id) + link = bpf_link_by_id(id_or_fd); + else if (id_or_fd) + link = bpf_link_get_from_fd(id_or_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + if (type && link->prog->type != type) { + bpf_link_put(link); + return -EINVAL; + } + + tuple->link = link; + tuple->prog = link->prog; + return 0; +} + +static int bpf_mprog_prog(struct bpf_tuple *tuple, + u32 id_or_fd, u32 flags, + enum bpf_prog_type type) +{ + struct bpf_prog *prog = ERR_PTR(-EINVAL); + bool id = flags & BPF_F_ID; + + if (id) + prog = bpf_prog_by_id(id_or_fd); + else if (id_or_fd) + prog = bpf_prog_get(id_or_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + if (type && prog->type != type) { + bpf_prog_put(prog); + return -EINVAL; + } + + tuple->link = NULL; + tuple->prog = prog; + return 0; +} + +static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple, + u32 id_or_fd, u32 flags, + enum bpf_prog_type type) +{ + bool link = flags & BPF_F_LINK; + bool id = flags & BPF_F_ID; + + memset(tuple, 0, sizeof(*tuple)); + if (link) + return bpf_mprog_link(tuple, id_or_fd, flags, type); + /* If no relevant flag is set and no id_or_fd was passed, then + * tuple link/prog is just NULLed. This is the case when before/ + * after selects first/last position without passing fd. 
+ */ + if (!id && !id_or_fd) + return 0; + return bpf_mprog_prog(tuple, id_or_fd, flags, type); +} + +static void bpf_mprog_tuple_put(struct bpf_tuple *tuple) +{ + if (tuple->link) + bpf_link_put(tuple->link); + else if (tuple->prog) + bpf_prog_put(tuple->prog); +} + +/* The bpf_mprog_{replace,delete}() operate on exact idx position with the + * one exception that for deletion we support delete from front/back. In + * case of front idx is -1, in case of back idx is bpf_mprog_total(entry). + * Adjustment to first and last entry is trivial. The bpf_mprog_insert() + * we have to deal with the following cases: + * + * idx + before: + * + * Insert P4 before P3: idx for old array is 1, idx for new array is 2, + * hence we adjust target idx for the new array, so that memmove copies + * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting + * before P1 would have old idx -1 and new idx 0. + * + * +--+--+--+ +--+--+--+--+ +--+--+--+--+ + * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| + * +--+--+--+ +--+--+--+--+ +--+--+--+--+ + * + * idx + after: + * + * Insert P4 after P2: idx for old array is 2, idx for new array is 2. + * Again, memmove copies P1 and P2 to the new entry, and we insert P4 + * into idx 2. Inserting after P3 would have both old/new idx at 4 aka + * bpf_mprog_total(entry). 
+ * + * +--+--+--+ +--+--+--+--+ +--+--+--+--+ + * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| + * +--+--+--+ +--+--+--+--+ +--+--+--+--+ + */ +static int bpf_mprog_replace(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_tuple *ntuple, int idx) +{ + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + struct bpf_prog *oprog; + + bpf_mprog_read(entry, idx, &fp, &cp); + oprog = READ_ONCE(fp->prog); + bpf_mprog_write(fp, cp, ntuple); + if (!ntuple->link) { + WARN_ON_ONCE(cp->link); + bpf_prog_put(oprog); + } + *entry_new = entry; + return 0; +} + +static int bpf_mprog_insert(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_tuple *ntuple, int idx, u32 flags) +{ + int total = bpf_mprog_total(entry); + struct bpf_mprog_entry *peer; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + + peer = bpf_mprog_peer(entry); + bpf_mprog_entry_copy(peer, entry); + if (idx == total) + goto insert; + else if (flags & BPF_F_BEFORE) + idx += 1; + bpf_mprog_entry_grow(peer, idx); +insert: + bpf_mprog_read(peer, idx, &fp, &cp); + bpf_mprog_write(fp, cp, ntuple); + bpf_mprog_inc(peer); + *entry_new = peer; + return 0; +} + +static int bpf_mprog_delete(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_tuple *dtuple, int idx) +{ + int total = bpf_mprog_total(entry); + struct bpf_mprog_entry *peer; + + peer = bpf_mprog_peer(entry); + bpf_mprog_entry_copy(peer, entry); + if (idx == -1) + idx = 0; + else if (idx == total) + idx = total - 1; + bpf_mprog_entry_shrink(peer, idx); + bpf_mprog_dec(peer); + bpf_mprog_mark_for_release(peer, dtuple); + *entry_new = peer; + return 0; +} + +/* In bpf_mprog_pos_*() we evaluate the target position for the BPF + * program/link that needs to be replaced, inserted or deleted for + * each "rule" independently. If all rules agree on that position + * or existing element, then enact replacement, addition or deletion. 
+ * If this is not the case, then the request cannot be satisfied and + * we bail out with an error. + */ +static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple) +{ + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + int i; + + for (i = 0; i < bpf_mprog_total(entry); i++) { + bpf_mprog_read(entry, i, &fp, &cp); + if (tuple->prog == READ_ONCE(fp->prog)) + return tuple->link == cp->link ? i : -EBUSY; + } + return -ENOENT; +} + +static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple) +{ + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + int i; + + for (i = 0; i < bpf_mprog_total(entry); i++) { + bpf_mprog_read(entry, i, &fp, &cp); + if (tuple->prog == READ_ONCE(fp->prog) && + (!tuple->link || tuple->link == cp->link)) + return i - 1; + } + return tuple->prog ? -ENOENT : -1; +} + +static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple) +{ + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + int i; + + for (i = 0; i < bpf_mprog_total(entry); i++) { + bpf_mprog_read(entry, i, &fp, &cp); + if (tuple->prog == READ_ONCE(fp->prog) && + (!tuple->link || tuple->link == cp->link)) + return i + 1; + } + return tuple->prog ? 
-ENOENT : bpf_mprog_total(entry); +} + +int bpf_mprog_attach(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_prog *prog_new, struct bpf_link *link, + struct bpf_prog *prog_old, + u32 flags, u32 id_or_fd, u64 revision) +{ + struct bpf_tuple rtuple, ntuple = { + .prog = prog_new, + .link = link, + }, otuple = { + .prog = prog_old, + .link = link, + }; + int ret, idx = -ERANGE, tidx; + + if (revision && revision != bpf_mprog_revision(entry)) + return -ESTALE; + if (bpf_mprog_exists(entry, prog_new)) + return -EEXIST; + ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, + flags & ~BPF_F_REPLACE, + prog_new->type); + if (ret) + return ret; + if (flags & BPF_F_REPLACE) { + tidx = bpf_mprog_pos_exact(entry, &otuple); + if (tidx < 0) { + ret = tidx; + goto out; + } + idx = tidx; + } + if (flags & BPF_F_BEFORE) { + tidx = bpf_mprog_pos_before(entry, &rtuple); + if (tidx < -1 || (idx >= -1 && tidx != idx)) { + ret = tidx < -1 ? tidx : -ERANGE; + goto out; + } + idx = tidx; + } + if (flags & BPF_F_AFTER) { + tidx = bpf_mprog_pos_after(entry, &rtuple); + if (tidx < -1 || (idx >= -1 && tidx != idx)) { + ret = tidx < 0 ? 
tidx : -ERANGE; + goto out; + } + idx = tidx; + } + if (idx < -1) { + if (rtuple.prog || flags) { + ret = -EINVAL; + goto out; + } + idx = bpf_mprog_total(entry); + flags = BPF_F_AFTER; + } + if (idx >= bpf_mprog_max()) { + ret = -ERANGE; + goto out; + } + if (flags & BPF_F_REPLACE) + ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx); + else + ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags); +out: + bpf_mprog_tuple_put(&rtuple); + return ret; +} + +static int bpf_mprog_fetch(struct bpf_mprog_entry *entry, + struct bpf_tuple *tuple, int idx) +{ + int total = bpf_mprog_total(entry); + struct bpf_mprog_cp *cp; + struct bpf_mprog_fp *fp; + struct bpf_prog *prog; + struct bpf_link *link; + + if (idx == -1) + idx = 0; + else if (idx == total) + idx = total - 1; + bpf_mprog_read(entry, idx, &fp, &cp); + prog = READ_ONCE(fp->prog); + link = cp->link; + /* The deletion request can either be without filled tuple in which + * case it gets populated here based on idx, or with filled tuple + * where the only thing we end up doing is the WARN_ON_ONCE() assert. + * If we hit a BPF link at the given index, it must not be removed + * from opts path. + */ + if (link && !tuple->link) + return -EBUSY; + WARN_ON_ONCE(tuple->prog && tuple->prog != prog); + WARN_ON_ONCE(tuple->link && tuple->link != link); + tuple->prog = prog; + tuple->link = link; + return 0; +} + +int bpf_mprog_detach(struct bpf_mprog_entry *entry, + struct bpf_mprog_entry **entry_new, + struct bpf_prog *prog, struct bpf_link *link, + u32 flags, u32 id_or_fd, u64 revision) +{ + struct bpf_tuple rtuple, dtuple = { + .prog = prog, + .link = link, + }; + int ret, idx = -ERANGE, tidx; + + if (flags & BPF_F_REPLACE) + return -EINVAL; + if (revision && revision != bpf_mprog_revision(entry)) + return -ESTALE; + if (!bpf_mprog_total(entry)) + return -ENOENT; + ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags, + prog ? 
prog->type : + BPF_PROG_TYPE_UNSPEC); + if (ret) + return ret; + if (dtuple.prog) { + tidx = bpf_mprog_pos_exact(entry, &dtuple); + if (tidx < 0) { + ret = tidx; + goto out; + } + idx = tidx; + } + if (flags & BPF_F_BEFORE) { + tidx = bpf_mprog_pos_before(entry, &rtuple); + if (tidx < -1 || (idx >= -1 && tidx != idx)) { + ret = tidx < -1 ? tidx : -ERANGE; + goto out; + } + idx = tidx; + } + if (flags & BPF_F_AFTER) { + tidx = bpf_mprog_pos_after(entry, &rtuple); + if (tidx < -1 || (idx >= -1 && tidx != idx)) { + ret = tidx < 0 ? tidx : -ERANGE; + goto out; + } + idx = tidx; + } + if (idx < -1) { + if (rtuple.prog || flags) { + ret = -EINVAL; + goto out; + } + idx = bpf_mprog_total(entry); + flags = BPF_F_AFTER; + } + if (idx >= bpf_mprog_max()) { + ret = -ERANGE; + goto out; + } + ret = bpf_mprog_fetch(entry, &dtuple, idx); + if (ret) + goto out; + ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx); +out: + bpf_mprog_tuple_put(&rtuple); + return ret; +} + +int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, + struct bpf_mprog_entry *entry) +{ + u32 __user *uprog_flags, *ulink_flags; + u32 __user *uprog_id, *ulink_id; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + struct bpf_prog *prog; + const u32 flags = 0; + int i, ret = 0; + u32 id, count; + u64 revision; + + if (attr->query.query_flags || attr->query.attach_flags) + return -EINVAL; + revision = bpf_mprog_revision(entry); + count = bpf_mprog_total(entry); + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + return -EFAULT; + if (copy_to_user(&uattr->query.count, &count, sizeof(count))) + return -EFAULT; + uprog_id = u64_to_user_ptr(attr->query.prog_ids); + uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags); + ulink_id = u64_to_user_ptr(attr->query.link_ids); + ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags); + if (attr->query.count == 0 || 
!uprog_id || !count) + return 0; + if (attr->query.count < count) { + count = attr->query.count; + ret = -ENOSPC; + } + for (i = 0; i < bpf_mprog_max(); i++) { + bpf_mprog_read(entry, i, &fp, &cp); + prog = READ_ONCE(fp->prog); + if (!prog) + break; + id = prog->aux->id; + if (copy_to_user(uprog_id + i, &id, sizeof(id))) + return -EFAULT; + if (uprog_flags && + copy_to_user(uprog_flags + i, &flags, sizeof(flags))) + return -EFAULT; + id = cp->link ? cp->link->id : 0; + if (ulink_id && + copy_to_user(ulink_id + i, &id, sizeof(id))) + return -EFAULT; + if (ulink_flags && + copy_to_user(ulink_flags + i, &flags, sizeof(flags))) + return -EFAULT; + if (i + 1 == count) + break; + } + return ret; +} diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8a26cd8814c1..3e4f2ec1af06 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -25,6 +25,7 @@ #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> +#include <net/xdp.h> /* Protects offdevs, members of bpf_offload_netdev and offload members * of all progs. 
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index 8937dc6bc8d0..b83c2f5e9be1 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -50,7 +50,7 @@ iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL) $(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) $(call msg,BPF,$@) $(Q)mkdir -p $(@D) - $(Q)$(CLANG) -g -O2 -target bpf -m$* $(INCLUDES) \ + $(Q)$(CLANG) -g -O2 --target=bpf -m$* $(INCLUDES) \ -c $(filter %.c,$^) -o $@ && \ $(LLVM_STRIP) -g $@ diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c index 03af863314ea..b78968b63fab 100644 --- a/kernel/bpf/preload/iterators/iterators.bpf.c +++ b/kernel/bpf/preload/iterators/iterators.bpf.c @@ -73,6 +73,8 @@ static const char *get_name(struct btf *btf, long btf_id, const char *fallback) return str + name_off; } +__s64 bpf_map_sum_elem_count(struct bpf_map *map) __ksym; + SEC("iter/bpf_map") int dump_bpf_map(struct bpf_iter__bpf_map *ctx) { @@ -84,9 +86,12 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx) return 0; if (seq_num == 0) - BPF_SEQ_PRINTF(seq, " id name max_entries\n"); + BPF_SEQ_PRINTF(seq, " id name max_entries cur_entries\n"); + + BPF_SEQ_PRINTF(seq, "%4u %-16s %10d %10lld\n", + map->id, map->name, map->max_entries, + bpf_map_sum_elem_count(map)); - BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries); return 0; } diff --git a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h index 70f236a82fe1..5b98ab02025e 100644 --- a/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h +++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -/* THIS FILE IS AUTOGENERATED! */ +/* THIS FILE IS AUTOGENERATED BY BPFTOOL! 
*/ #ifndef __ITERATORS_BPF_SKEL_H__ #define __ITERATORS_BPF_SKEL_H__ @@ -18,8 +18,6 @@ struct iterators_bpf { int dump_bpf_map_fd; int dump_bpf_prog_fd; } links; - struct iterators_bpf__rodata { - } *rodata; }; static inline int @@ -68,7 +66,6 @@ iterators_bpf__destroy(struct iterators_bpf *skel) iterators_bpf__detach(skel); skel_closenz(skel->progs.dump_bpf_map.prog_fd); skel_closenz(skel->progs.dump_bpf_prog.prog_fd); - skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096); skel_closenz(skel->maps.rodata.map_fd); skel_free(skel); } @@ -81,15 +78,6 @@ iterators_bpf__open(void) if (!skel) goto cleanup; skel->ctx.sz = (void *)&skel->links - (void *)skel; - skel->rodata = skel_prep_map_data((void *)"\ -\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\ -\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\ -\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98); - if (!skel->rodata) - goto cleanup; - skel->maps.rodata.initial_value = (__u64) (long) skel->rodata; return skel; cleanup: iterators_bpf__destroy(skel); @@ -103,7 +91,7 @@ iterators_bpf__load(struct iterators_bpf *skel) int err; opts.ctx = (struct bpf_loader_ctx *)skel; - opts.data_sz = 6056; + opts.data_sz = 6208; opts.data = (void *)"\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ @@ -138,190 +126,197 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\ 
-\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\ +\x18\0\0\0\0\0\0\0\x80\x04\0\0\x80\x04\0\0\x31\x05\0\0\0\0\0\0\0\0\0\x02\x02\0\ \0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\ \0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\ \0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\ -\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\ -\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\ -\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\ -\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\ -\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\ -\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\ -\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\ -\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\ -\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\ -\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\ -\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\ -\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\ -\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\ -\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\ -\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\ -\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\ -\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\ -\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\ 
-\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\ -\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\ -\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\ -\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\ -\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\ -\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\ -\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\ -\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\ -\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\ -\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\ -\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ -\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\ -\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\ -\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\ -\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\ -\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\ -\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\ -\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\ -\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\ -\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\ -\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\ 
-\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\ -\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\ -\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\ -\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\ -\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\ -\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ -\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\ -\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\ -\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ -\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ -\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ -\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ -\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ -\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ -\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ -\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ -\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ -\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ -\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ 
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ -\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ -\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ -\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ -\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ -\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ -\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ -\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ -\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ -\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ -\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ -\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ -\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ -\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ -\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ -\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ -\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ -\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ -\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ -\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ -\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ 
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ -\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\ -\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\ -\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\ -\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\ -\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\ -\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\ -\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\ -\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\ -\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\ -\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\ -\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\ -\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\ -\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\ -\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ -\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\ -\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\ 
-\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\ -\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\ -\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\ -\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\ -\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\ -\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\ -\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\ -\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\ -\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\ -\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\ -\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\ -\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\ -\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\ -\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ -\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\ -\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\ -\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\ -\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\ -\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\ -\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\ -\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\ 
-\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\ -\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\ -\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\ -\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\ -\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\ -\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\ -\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\ -\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ -\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\ -\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\ -\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\ -\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\ -\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\ -\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\ -\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\ -\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\ -\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\ -\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\ -\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\ -\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\ -\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\ -\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\ -\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\ -\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\ 
-\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\ -\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\ -\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\ -\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\ -\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\ -\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\ -\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\ -\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\ -\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\ -\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\ -\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\ -\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\ -\0\0\0\0"; - opts.insns_sz = 2216; +\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xb0\0\0\0\x03\0\0\x04\x18\0\0\0\xbe\0\ +\0\0\x09\0\0\0\0\0\0\0\xc2\0\0\0\x0b\0\0\0\x40\0\0\0\xcd\0\0\0\x0b\0\0\0\x80\0\ +\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd5\0\0\0\0\0\0\x07\0\0\0\0\xde\0\0\0\0\0\0\ +\x08\x0c\0\0\0\xe4\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xae\x01\0\0\x03\0\0\x04\ +\x18\0\0\0\xb6\x01\0\0\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\xbe\ +\x01\0\0\x0e\0\0\0\xa0\0\0\0\xca\x01\0\0\0\0\0\x08\x0f\0\0\0\xd0\x01\0\0\0\0\0\ +\x01\x04\0\0\0\x20\0\0\0\xdd\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\ +\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xe2\x01\0\0\0\0\0\x01\x04\0\0\0\ +\x20\0\0\0\0\0\0\0\x01\0\0\x0d\x14\0\0\0\x26\x05\0\0\x04\0\0\0\x2b\x02\0\0\0\0\ +\0\x08\x15\0\0\0\x31\x02\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\x01\x3b\x02\0\0\x01\0\ 
+\0\x0c\x13\0\0\0\0\0\0\0\0\0\0\x02\x18\0\0\0\x52\x02\0\0\x02\0\0\x04\x10\0\0\0\ +\x13\0\0\0\x03\0\0\0\0\0\0\0\x65\x02\0\0\x19\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ +\x1c\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x17\0\0\0\x6a\x02\0\0\x01\0\ +\0\x0c\x1a\0\0\0\xb6\x02\0\0\x01\0\0\x04\x08\0\0\0\xbf\x02\0\0\x1d\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x02\x1e\0\0\0\x10\x03\0\0\x06\0\0\x04\x38\0\0\0\xb6\x01\0\0\ +\x0e\0\0\0\0\0\0\0\xb9\x01\0\0\x11\0\0\0\x20\0\0\0\x1d\x03\0\0\x1f\0\0\0\xc0\0\ +\0\0\x2e\x03\0\0\x19\0\0\0\0\x01\0\0\x37\x03\0\0\x21\0\0\0\x40\x01\0\0\x41\x03\ +\0\0\x22\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ +\0\0\0\0\0\0\0\0\0\x02\x23\0\0\0\0\0\0\0\0\0\0\x02\x24\0\0\0\x8b\x03\0\0\x02\0\ +\0\x04\x08\0\0\0\x99\x03\0\0\x0e\0\0\0\0\0\0\0\xa2\x03\0\0\x0e\0\0\0\x20\0\0\0\ +\x41\x03\0\0\x03\0\0\x04\x18\0\0\0\xac\x03\0\0\x1f\0\0\0\0\0\0\0\xb4\x03\0\0\ +\x25\0\0\0\x40\0\0\0\xba\x03\0\0\x27\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x26\0\0\ +\0\0\0\0\0\0\0\0\x02\x28\0\0\0\xbe\x03\0\0\x01\0\0\x04\x04\0\0\0\xc9\x03\0\0\ +\x0e\0\0\0\0\0\0\0\x32\x04\0\0\x01\0\0\x04\x04\0\0\0\x3b\x04\0\0\x0e\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\0\xb1\x04\0\0\0\0\0\ +\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\0\ +\xc5\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x20\0\0\0\ +\x12\0\0\0\x20\0\0\0\xdb\x04\0\0\0\0\0\x0e\x2d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ +\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\0\xf0\x04\0\0\0\0\0\x0e\x2f\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\x07\x05\0\0\0\0\0\x0e\ +\x31\0\0\0\x01\0\0\0\x0f\x05\0\0\x01\0\0\x0f\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\ +\0\0\x16\x05\0\0\x04\0\0\x0f\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\ +\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\ +\0\x1e\x05\0\0\x01\0\0\x0f\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\0\x26\x05\0\0\0\ 
+\0\0\x0e\x06\0\0\0\x01\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\ +\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\ +\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x73\ +\x70\x73\x6b\x2f\x73\x72\x63\x2f\x62\x70\x66\x2d\x6e\x65\x78\x74\x2f\x6b\x65\ +\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\ +\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\ +\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\ +\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\ +\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\ +\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\ +\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\ +\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\ +\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\ +\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\ +\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\ +\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\ +\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\ +\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\ +\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\ +\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\ +\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\ +\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\ +\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\ 
+\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\ +\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\ +\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\ +\x6c\x6c\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\ +\x6c\x6f\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\ +\x6d\x5f\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\ +\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\ +\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\ +\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\ +\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\ +\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ +\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\ +\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\ +\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\ +\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\ +\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\ +\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\ +\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\ +\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\ +\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\ +\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\ +\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\ 
+\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\ +\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\ +\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\ +\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\ +\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\ +\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\ +\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\ +\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\ +\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\ +\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\ +\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\ +\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\ +\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\ +\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\ +\x73\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\ +\x6d\x79\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\xc9\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\ +\x80\0\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\ +\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\ +\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\ +\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\ 
+\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\ +\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\ +\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\ +\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1d\0\0\0\0\0\x79\x21\ +\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe0\xff\ +\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\ +\x30\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\ +\xe0\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\ +\x7b\x2a\xe8\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf0\xff\0\0\0\0\xbf\x71\ +\0\0\0\0\0\0\x85\x20\0\0\0\0\0\0\x7b\x0a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ +\x07\x04\0\0\xe0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ +\x30\0\0\0\xb7\x03\0\0\x1a\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\ +\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\ +\x1e\x44\x01\0\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x44\x01\0\x02\0\0\0\x42\0\0\0\ +\xfb\0\0\0\x1d\x4c\x01\0\x03\0\0\0\x42\0\0\0\x1c\x01\0\0\x06\x54\x01\0\x04\0\0\ +\0\x42\0\0\0\x2b\x01\0\0\x1d\x48\x01\0\x05\0\0\0\x42\0\0\0\x50\x01\0\0\x06\x60\ +\x01\0\x07\0\0\0\x42\0\0\0\x63\x01\0\0\x03\x64\x01\0\x0e\0\0\0\x42\0\0\0\xf6\ +\x01\0\0\x02\x6c\x01\0\x21\0\0\0\x42\0\0\0\x29\x02\0\0\x01\x80\x01\0\0\0\0\0\ +\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\ +\x02\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x70\0\0\0\ +\x0d\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xf7\0\0\0\0\0\0\0\xa0\0\0\0\ +\x0d\0\0\0\x27\x01\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ 
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\ +\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ +\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\ +\x65\x72\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\ +\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\ +\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\ +\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\ +\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\ +\x18\x62\0\0\0\0\0\0\0\0\0\0\x4a\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\ +\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\ +\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\ +\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\ +\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\ +\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\ +\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\ +\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\ +\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\ +\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\ +\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\ +\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\ +\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\ +\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x6a\0\0\0\xb7\ +\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\ 
+\x95\0\0\0\0\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x88\0\0\0\x1e\x94\x01\0\ +\x01\0\0\0\x42\0\0\0\x88\0\0\0\x24\x94\x01\0\x02\0\0\0\x42\0\0\0\x86\x02\0\0\ +\x1f\x9c\x01\0\x03\0\0\0\x42\0\0\0\xaa\x02\0\0\x06\xa8\x01\0\x04\0\0\0\x42\0\0\ +\0\xc3\x02\0\0\x0e\xb4\x01\0\x05\0\0\0\x42\0\0\0\x2b\x01\0\0\x1d\x98\x01\0\x06\ +\0\0\0\x42\0\0\0\x50\x01\0\0\x06\xb8\x01\0\x08\0\0\0\x42\0\0\0\xd5\x02\0\0\x03\ +\xbc\x01\0\x10\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x17\0\0\0\x42\0\0\0\ +\x80\x03\0\0\x06\x04\x01\0\x1a\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x1b\0\ +\0\0\x42\0\0\0\xd1\x03\0\0\x0f\x10\x01\0\x1c\0\0\0\x42\0\0\0\xe6\x03\0\0\x2d\ +\x14\x01\0\x1e\0\0\0\x42\0\0\0\x1d\x04\0\0\x0d\x0c\x01\0\x20\0\0\0\x42\0\0\0\ +\x45\x03\0\0\x02\xc4\x01\0\x21\0\0\0\x42\0\0\0\xe6\x03\0\0\x02\x14\x01\0\x24\0\ +\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x27\0\0\0\x42\0\0\0\x45\x03\0\0\x02\ +\xc4\x01\0\x28\0\0\0\x42\0\0\0\x44\x04\0\0\x0d\x18\x01\0\x2b\0\0\0\x42\0\0\0\ +\x44\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x72\x04\0\0\x1b\x1c\x01\0\x2d\0\ +\0\0\x42\0\0\0\x72\x04\0\0\x06\x1c\x01\0\x2e\0\0\0\x42\0\0\0\x95\x04\0\0\x0d\ +\x24\x01\0\x30\0\0\0\x42\0\0\0\x45\x03\0\0\x02\xc4\x01\0\x3f\0\0\0\x42\0\0\0\ +\x29\x02\0\0\x01\xd4\x01\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\ +\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\0\xf7\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\ +\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x27\x01\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\ +\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\0\xf7\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\0\ +\x78\x03\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\0\x7c\x03\0\0\0\0\0\0\xc0\0\0\0\x23\0\0\ +\0\xaa\x03\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\0\xf7\0\0\0\0\0\0\0\xf0\0\0\0\x24\0\0\ +\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\0\0\x1e\0\ +\0\0\xf7\0\0\0\0\0\0\0\x60\x01\0\0\x24\0\0\0\x6c\x04\0\0\0\0\0\0\x88\x01\0\0\ +\x1e\0\0\0\x27\x01\0\0\0\0\0\0\x98\x01\0\0\x1e\0\0\0\xad\x04\0\0\0\0\0\0\xa0\ 
+\x01\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\ +\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\ +\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\ +\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; + opts.insns_sz = 2456; opts.insns = (void *)"\ \xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\ \0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\ @@ -331,79 +326,83 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\ \0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\ \0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ -\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\ -\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\ -\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\ -\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\ -\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ +\xe8\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\xe4\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\ +\0\0\0\0\xd8\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xd0\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ \xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\ -\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\ 
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x0f\0\0\x63\x01\0\0\0\ \0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ -\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ -\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ +\xfc\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ +\0\xf0\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ \xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\ -\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\ -\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\ +\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x38\ +\x0f\0\0\xb7\x02\0\0\x7b\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\ \0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\ -\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\ -\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\ -\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\ -\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\ +\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xb8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\ +\0\0\0\xc8\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\xd0\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xc0\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\ \xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\ 
-\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\ +\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\xe0\x0f\0\0\ \xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\ -\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ -\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\ -\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ -\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\ -\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\ -\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\ -\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\ -\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\ -\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\ -\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\ -\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\ -\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\ +\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x20\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xf0\x0f\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x18\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\x08\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x60\x12\0\0\x7b\x01\0\0\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x11\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x70\x12\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xa0\x11\0\0\x18\x61\0\0\0\0\ +\0\0\0\0\0\0\x90\x12\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\ 
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\ +\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x12\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\ +\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x2c\x12\0\0\x63\x01\0\0\0\0\0\0\x79\x60\ +\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x12\0\0\x7b\x01\0\0\0\0\0\0\x61\ +\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x58\x12\0\0\x63\x01\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x12\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\ \x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\ -\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\ -\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\ -\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\ -\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\ -\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\ -\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\ -\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\ -\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\ -\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\ -\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\ -\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\ -\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\ -\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\ -\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\ -\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\ -\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\ 
-\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\ -\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\ -\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\ -\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\ -\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\ -\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\ -\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\ -\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\ -\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\ -\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\ -\0\0\0\0\x95\0\0\0\0\0\0\0"; +\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x63\x70\x6c\0\0\0\0\0\ +\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\x18\x68\0\0\0\0\0\0\0\0\0\0\xa8\ +\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x12\0\0\xb7\x02\0\0\x17\0\0\0\xb7\x03\ +\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\ +\x07\x4d\xff\0\0\0\0\x75\x07\x03\0\0\0\0\0\x62\x08\x04\0\0\0\0\0\x6a\x08\x02\0\ +\0\0\0\0\x05\0\x0a\0\0\0\0\0\x63\x78\x04\0\0\0\0\0\xbf\x79\0\0\0\0\0\0\x77\x09\ +\0\0\x20\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\x63\x90\0\0\0\0\0\0\x55\ +\x09\x02\0\0\0\0\0\x6a\x08\x02\0\0\0\0\0\x05\0\x01\0\0\0\0\0\x6a\x08\x02\0\x40\ +\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\xb7\x03\0\ +\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\ +\0\0\x01\0\0\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\ +\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x80\x12\0\0\x61\x01\0\0\0\0\0\0\ +\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x2c\xff\ 
+\0\0\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x12\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\xa8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\xd8\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xa0\x17\0\0\x7b\x01\0\0\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\xe0\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe8\x17\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x14\0\0\x18\x61\0\0\0\0\ +\0\0\0\0\0\0\xf8\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x78\ +\x16\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x18\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x10\x18\0\0\x7b\x01\0\0\ +\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x17\0\0\x63\x01\ +\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb4\x17\0\0\x63\ +\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x17\0\0\ +\x7b\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\ +\x17\0\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x18\0\0\xb7\x02\0\ +\0\x12\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\ +\x07\0\0\0\0\0\0\xc5\x07\xf5\xfe\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x17\0\ +\0\x63\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\ +\0\x05\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x98\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\ +\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x08\x18\0\0\ +\x61\x01\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\ +\0\0\xc5\x07\xe3\xfe\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\ +\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\ +\0\0\0\0\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\ +\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0"; err = bpf_load_and_run(&opts); if (err < 0) return err; - skel->rodata = 
skel_finalize_map_data(&skel->maps.rodata.initial_value, - 4096, PROT_READ, skel->maps.rodata.map_fd); - if (!skel->rodata) - return -ENOMEM; return 0; } @@ -422,4 +421,15 @@ iterators_bpf__open_and_load(void) return skel; } +__attribute__((unused)) static void +iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused))) +{ +#ifdef __cplusplus +#define _Static_assert static_assert +#endif +#ifdef __cplusplus +#undef _Static_assert +#endif +} + #endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 875ac9b698d9..f045fde632e5 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -23,15 +23,6 @@ #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) -/* Maximum size of ring buffer area is limited by 32-bit page offset within - * record header, counted in pages. Reserve 8 bits for extensibility, and take - * into account few extra pages for consumer/producer pages and - * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single - * ring buffer. - */ -#define RINGBUF_MAX_DATA_SZ \ - (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) - struct bpf_ringbuf { wait_queue_head_t waitq; struct irq_work work; @@ -161,6 +152,17 @@ static void bpf_ringbuf_notify(struct irq_work *work) wake_up_all(&rb->waitq); } +/* Maximum size of ring buffer area is limited by 32-bit page offset within + * record header, counted in pages. Reserve 8 bits for extensibility, and + * take into account few extra pages for consumer/producer pages and + * non-mmap()'able parts, the current maximum size would be: + * + * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) + * + * This gives 64GB limit, which seems plenty for single ring buffer. Now + * considering that the maximum value of data_sz is (4GB - 1), there + * will be no overflow, so just note the size limit in the comments. 
+ */ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; @@ -193,12 +195,6 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) !PAGE_ALIGNED(attr->max_entries)) return ERR_PTR(-EINVAL); -#ifdef CONFIG_64BIT - /* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */ - if (attr->max_entries > RINGBUF_MAX_DATA_SZ) - return ERR_PTR(-E2BIG); -#endif - rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); if (!rb_map) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a2aef900519c..eb01c31ed591 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -37,6 +37,8 @@ #include <linux/trace_events.h> #include <net/netfilter/nf_bpf_link.h> +#include <net/tcx.h> + #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) @@ -655,7 +657,6 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); - WARN_ON_ONCE(!pointee_struct_meta); migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? pointee_struct_meta->record : @@ -2813,10 +2814,12 @@ static void bpf_link_free_id(int id) /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred - * anon_inode's release() call. This helper marksbpf_link as + * anon_inode's release() call. This helper marks bpf_link as * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt * is not decremented, it's the responsibility of a calling code that failed * to complete bpf_link initialization. + * This helper eventually calls link's dealloc callback, but does not call + * link's release callback. 
*/ void bpf_link_cleanup(struct bpf_link_primer *primer) { @@ -3295,6 +3298,25 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, raw_tp_link->btp->tp->name); } +static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, + u32 len) +{ + if (ulen >= len + 1) { + if (copy_to_user(ubuf, buf, len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, buf, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + + return 0; +} + static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { @@ -3313,20 +3335,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, if (!ubuf) return 0; - if (ulen >= tp_len + 1) { - if (copy_to_user(ubuf, tp_name, tp_len + 1)) - return -EFAULT; - } else { - char zero = '\0'; - - if (copy_to_user(ubuf, tp_name, ulen - 1)) - return -EFAULT; - if (put_user(zero, ubuf + ulen - 1)) - return -EFAULT; - return -ENOSPC; - } - - return 0; + return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len); } static const struct bpf_link_ops bpf_raw_tp_link_lops = { @@ -3358,9 +3367,154 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) kfree(perf_link); } +static int bpf_perf_link_fill_common(const struct perf_event *event, + char __user *uname, u32 ulen, + u64 *probe_offset, u64 *probe_addr, + u32 *fd_type) +{ + const char *buf; + u32 prog_id; + size_t len; + int err; + + if (!ulen ^ !uname) + return -EINVAL; + + err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, + probe_offset, probe_addr); + if (err) + return err; + if (!uname) + return 0; + if (buf) { + len = strlen(buf); + err = bpf_copy_to_user(uname, buf, ulen, len); + if (err) + return err; + } else { + char zero = '\0'; + + if (put_user(zero, uname)) + return -EFAULT; + } + return 0; +} + +#ifdef CONFIG_KPROBE_EVENTS +static int bpf_perf_link_fill_kprobe(const struct perf_event *event, + struct bpf_link_info 
*info) +{ + char __user *uname; + u64 addr, offset; + u32 ulen, type; + int err; + + uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); + ulen = info->perf_event.kprobe.name_len; + err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + &type); + if (err) + return err; + if (type == BPF_FD_TYPE_KRETPROBE) + info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; + else + info->perf_event.type = BPF_PERF_EVENT_KPROBE; + + info->perf_event.kprobe.offset = offset; + if (!kallsyms_show_value(current_cred())) + addr = 0; + info->perf_event.kprobe.addr = addr; + return 0; +} +#endif + +#ifdef CONFIG_UPROBE_EVENTS +static int bpf_perf_link_fill_uprobe(const struct perf_event *event, + struct bpf_link_info *info) +{ + char __user *uname; + u64 addr, offset; + u32 ulen, type; + int err; + + uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); + ulen = info->perf_event.uprobe.name_len; + err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + &type); + if (err) + return err; + + if (type == BPF_FD_TYPE_URETPROBE) + info->perf_event.type = BPF_PERF_EVENT_URETPROBE; + else + info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.offset = offset; + return 0; +} +#endif + +static int bpf_perf_link_fill_probe(const struct perf_event *event, + struct bpf_link_info *info) +{ +#ifdef CONFIG_KPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) + return bpf_perf_link_fill_kprobe(event, info); +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) + return bpf_perf_link_fill_uprobe(event, info); +#endif + return -EOPNOTSUPP; +} + +static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, + struct bpf_link_info *info) +{ + char __user *uname; + u32 ulen; + + uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); + ulen = info->perf_event.tracepoint.name_len; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + return bpf_perf_link_fill_common(event, uname, 
ulen, NULL, NULL, NULL); +} + +static int bpf_perf_link_fill_perf_event(const struct perf_event *event, + struct bpf_link_info *info) +{ + info->perf_event.event.type = event->attr.type; + info->perf_event.event.config = event->attr.config; + info->perf_event.type = BPF_PERF_EVENT_EVENT; + return 0; +} + +static int bpf_perf_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_perf_link *perf_link; + const struct perf_event *event; + + perf_link = container_of(link, struct bpf_perf_link, link); + event = perf_get_event(perf_link->perf_file); + if (IS_ERR(event)) + return PTR_ERR(event); + + switch (event->prog->type) { + case BPF_PROG_TYPE_PERF_EVENT: + return bpf_perf_link_fill_perf_event(event, info); + case BPF_PROG_TYPE_TRACEPOINT: + return bpf_perf_link_fill_tracepoint(event, info); + case BPF_PROG_TYPE_KPROBE: + return bpf_perf_link_fill_probe(event, info); + default: + return -EOPNOTSUPP; + } +} + static const struct bpf_link_ops bpf_perf_link_lops = { .release = bpf_perf_link_release, .dealloc = bpf_perf_link_dealloc, + .fill_link_info = bpf_perf_link_fill_link_info, }; static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -3502,34 +3656,6 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) return fd; } -static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, - enum bpf_attach_type attach_type) -{ - switch (prog->type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - case BPF_PROG_TYPE_SK_LOOKUP: - return attach_type == prog->expected_attach_type ? 0 : -EINVAL; - case BPF_PROG_TYPE_CGROUP_SKB: - if (!capable(CAP_NET_ADMIN)) - /* cg-skb progs can be loaded by unpriv user. - * check permissions at attach time. - */ - return -EPERM; - return prog->enforce_expected_attach_type && - prog->expected_attach_type != attach_type ? 
- -EINVAL : 0; - case BPF_PROG_TYPE_KPROBE: - if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && - attach_type != BPF_TRACE_KPROBE_MULTI) - return -EINVAL; - return 0; - default: - return 0; - } -} - static enum bpf_prog_type attach_type_to_prog_type(enum bpf_attach_type attach_type) { @@ -3588,31 +3714,101 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_XDP; case BPF_LSM_CGROUP: return BPF_PROG_TYPE_LSM; + case BPF_TCX_INGRESS: + case BPF_TCX_EGRESS: + return BPF_PROG_TYPE_SCHED_CLS; default: return BPF_PROG_TYPE_UNSPEC; } } -#define BPF_PROG_ATTACH_LAST_FIELD replace_bpf_fd +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + enum bpf_prog_type ptype; + + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_SK_LOOKUP: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? 
+ -EINVAL : 0; + case BPF_PROG_TYPE_EXT: + return 0; + case BPF_PROG_TYPE_NETFILTER: + if (attach_type != BPF_NETFILTER) + return -EINVAL; + return 0; + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_TRACEPOINT: + if (attach_type != BPF_PERF_EVENT) + return -EINVAL; + return 0; + case BPF_PROG_TYPE_KPROBE: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_MULTI) + return -EINVAL; + if (attach_type != BPF_PERF_EVENT && + attach_type != BPF_TRACE_KPROBE_MULTI && + attach_type != BPF_TRACE_UPROBE_MULTI) + return -EINVAL; + return 0; + case BPF_PROG_TYPE_SCHED_CLS: + if (attach_type != BPF_TCX_INGRESS && + attach_type != BPF_TCX_EGRESS) + return -EINVAL; + return 0; + default: + ptype = attach_type_to_prog_type(attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) + return -EINVAL; + return 0; + } +} -#define BPF_F_ATTACH_MASK \ - (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI | BPF_F_REPLACE) +#define BPF_PROG_ATTACH_LAST_FIELD expected_revision + +#define BPF_F_ATTACH_MASK_BASE \ + (BPF_F_ALLOW_OVERRIDE | \ + BPF_F_ALLOW_MULTI | \ + BPF_F_REPLACE) + +#define BPF_F_ATTACH_MASK_MPROG \ + (BPF_F_REPLACE | \ + BPF_F_BEFORE | \ + BPF_F_AFTER | \ + BPF_F_ID | \ + BPF_F_LINK) static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; + u32 mask; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ATTACH_MASK) - return -EINVAL; - ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; + mask = bpf_mprog_supported(ptype) ? 
+ BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE; + if (attr->attach_flags & ~mask) + return -EINVAL; prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) @@ -3648,6 +3844,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) else ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; + case BPF_PROG_TYPE_SCHED_CLS: + ret = tcx_prog_attach(attr, prog); + break; default: ret = -EINVAL; } @@ -3657,25 +3856,41 @@ static int bpf_prog_attach(const union bpf_attr *attr) return ret; } -#define BPF_PROG_DETACH_LAST_FIELD attach_type +#define BPF_PROG_DETACH_LAST_FIELD expected_revision static int bpf_prog_detach(const union bpf_attr *attr) { + struct bpf_prog *prog = NULL; enum bpf_prog_type ptype; + int ret; if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); + if (bpf_mprog_supported(ptype)) { + if (ptype == BPF_PROG_TYPE_UNSPEC) + return -EINVAL; + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) + return -EINVAL; + if (attr->attach_bpf_fd) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + } + } switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_prog_detach(attr, ptype); + ret = sock_map_prog_detach(attr, ptype); + break; case BPF_PROG_TYPE_LIRC_MODE2: - return lirc_prog_detach(attr); + ret = lirc_prog_detach(attr); + break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - return netns_bpf_prog_detach(attr, ptype); + ret = netns_bpf_prog_detach(attr, ptype); + break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -3684,13 +3899,21 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_LSM: - return cgroup_bpf_prog_detach(attr, ptype); + ret = cgroup_bpf_prog_detach(attr, ptype); + break; + case BPF_PROG_TYPE_SCHED_CLS: + ret = tcx_prog_detach(attr, prog); + break; default: - 
return -EINVAL; + ret = -EINVAL; } + + if (prog) + bpf_prog_put(prog); + return ret; } -#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags +#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3738,6 +3961,9 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_SK_MSG_VERDICT: case BPF_SK_SKB_VERDICT: return sock_map_bpf_prog_query(attr, uattr); + case BPF_TCX_INGRESS: + case BPF_TCX_EGRESS: + return tcx_prog_query(attr, uattr); default: return -EINVAL; } @@ -4655,10 +4881,9 @@ err_put: return err; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies +#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid static int link_create(union bpf_attr *attr, bpfptr_t uattr) { - enum bpf_prog_type ptype; struct bpf_prog *prog; int ret; @@ -4678,38 +4903,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; switch (prog->type) { - case BPF_PROG_TYPE_EXT: - break; - case BPF_PROG_TYPE_NETFILTER: - if (attr->link_create.attach_type != BPF_NETFILTER) { - ret = -EINVAL; - goto out; - } - break; - case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_TRACEPOINT: - if (attr->link_create.attach_type != BPF_PERF_EVENT) { - ret = -EINVAL; - goto out; - } - break; - case BPF_PROG_TYPE_KPROBE: - if (attr->link_create.attach_type != BPF_PERF_EVENT && - attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { - ret = -EINVAL; - goto out; - } - break; - default: - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { - ret = -EINVAL; - goto out; - } - break; - } - - switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: @@ -4751,6 +4944,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; + case 
BPF_PROG_TYPE_SCHED_CLS: + ret = tcx_link_attach(attr, prog); + break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); break; @@ -4762,8 +4958,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_KPROBE: if (attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); - else + else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI) ret = bpf_kprobe_multi_link_attach(attr, prog); + else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI) + ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: ret = -EINVAL; @@ -5304,9 +5502,9 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) } run_ctx.bpf_cookie = 0; - run_ctx.saved_run_ctx = NULL; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ + __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); bpf_prog_put(prog); return -EBUSY; } diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c new file mode 100644 index 000000000000..13f0b5dc8262 --- /dev/null +++ b/kernel/bpf/tcx.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2023 Isovalent */ + +#include <linux/bpf.h> +#include <linux/bpf_mprog.h> +#include <linux/netdevice.h> + +#include <net/tcx.h> + +int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + bool created, ingress = attr->attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct bpf_mprog_entry *entry, *entry_new; + struct bpf_prog *replace_prog = NULL; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + if (attr->attach_flags & BPF_F_REPLACE) { + replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, + prog->type); + if (IS_ERR(replace_prog)) { + ret = PTR_ERR(replace_prog); + replace_prog = NULL; + goto out; + } + } + entry = tcx_entry_fetch_or_create(dev, ingress, &created); + if 
(!entry) { + ret = -ENOMEM; + goto out; + } + ret = bpf_mprog_attach(entry, &entry_new, prog, NULL, replace_prog, + attr->attach_flags, attr->relative_fd, + attr->expected_revision); + if (!ret) { + if (entry != entry_new) { + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_inc(ingress); + } + bpf_mprog_commit(entry); + } else if (created) { + tcx_entry_free(entry); + } +out: + if (replace_prog) + bpf_prog_put(replace_prog); + rtnl_unlock(); + return ret; +} + +int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + bool ingress = attr->attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, prog, NULL, attr->attach_flags, + attr->relative_fd, attr->expected_revision); + if (!ret) { + if (!tcx_entry_is_active(entry_new)) + entry_new = NULL; + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_dec(ingress); + bpf_mprog_commit(entry); + if (!entry_new) + tcx_entry_free(entry); + } +out: + rtnl_unlock(); + return ret; +} + +void tcx_uninstall(struct net_device *dev, bool ingress) +{ + struct bpf_mprog_entry *entry, *entry_new = NULL; + struct bpf_tuple tuple = {}; + struct bpf_mprog_fp *fp; + struct bpf_mprog_cp *cp; + bool active; + + entry = tcx_entry_fetch(dev, ingress); + if (!entry) + return; + active = tcx_entry(entry)->miniq_active; + if (active) + bpf_mprog_clear_all(entry, &entry_new); + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + bpf_mprog_foreach_tuple(entry, fp, cp, tuple) { + if (tuple.link) + tcx_link(tuple.link)->dev = NULL; + else + bpf_prog_put(tuple.prog); + tcx_skeys_dec(ingress); + } + if (!active) + 
tcx_entry_free(entry); +} + +int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) +{ + bool ingress = attr->query.attach_type == BPF_TCX_INGRESS; + struct net *net = current->nsproxy->net_ns; + struct bpf_mprog_entry *entry; + struct net_device *dev; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->query.target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_query(attr, uattr, entry); +out: + rtnl_unlock(); + return ret; +} + +static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, + u64 revision) +{ + struct tcx_link *tcx = tcx_link(link); + bool created, ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev = tcx->dev; + int ret; + + ASSERT_RTNL(); + entry = tcx_entry_fetch_or_create(dev, ingress, &created); + if (!entry) + return -ENOMEM; + ret = bpf_mprog_attach(entry, &entry_new, link->prog, link, NULL, flags, + id_or_fd, revision); + if (!ret) { + if (entry != entry_new) { + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_inc(ingress); + } + bpf_mprog_commit(entry); + } else if (created) { + tcx_entry_free(entry); + } + return ret; +} + +static void tcx_link_release(struct bpf_link *link) +{ + struct tcx_link *tcx = tcx_link(link); + bool ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = tcx->dev; + if (!dev) + goto out; + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_detach(entry, &entry_new, link->prog, link, 0, 0, 0); + if (!ret) { + if (!tcx_entry_is_active(entry_new)) + entry_new = NULL; + tcx_entry_update(dev, entry_new, ingress); + tcx_entry_sync(); + tcx_skeys_dec(ingress); + bpf_mprog_commit(entry); + if (!entry_new) + 
tcx_entry_free(entry); + tcx->dev = NULL; + } +out: + WARN_ON_ONCE(ret); + rtnl_unlock(); +} + +static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, + struct bpf_prog *oprog) +{ + struct tcx_link *tcx = tcx_link(link); + bool ingress = tcx->location == BPF_TCX_INGRESS; + struct bpf_mprog_entry *entry, *entry_new; + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = tcx->dev; + if (!dev) { + ret = -ENOLINK; + goto out; + } + if (oprog && link->prog != oprog) { + ret = -EPERM; + goto out; + } + oprog = link->prog; + if (oprog == nprog) { + bpf_prog_put(nprog); + goto out; + } + entry = tcx_entry_fetch(dev, ingress); + if (!entry) { + ret = -ENOENT; + goto out; + } + ret = bpf_mprog_attach(entry, &entry_new, nprog, link, oprog, + BPF_F_REPLACE | BPF_F_ID, + link->prog->aux->id, 0); + if (!ret) { + WARN_ON_ONCE(entry != entry_new); + oprog = xchg(&link->prog, nprog); + bpf_prog_put(oprog); + bpf_mprog_commit(entry); + } +out: + rtnl_unlock(); + return ret; +} + +static void tcx_link_dealloc(struct bpf_link *link) +{ + kfree(tcx_link(link)); +} + +static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) +{ + const struct tcx_link *tcx = tcx_link_const(link); + u32 ifindex = 0; + + rtnl_lock(); + if (tcx->dev) + ifindex = tcx->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); + seq_printf(seq, "attach_type:\t%u (%s)\n", + tcx->location, + tcx->location == BPF_TCX_INGRESS ? 
"ingress" : "egress"); +} + +static int tcx_link_fill_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + const struct tcx_link *tcx = tcx_link_const(link); + u32 ifindex = 0; + + rtnl_lock(); + if (tcx->dev) + ifindex = tcx->dev->ifindex; + rtnl_unlock(); + + info->tcx.ifindex = ifindex; + info->tcx.attach_type = tcx->location; + return 0; +} + +static int tcx_link_detach(struct bpf_link *link) +{ + tcx_link_release(link); + return 0; +} + +static const struct bpf_link_ops tcx_link_lops = { + .release = tcx_link_release, + .detach = tcx_link_detach, + .dealloc = tcx_link_dealloc, + .update_prog = tcx_link_update, + .show_fdinfo = tcx_link_fdinfo, + .fill_link_info = tcx_link_fill_info, +}; + +static int tcx_link_init(struct tcx_link *tcx, + struct bpf_link_primer *link_primer, + const union bpf_attr *attr, + struct net_device *dev, + struct bpf_prog *prog) +{ + bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog); + tcx->location = attr->link_create.attach_type; + tcx->dev = dev; + return bpf_link_prime(&tcx->link, link_primer); +} + +int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct net_device *dev; + struct tcx_link *tcx; + int ret; + + rtnl_lock(); + dev = __dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + tcx = kzalloc(sizeof(*tcx), GFP_USER); + if (!tcx) { + ret = -ENOMEM; + goto out; + } + ret = tcx_link_init(tcx, &link_primer, attr, dev, prog); + if (ret) { + kfree(tcx); + goto out; + } + ret = tcx_link_prog_attach(&tcx->link, attr->link_create.flags, + attr->link_create.tcx.relative_fd, + attr->link_create.tcx.expected_revision); + if (ret) { + tcx->dev = NULL; + bpf_link_cleanup(&link_primer); + goto out; + } + ret = bpf_link_settle(&link_primer); +out: + rtnl_unlock(); + return ret; +} diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 
78acf28d4873..53ff50cac61e 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -926,13 +926,12 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, migrate_disable(); might_fault(); + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); return 0; } - - run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); - return bpf_prog_start_time(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02a021c524ab..bb78212fa5b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -25,6 +25,8 @@ #include <linux/btf_ids.h> #include <linux/poison.h> #include <linux/module.h> +#include <linux/cpumask.h> +#include <net/xdp.h> #include "disasm.h" @@ -2854,7 +2856,10 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - off = i + insn[i].off + 1; + if (code == (BPF_JMP32 | BPF_JA)) + off = i + insn[i].imm + 1; + else + off = i + insn[i].off + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -2866,6 +2871,7 @@ next: * or unconditional jump back */ if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP32 | BPF_JA) && code != (BPF_JMP | BPF_JA)) { verbose(env, "last insn is not an exit or jmp\n"); return -EINVAL; @@ -3011,8 +3017,10 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (class == BPF_ALU64 && op == BPF_END && (insn->imm == 16 || insn->imm == 32)) + return false; + if (class == BPF_ALU64 || class == BPF_JMP || - /* BPF_END always use BPF_ALU class. 
*/ (class == BPF_ALU && op == BPF_END && insn->imm == 64)) return true; @@ -3420,7 +3428,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return 0; if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - /* dreg = sreg + /* dreg = sreg or dreg = (s8, s16, s32)sreg * dreg needs precision after this insn * sreg needs precision before this insn */ @@ -4982,20 +4990,22 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; + int perm_flags; const char *reg_name = ""; - /* Only unreferenced case accepts untrusted pointers */ - if (kptr_field->type == BPF_KPTR_UNREF) - perm_flags |= PTR_UNTRUSTED; + if (btf_is_kernel(reg->btf)) { + perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; + + /* Only unreferenced case accepts untrusted pointers */ + if (kptr_field->type == BPF_KPTR_UNREF) + perm_flags |= PTR_UNTRUSTED; + } else { + perm_flags = PTR_MAYBE_NULL | MEM_ALLOC; + } if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) goto bad_type; - if (!btf_is_kernel(reg->btf)) { - verbose(env, "R%d must point to kernel BTF\n", regno); - return -EINVAL; - } /* We need to verify reg->type and reg->btf, before accessing reg->btf */ reg_name = btf_type_name(reg->btf, reg->btf_id); @@ -5008,7 +5018,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; - /* A full type match is needed, as BTF can be vmlinux or module BTF, and + /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and * we also need to take into account the reg->off. 
* * We want to support cases like: @@ -5054,7 +5064,9 @@ bad_type: */ static bool in_rcu_cs(struct bpf_verifier_env *env) { - return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable; + return env->cur_state->active_rcu_lock || + env->cur_state->active_lock.ptr || + !env->prog->aux->sleepable; } /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ @@ -5412,12 +5424,25 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_FLOW_KEYS; } +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif + [CONST_PTR_TO_MAP] = btf_bpf_map_id, +}; + static bool is_trusted_reg(const struct bpf_reg_state *reg) { /* A referenced register is always trusted. */ if (reg->ref_obj_id) return true; + /* Types listed in the reg2btf_ids are always trusted */ + if (reg2btf_ids[base_type(reg->type)]) + return true; + /* If a register is not referenced, it is trusted if it has the * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. 
Some of the * other type modifiers may be safe, but we elect to take an opt-in @@ -5813,6 +5838,147 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) __reg_combine_64_into_32(reg); } +static void set_sext64_default_val(struct bpf_reg_state *reg, int size) +{ + if (size == 1) { + reg->smin_value = reg->s32_min_value = S8_MIN; + reg->smax_value = reg->s32_max_value = S8_MAX; + } else if (size == 2) { + reg->smin_value = reg->s32_min_value = S16_MIN; + reg->smax_value = reg->s32_max_value = S16_MAX; + } else { + /* size == 4 */ + reg->smin_value = reg->s32_min_value = S32_MIN; + reg->smax_value = reg->s32_max_value = S32_MAX; + } + reg->umin_value = reg->u32_min_value = 0; + reg->umax_value = U64_MAX; + reg->u32_max_value = U32_MAX; + reg->var_off = tnum_unknown; +} + +static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) +{ + s64 init_s64_max, init_s64_min, s64_max, s64_min, u64_cval; + u64 top_smax_value, top_smin_value; + u64 num_bits = size * 8; + + if (tnum_is_const(reg->var_off)) { + u64_cval = reg->var_off.value; + if (size == 1) + reg->var_off = tnum_const((s8)u64_cval); + else if (size == 2) + reg->var_off = tnum_const((s16)u64_cval); + else + /* size == 4 */ + reg->var_off = tnum_const((s32)u64_cval); + + u64_cval = reg->var_off.value; + reg->smax_value = reg->smin_value = u64_cval; + reg->umax_value = reg->umin_value = u64_cval; + reg->s32_max_value = reg->s32_min_value = u64_cval; + reg->u32_max_value = reg->u32_min_value = u64_cval; + return; + } + + top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits; + top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits; + + if (top_smax_value != top_smin_value) + goto out; + + /* find the s64_min and s64_min after sign extension */ + if (size == 1) { + init_s64_max = (s8)reg->smax_value; + init_s64_min = (s8)reg->smin_value; + } else if (size == 2) { + init_s64_max = (s16)reg->smax_value; + init_s64_min = (s16)reg->smin_value; + } else { + init_s64_max = 
(s32)reg->smax_value; + init_s64_min = (s32)reg->smin_value; + } + + s64_max = max(init_s64_max, init_s64_min); + s64_min = min(init_s64_max, init_s64_min); + + /* both of s64_max/s64_min positive or negative */ + if ((s64_max >= 0) == (s64_min >= 0)) { + reg->smin_value = reg->s32_min_value = s64_min; + reg->smax_value = reg->s32_max_value = s64_max; + reg->umin_value = reg->u32_min_value = s64_min; + reg->umax_value = reg->u32_max_value = s64_max; + reg->var_off = tnum_range(s64_min, s64_max); + return; + } + +out: + set_sext64_default_val(reg, size); +} + +static void set_sext32_default_val(struct bpf_reg_state *reg, int size) +{ + if (size == 1) { + reg->s32_min_value = S8_MIN; + reg->s32_max_value = S8_MAX; + } else { + /* size == 2 */ + reg->s32_min_value = S16_MIN; + reg->s32_max_value = S16_MAX; + } + reg->u32_min_value = 0; + reg->u32_max_value = U32_MAX; +} + +static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) +{ + s32 init_s32_max, init_s32_min, s32_max, s32_min, u32_val; + u32 top_smax_value, top_smin_value; + u32 num_bits = size * 8; + + if (tnum_is_const(reg->var_off)) { + u32_val = reg->var_off.value; + if (size == 1) + reg->var_off = tnum_const((s8)u32_val); + else + reg->var_off = tnum_const((s16)u32_val); + + u32_val = reg->var_off.value; + reg->s32_min_value = reg->s32_max_value = u32_val; + reg->u32_min_value = reg->u32_max_value = u32_val; + return; + } + + top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits; + top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits; + + if (top_smax_value != top_smin_value) + goto out; + + /* find the s32_min and s32_min after sign extension */ + if (size == 1) { + init_s32_max = (s8)reg->s32_max_value; + init_s32_min = (s8)reg->s32_min_value; + } else { + /* size == 2 */ + init_s32_max = (s16)reg->s32_max_value; + init_s32_min = (s16)reg->s32_min_value; + } + s32_max = max(init_s32_max, init_s32_min); + s32_min = min(init_s32_max, init_s32_min); + + if 
((s32_min >= 0) == (s32_max >= 0)) { + reg->s32_min_value = s32_min; + reg->s32_max_value = s32_max; + reg->u32_min_value = (u32)s32_min; + reg->u32_max_value = (u32)s32_max; + return; + } + +out: + set_sext32_default_val(reg, size); +} + static bool bpf_map_is_rdonly(const struct bpf_map *map) { /* A map is considered read-only if the following condition are true: @@ -5833,7 +5999,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map *map) !bpf_map_write_active(map); } -static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) +static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, + bool is_ldsx) { void *ptr; u64 addr; @@ -5846,13 +6013,13 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) switch (size) { case sizeof(u8): - *val = (u64)*(u8 *)ptr; + *val = is_ldsx ? (s64)*(s8 *)ptr : (u64)*(u8 *)ptr; break; case sizeof(u16): - *val = (u64)*(u16 *)ptr; + *val = is_ldsx ? (s64)*(s16 *)ptr : (u64)*(u16 *)ptr; break; case sizeof(u32): - *val = (u64)*(u32 *)ptr; + *val = is_ldsx ? 
(s64)*(s32 *)ptr : (u64)*(u32 *)ptr; break; case sizeof(u64): *val = *(u64 *)ptr; @@ -6085,6 +6252,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, type_is_rcu_or_null(env, reg, field_name, btf_id)) { /* __rcu tagged pointers can be NULL */ flag |= MEM_RCU | PTR_MAYBE_NULL; + + /* We always trust them */ + if (type_is_rcu_or_null(env, reg, field_name, btf_id) && + flag & PTR_UNTRUSTED) + flag &= ~PTR_UNTRUSTED; } else if (flag & (MEM_PERCPU | MEM_USER)) { /* keep as-is */ } else { @@ -6266,7 +6438,7 @@ static int check_stack_access_within_bounds( */ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, - int value_regno, bool strict_alignment_once) + int value_regno, bool strict_alignment_once, bool is_ldsx) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -6327,7 +6499,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn u64 val = 0; err = bpf_map_direct_read(map, map_off, size, - &val); + &val, is_ldsx); if (err) return err; @@ -6497,8 +6669,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && regs[value_regno].type == SCALAR_VALUE) { - /* b/h/w load zero-extends, mark upper bits as known 0 */ - coerce_reg_to_size(&regs[value_regno], size); + if (!is_ldsx) + /* b/h/w load zero-extends, mark upper bits as known 0 */ + coerce_reg_to_size(&regs[value_regno], size); + else + coerce_reg_to_size_sx(&regs[value_regno], size); } return err; } @@ -6590,17 +6765,17 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i * case to simulate the register fill.
*/ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, - true); + true, false); if (err) return err; /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); + BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; @@ -6846,7 +7021,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return zero_size_allowed ? 0 : -EACCES; return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - atype, -1, false); + atype, -1, false, false); } fallthrough; @@ -7218,7 +7393,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, regno, - i, BPF_DW, BPF_WRITE, -1, false); + i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; } @@ -7311,7 +7486,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { err = check_mem_access(env, insn_idx, regno, - i, BPF_DW, BPF_WRITE, -1, false); + i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; } @@ -7745,7 +7920,10 @@ found: verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } - /* Handled by helper specific checks */ + if (meta->func_id == BPF_FUNC_kptr_xchg) { + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + return -EACCES; + } break; case PTR_TO_BTF_ID | MEM_PERCPU: case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED: @@ -7797,17 +7975,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, if (arg_type_is_dynptr(arg_type) && type 
== PTR_TO_STACK) return 0; - if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) { - if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT)) - return __check_ptr_off_reg(env, reg, regno, true); - - verbose(env, "R%d must have zero offset when passed to release func\n", - regno); - verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, - btf_type_name(reg->btf, reg->btf_id), reg->off); - return -EINVAL; - } - /* Doing check_ptr_off_reg check for the offset will catch this * because fixed_off_ok is false, but checking here allows us * to give the user a better error message. @@ -7842,6 +8009,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset * can be non-zero. This was already checked above. 
So pass @@ -9135,19 +9303,33 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, { struct bpf_reg_state *ret_reg = &regs[BPF_REG_0]; - if (ret_type != RET_INTEGER || - (func_id != BPF_FUNC_get_stack && - func_id != BPF_FUNC_get_task_stack && - func_id != BPF_FUNC_probe_read_str && - func_id != BPF_FUNC_probe_read_kernel_str && - func_id != BPF_FUNC_probe_read_user_str)) + if (ret_type != RET_INTEGER) return; - ret_reg->smax_value = meta->msize_max_value; - ret_reg->s32_max_value = meta->msize_max_value; - ret_reg->smin_value = -MAX_ERRNO; - ret_reg->s32_min_value = -MAX_ERRNO; - reg_bounds_sync(ret_reg); + switch (func_id) { + case BPF_FUNC_get_stack: + case BPF_FUNC_get_task_stack: + case BPF_FUNC_probe_read_str: + case BPF_FUNC_probe_read_kernel_str: + case BPF_FUNC_probe_read_user_str: + ret_reg->smax_value = meta->msize_max_value; + ret_reg->s32_max_value = meta->msize_max_value; + ret_reg->smin_value = -MAX_ERRNO; + ret_reg->s32_min_value = -MAX_ERRNO; + reg_bounds_sync(ret_reg); + break; + case BPF_FUNC_get_smp_processor_id: + ret_reg->umax_value = nr_cpu_ids - 1; + ret_reg->u32_max_value = nr_cpu_ids - 1; + ret_reg->smax_value = nr_cpu_ids - 1; + ret_reg->s32_max_value = nr_cpu_ids - 1; + ret_reg->umin_value = 0; + ret_reg->u32_min_value = 0; + ret_reg->smin_value = 0; + ret_reg->s32_min_value = 0; + reg_bounds_sync(ret_reg); + break; + } } static int @@ -9441,7 +9623,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn */ for (i = 0; i < meta.access_size; i++) { err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, - BPF_WRITE, -1, false); + BPF_WRITE, -1, false, false); if (err) return err; } @@ -10050,15 +10232,6 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, return true; } - -static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { -#ifdef CONFIG_NET - [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], - [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], -
[PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], -#endif -}; - enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CTX, KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ @@ -10303,6 +10476,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_verifier_state *state = env->cur_state; + struct btf_record *rec = reg_btf_record(reg); if (!state->active_lock.ptr) { verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); @@ -10315,6 +10489,9 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state } reg->type |= NON_OWN_REF; + if (rec->refcount_off >= 0) + reg->type |= MEM_RCU; + return 0; } @@ -11047,10 +11224,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); return -EINVAL; } - if (rec->refcount_off >= 0) { - verbose(env, "bpf_refcount_acquire calls are disabled for now\n"); - return -EINVAL; - } + meta->arg_btf = reg->btf; meta->arg_btf_id = reg->btf_id; break; @@ -11155,6 +11329,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_func_state *state; struct bpf_reg_state *reg; + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); + return -EACCES; + } + if (rcu_lock) { verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); return -EINVAL; @@ -12907,7 +13086,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || - BPF_CLASS(insn->code) == BPF_ALU64) { + (BPF_CLASS(insn->code) == BPF_ALU64 && + BPF_SRC(insn->code) != BPF_TO_LE)) { verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; 
} @@ -12932,11 +13112,24 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off != 0) { + if (insn->imm != 0) { verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } + if (BPF_CLASS(insn->code) == BPF_ALU) { + if (insn->off != 0 && insn->off != 8 && insn->off != 16) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } else { + if (insn->off != 0 && insn->off != 8 && insn->off != 16 && + insn->off != 32) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } + /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -12960,18 +13153,42 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) !tnum_is_const(src_reg->var_off); if (BPF_CLASS(insn->code) == BPF_ALU64) { - /* case: R1 = R2 - * copy register state to dest reg - */ - if (need_id) - /* Assign src and dst registers the same ID - * that will be used by find_equal_scalars() - * to propagate min/max range. + if (insn->off == 0) { + /* case: R1 = R2 + * copy register state to dest reg */ - src_reg->id = ++env->id_gen; - copy_register_state(dst_reg, src_reg); - dst_reg->live |= REG_LIVE_WRITTEN; - dst_reg->subreg_def = DEF_NOT_SUBREG; + if (need_id) + /* Assign src and dst registers the same ID + * that will be used by find_equal_scalars() + * to propagate min/max range. 
+ */ + src_reg->id = ++env->id_gen; + copy_register_state(dst_reg, src_reg); + dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; + } else { + /* case: R1 = (s8, s16 s32)R2 */ + if (is_pointer_value(env, insn->src_reg)) { + verbose(env, + "R%d sign-extension part of pointer\n", + insn->src_reg); + return -EACCES; + } else if (src_reg->type == SCALAR_VALUE) { + bool no_sext; + + no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + if (no_sext && need_id) + src_reg->id = ++env->id_gen; + copy_register_state(dst_reg, src_reg); + if (!no_sext) + dst_reg->id = 0; + coerce_reg_to_size_sx(dst_reg, insn->off >> 3); + dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; + } else { + mark_reg_unknown(env, regs, insn->dst_reg); + } + } } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -12980,19 +13197,33 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { - bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; - - if (is_src_reg_u32 && need_id) - src_reg->id = ++env->id_gen; - copy_register_state(dst_reg, src_reg); - /* Make sure ID is cleared if src_reg is not in u32 range otherwise - * dst_reg min/max could be incorrectly - * propagated into src_reg by find_equal_scalars() - */ - if (!is_src_reg_u32) - dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; - dst_reg->subreg_def = env->insn_idx + 1; + if (insn->off == 0) { + bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX; + + if (is_src_reg_u32 && need_id) + src_reg->id = ++env->id_gen; + copy_register_state(dst_reg, src_reg); + /* Make sure ID is cleared if src_reg is not in u32 + * range otherwise dst_reg min/max could be incorrectly + * propagated into src_reg by find_equal_scalars() + */ + if (!is_src_reg_u32) + dst_reg->id = 0; + dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; + } else { + /* case: W1 = 
(s8, s16)W2 */ + bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + + if (no_sext && need_id) + src_reg->id = ++env->id_gen; + copy_register_state(dst_reg, src_reg); + if (!no_sext) + dst_reg->id = 0; + dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; + coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); + } } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -13023,7 +13254,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || insn->off != 0) { + if (insn->imm != 0 || insn->off > 1 || + (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } @@ -13032,7 +13264,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; } else { - if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + if (insn->src_reg != BPF_REG_0 || insn->off > 1 || + (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } @@ -13817,6 +14050,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EINVAL; } + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg = &regs[insn->dst_reg]; if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -13828,12 +14067,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; - if (is_pointer_value(env, insn->src_reg)) { + src_reg = &regs[insn->src_reg]; + if (!(reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg)) && + is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } - src_reg = &regs[insn->src_reg]; } else { if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); @@ -13841,12 +14081,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - /* check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg = &regs[insn->dst_reg]; is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; if (BPF_SRC(insn->code) == BPF_K) { @@ -14576,7 +14810,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret; + int ret, off; if (bpf_pseudo_func(insn)) return visit_func_call_insn(t, insns, env, true); @@ -14624,14 +14858,19 @@ static int visit_insn(int t, struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; + if (BPF_CLASS(insn->code) == BPF_JMP) + off = insn->off; + else + off = insn->imm; + /* unconditional jump with single edge */ - ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env, + ret = push_insn(t, t + off + 1, FALLTHROUGH, env, true); if (ret) return ret; - mark_prune_point(env, t + insn->off + 1); - mark_jmp_point(env, t + insn->off + 1); + mark_prune_point(env, t + off + 1); + mark_jmp_point(env, t + off + 1); return ret; @@ -16178,7 +16417,7 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * Have to support a use case when one path through * the program yields TRUSTED pointer while another * is UNTRUSTED. Fallback to UNTRUSTED to generate - * BPF_PROBE_MEM. + * BPF_PROBE_MEM/BPF_PROBE_MEMSX.
*/ *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; } else { @@ -16319,7 +16558,8 @@ static int do_check(struct bpf_verifier_env *env) */ err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false); + BPF_READ, insn->dst_reg, false, + BPF_MODE(insn->code) == BPF_MEMSX); if (err) return err; @@ -16356,7 +16596,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false); + BPF_WRITE, insn->src_reg, false, false); if (err) return err; @@ -16381,7 +16621,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), - BPF_WRITE, -1, false); + BPF_WRITE, -1, false, false); if (err) return err; @@ -16426,15 +16666,18 @@ static int do_check(struct bpf_verifier_env *env) mark_reg_scratched(env, BPF_REG_0); } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || - insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { + (class == BPF_JMP && insn->imm != 0) || + (class == BPF_JMP32 && insn->off != 0)) { verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } - env->insn_idx += insn->off + 1; + if (class == BPF_JMP) + env->insn_idx += insn->off + 1; + else + env->insn_idx += insn->imm + 1; continue; } else if (opcode == BPF_EXIT) { @@ -16453,7 +16696,8 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_rcu_lock) { + if (env->cur_state->active_rcu_lock && + !in_rbtree_lock_required_cb(env)) { verbose(env, "bpf_rcu_read_unlock is missing\n"); return -EINVAL; } @@ -16733,11 +16977,6 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "tracing progs cannot use 
bpf_spin_lock yet\n"); return -EINVAL; } - - if (prog->aux->sleepable) { - verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; - } } if (btf_record_has_field(map->record, BPF_TIMER)) { @@ -16809,7 +17048,8 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { + ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) || + insn->imm != 0)) { verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } @@ -17280,13 +17520,13 @@ static bool insn_is_cond_jump(u8 code) { u8 op; + op = BPF_OP(code); if (BPF_CLASS(code) == BPF_JMP32) - return true; + return op != BPF_JA; if (BPF_CLASS(code) != BPF_JMP) return false; - op = BPF_OP(code); return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; } @@ -17503,11 +17743,15 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; + u8 mode; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { + insn->code == (BPF_LDX | BPF_MEM | BPF_DW) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) { type = BPF_READ; } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || insn->code == (BPF_STX | BPF_MEM | BPF_H) || @@ -17566,8 +17810,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: if (type == BPF_READ) { - insn->code = BPF_LDX | BPF_PROBE_MEM | - BPF_SIZE((insn)->code); + if (BPF_MODE(insn->code) == BPF_MEM) + insn->code = BPF_LDX | BPF_PROBE_MEM | + BPF_SIZE((insn)->code); + else + insn->code = BPF_LDX | BPF_PROBE_MEMSX | + BPF_SIZE((insn)->code); 
env->prog->aux->num_exentries++; } continue; @@ -17577,6 +17825,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; size = BPF_LDST_BYTES(insn); + mode = BPF_MODE(insn->code); /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific @@ -17636,6 +17885,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) (1ULL << size * 8) - 1); } } + if (mode == BPF_MEMSX) + insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, + insn->dst_reg, insn->dst_reg, + size * 8, 0); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) @@ -17755,7 +18008,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && - BPF_MODE(insn->code) == BPF_PROBE_MEM) + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; } func[i]->aux->num_exentries = num_exentries; @@ -18027,6 +18281,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && + !kptr_struct_meta) { + verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + insn_buf[0] = addr[0]; insn_buf[1] = addr[1]; insn_buf[2] = *insn; @@ -18034,6 +18295,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + struct btf_struct_meta 
*kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; int struct_meta_reg = BPF_REG_3; int node_offset_reg = BPF_REG_4; @@ -18043,6 +18305,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, node_offset_reg = BPF_REG_5; } + if (!kptr_struct_meta) { + verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", + insn_idx); + return -EFAULT; + } + __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, node_offset_reg, insn, insn_buf, cnt); } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || diff --git a/kernel/capability.c b/kernel/capability.c index 1a2795102ae4..dac4df77e376 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -112,7 +112,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, int ret; if (pid && (pid != task_pid_vnr(current))) { - struct task_struct *target; + const struct task_struct *target; rcu_read_lock(); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 83044312bc41..c487ffef6652 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -431,7 +431,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) if (l->list[mid] == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (l->list[mid] < pid) index = mid + 1; else end = mid; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f55a40db065f..1fb7f562289d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -493,28 +493,6 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, } /** - * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest - * - * Find and get @cgrp's css associated with @ss. If the css doesn't exist - * or is offline, %NULL is returned. 
- */ -static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - css = cgroup_css(cgrp, ss); - if (css && !css_tryget_online(css)) - css = NULL; - rcu_read_unlock(); - - return css; -} - -/** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) @@ -679,7 +657,7 @@ EXPORT_SYMBOL_GPL(of_css); * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_[tree_]mutex. + * Should be called under cgroup_mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ @@ -929,7 +907,7 @@ static void css_set_move_task(struct task_struct *task, #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); -static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) +static unsigned long css_set_hash(struct cgroup_subsys_state **css) { unsigned long key = 0UL; struct cgroup_subsys *ss; @@ -1070,7 +1048,7 @@ static bool compare_css_sets(struct css_set *cset, */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, - struct cgroup_subsys_state *template[]) + struct cgroup_subsys_state **template) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; @@ -1736,7 +1714,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) struct cftype *cfts, *failed_cfts; int ret; - if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + if (css->flags & CSS_VISIBLE) return 0; if (!css->ss) { @@ -2499,7 +2477,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, /* * This function may be called both before and - * after cgroup_taskset_migrate(). The two cases + * after cgroup_migrate_execute(). 
The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ @@ -3654,9 +3632,32 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } -static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, - struct cgroup *cgrp, int ssid) +#ifdef CONFIG_CGROUP_SCHED +/** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css associated with @ss. If the css doesn't exist + * or is offline, %NULL is returned. + */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (css && !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + +static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) { + struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; @@ -3673,14 +3674,44 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, return ret; } +static int cgroup_local_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) +{ + struct cgroup_subsys *ss = cgroup_subsys[ssid]; + struct cgroup_subsys_state *css; + int ret; + + if (!ss->css_local_stat_show) + return 0; + + css = cgroup_tryget_css(cgrp, ss); + if (!css) + return 0; + + ret = ss->css_local_stat_show(seq, css); + css_put(css); + return ret; +} +#endif + static int cpu_stat_show(struct seq_file *seq, void *v) { - struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED - ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); + ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); +#endif + return ret; +} + +static int cpu_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup 
__maybe_unused *cgrp = seq_css(seq)->cgroup; + int ret = 0; + +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } @@ -4320,14 +4351,13 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return ret; } -static int cgroup_rm_cftypes_locked(struct cftype *cfts) +static void cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); - return 0; } /** @@ -4343,8 +4373,6 @@ static int cgroup_rm_cftypes_locked(struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - int ret; - if (!cfts || cfts[0].name[0] == '\0') return 0; @@ -4352,9 +4380,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) return -ENOENT; cgroup_lock(); - ret = cgroup_rm_cftypes_locked(cfts); + cgroup_rm_cftypes_locked(cfts); cgroup_unlock(); - return ret; + return 0; } /** @@ -5235,6 +5263,10 @@ static struct cftype cgroup_base_files[] = { .name = "cpu.stat", .seq_show = cpu_stat_show, }, + { + .name = "cpu.stat.local", + .seq_show = cpu_local_stat_show, + }, { } /* terminate */ }; @@ -5303,7 +5335,7 @@ static struct cftype cgroup_psi_files[] = { * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in - * css_free_work_fn(). + * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional @@ -5547,8 +5579,7 @@ err_free_css: /* * The returned cgroup is fully initialized including its control mask, but - * it isn't associated with its kernfs_node and doesn't have the control - * mask applied. + * it doesn't have the control mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, umode_t mode) @@ -5874,7 +5905,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp and the associated csets dead. 
The former prevents * further task migration and child creation by disabling - * cgroup_lock_live_group(). The latter makes the csets ignored by + * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; @@ -5896,7 +5927,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); - for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { + for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; /* @@ -6089,8 +6120,8 @@ int __init cgroup_init(void) continue; if (cgroup1_ssid_disabled(ssid)) - printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", - ss->name); + pr_info("Disabling %s control group subsystem in v1 mounts\n", + ss->name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 58e6f18f01c1..58ec88efa4f8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1230,7 +1230,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) /* * Percpu kthreads in top_cpuset are ignored */ - if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) + if (kthread_is_per_cpu(task)) continue; cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus); } else { @@ -1255,7 +1255,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) static void compute_effective_cpumask(struct cpumask *new_cpus, struct cpuset *cs, struct cpuset *parent) { - if (parent->nr_subparts_cpus) { + if (parent->nr_subparts_cpus && is_partition_valid(cs)) { cpumask_or(new_cpus, parent->effective_cpus, parent->subparts_cpus); cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); @@ -1277,6 +1277,52 @@ enum subparts_cmd { static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); +static void update_sibling_cpumasks(struct cpuset *parent, struct 
cpuset *cs, + struct tmpmasks *tmp); + +/* + * Update partition exclusive flag + * + * Return: 0 if successful, an error code otherwise + */ +static int update_partition_exclusive(struct cpuset *cs, int new_prs) +{ + bool exclusive = (new_prs > 0); + + if (exclusive && !is_cpu_exclusive(cs)) { + if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + return PERR_NOTEXCL; + } else if (!exclusive && is_cpu_exclusive(cs)) { + /* Turning off CS_CPU_EXCLUSIVE will not return error */ + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + return 0; +} + +/* + * Update partition load balance flag and/or rebuild sched domain + * + * Changing load balance flag will automatically call + * rebuild_sched_domains_locked(). + */ +static void update_partition_sd_lb(struct cpuset *cs, int old_prs) +{ + int new_prs = cs->partition_root_state; + bool new_lb = (new_prs != PRS_ISOLATED); + bool rebuild_domains = (new_prs > 0) || (old_prs > 0); + + if (new_lb != !!is_sched_load_balance(cs)) { + rebuild_domains = true; + if (new_lb) + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + } + + if (rebuild_domains) + rebuild_sched_domains_locked(); +} + /** * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1336,8 +1382,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, return is_partition_invalid(parent) ? 
PERR_INVPARENT : PERR_NOTPART; } - if ((newmask && cpumask_empty(newmask)) || - (!newmask && cpumask_empty(cs->cpus_allowed))) + if (!newmask && cpumask_empty(cs->cpus_allowed)) return PERR_CPUSEMPTY; /* @@ -1404,10 +1449,15 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, adding = cpumask_andnot(tmp->addmask, tmp->addmask, parent->subparts_cpus); /* + * Empty cpumask is not allowed + */ + if (cpumask_empty(newmask)) { + part_error = PERR_CPUSEMPTY; + /* * Make partition invalid if parent's effective_cpus could * become empty and there are tasks in the parent. */ - if (adding && + } else if (adding && cpumask_subset(parent->effective_cpus, tmp->addmask) && !cpumask_intersects(tmp->delmask, cpu_active_mask) && partition_is_populated(parent, cs)) { @@ -1480,14 +1530,13 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, /* * Transitioning between invalid to valid or vice versa may require - * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE. + * changing CS_CPU_EXCLUSIVE. */ if (old_prs != new_prs) { - if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) && - (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0)) - return PERR_NOTEXCL; - if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs)) - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + int err = update_partition_exclusive(cs, new_prs); + + if (err) + return err; } /* @@ -1520,24 +1569,34 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, spin_unlock_irq(&callback_lock); - if (adding || deleting) + if (adding || deleting) { update_tasks_cpumask(parent, tmp->addmask); + if (parent->child_ecpus_count) + update_sibling_cpumasks(parent, cs, tmp); + } /* - * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. - * rebuild_sched_domains_locked() may be called. + * For partcmd_update without newmask, it is being called from + * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. 
+ * Update the load balance flag and scheduling domain if + * cpus_read_trylock() is successful. */ - if (old_prs != new_prs) { - if (old_prs == PRS_ISOLATED) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); - else if (new_prs == PRS_ISOLATED) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + update_partition_sd_lb(cs, old_prs); + cpus_read_unlock(); } + notify_partition_change(cs, old_prs); return 0; } /* + * update_cpumasks_hier() flags + */ +#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ +#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ + +/* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup @@ -1551,7 +1610,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, - bool force) + int flags) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1588,11 +1647,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, } /* - * Skip the whole subtree if the cpumask remains the same - * and has no partition root state and force flag not set. + * Skip the whole subtree if + * 1) the cpumask remains the same, + * 2) has no partition root state, + * 3) HIER_CHECKALL flag not set, and + * 4) for v2 load balance state same as its parent. 
*/ - if (!cp->partition_root_state && !force && - cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { + if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && + cpumask_equal(tmp->new_cpus, cp->effective_cpus) && + (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { pos_css = css_rightmost_descendant(pos_css); continue; } @@ -1676,6 +1740,20 @@ update_parent_subparts: update_tasks_cpumask(cp, tmp->new_cpus); /* + * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE + * from parent if current cpuset isn't a valid partition root + * and their load balance states differ. + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_partition_valid(cp) && + (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { + if (is_sched_load_balance(parent)) + set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + else + clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); + } + + /* * On legacy hierarchy, if the effective cpumask of any non- * empty cpuset is changed, we need to rebuild sched domains. * On default hierarchy, the cpuset needs to be a partition @@ -1692,7 +1770,7 @@ update_parent_subparts: } rcu_read_unlock(); - if (need_rebuild_sched_domains) + if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) rebuild_sched_domains_locked(); } @@ -1716,7 +1794,9 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * to use the right effective_cpus value. * * The update_cpumasks_hier() function may sleep. So we have to - * release the RCU read lock before calling it. + * release the RCU read lock before calling it. HIER_NO_SD_REBUILD + * flag is used to suppress rebuild of sched domains as the callers + * will take care of that. 
*/ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1728,7 +1808,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp, false); + update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); rcu_read_lock(); css_put(&sibling->css); } @@ -1747,6 +1827,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, int retval; struct tmpmasks tmp; bool invalidate = false; + int old_prs = cs->partition_root_state; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -1774,18 +1855,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; -#ifdef CONFIG_CPUMASK_OFFSTACK - /* - * Use the cpumasks in trialcs for tmpmasks when they are pointers - * to allocated cpumasks. - * - * Note that update_parent_subparts_cpumask() uses only addmask & - * delmask, but not new_cpus. 
- */ - tmp.addmask = trialcs->subparts_cpus; - tmp.delmask = trialcs->effective_cpus; - tmp.new_cpus = NULL; -#endif + if (alloc_cpumasks(NULL, &tmp)) + return -ENOMEM; retval = validate_change(cs, trialcs); @@ -1814,7 +1885,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, retval = 0; } if (retval < 0) - return retval; + goto out_free; if (cs->partition_root_state) { if (invalidate) @@ -1849,13 +1920,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } spin_unlock_irq(&callback_lock); -#ifdef CONFIG_CPUMASK_OFFSTACK - /* Now trialcs->cpus_allowed is available */ - tmp.new_cpus = trialcs->cpus_allowed; -#endif - /* effective_cpus will be updated here */ - update_cpumasks_hier(cs, &tmp, false); + update_cpumasks_hier(cs, &tmp, 0); if (cs->partition_root_state) { struct cpuset *parent = parent_cs(cs); @@ -1866,7 +1932,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, */ if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmp); + + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */ + update_partition_sd_lb(cs, old_prs); } +out_free: + free_cpumasks(NULL, &tmp); return 0; } @@ -2242,7 +2313,6 @@ out: static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; - bool sched_domain_rebuilt = false; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; @@ -2261,45 +2331,26 @@ static int update_prstate(struct cpuset *cs, int new_prs) if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; + err = update_partition_exclusive(cs, new_prs); + if (err) + goto out; + if (!old_prs) { /* - * Turning on partition root requires setting the - * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed - * cannot be empty. + * cpus_allowed cannot be empty. 
*/ if (cpumask_empty(cs->cpus_allowed)) { err = PERR_CPUSEMPTY; goto out; } - err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); - if (err) { - err = PERR_NOTEXCL; - goto out; - } - err = update_parent_subparts_cpumask(cs, partcmd_enable, NULL, &tmpmask); - if (err) { - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - goto out; - } - - if (new_prs == PRS_ISOLATED) { - /* - * Disable the load balance flag should not return an - * error unless the system is running out of memory. - */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - sched_domain_rebuilt = true; - } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED)); - sched_domain_rebuilt = true; - goto out; /* Sched domain is rebuilt in update_flag() */ + ; } else { /* * Switching back to member is always allowed even if it @@ -2318,40 +2369,31 @@ static int update_prstate(struct cpuset *cs, int new_prs) compute_effective_cpumask(cs->effective_cpus, cs, parent); spin_unlock_irq(&callback_lock); } - - /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - - if (!is_sched_load_balance(cs)) { - /* Make sure load balance is on */ - update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); - sched_domain_rebuilt = true; - } } - - update_tasks_cpumask(parent, tmpmask.new_cpus); - - if (parent->child_ecpus_count) - update_sibling_cpumasks(parent, cs, &tmpmask); - - if (!sched_domain_rebuilt) - rebuild_sched_domains_locked(); out: /* - * Make partition invalid if an error happen + * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error + * happens. */ - if (err) + if (err) { new_prs = -new_prs; + update_partition_exclusive(cs, new_prs); + } + spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; WRITE_ONCE(cs->prs_err, err); spin_unlock_irq(&callback_lock); + /* * Update child cpusets, if present. * Force update if switching back to member. 
*/ if (!list_empty(&cs->css.children)) - update_cpumasks_hier(cs, &tmpmask, !new_prs); + update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0); + + /* Update sched domains and load balance flag */ + update_partition_sd_lb(cs, old_prs); notify_partition_change(cs, old_prs); free_cpumasks(NULL, &tmpmask); @@ -2487,6 +2529,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; + bool cpus_updated, mems_updated; int ret; /* used later by cpuset_attach() */ @@ -2501,13 +2544,25 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) if (ret) goto out_unlock; + cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; - ret = security_task_setscheduler(task); - if (ret) - goto out_unlock; + + /* + * Skip rights over task check in v2 when nothing changes, + * migration permission derives from hierarchy ownership in + * cgroup_procs_write_permission(). 
+ */ + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (cpus_updated || mems_updated)) { + ret = security_task_setscheduler(task); + if (ret) + goto out_unlock; + } if (dl_task(task)) { cs->nr_migrate_dl_tasks++; @@ -3222,6 +3277,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->use_parent_ecpus = true; parent->child_ecpus_count++; } + + /* + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) @@ -3521,17 +3584,16 @@ hotplug_update_tasks_legacy(struct cpuset *cs, is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); - mutex_unlock(&cpuset_mutex); - /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into * cpuset. Should be done outside any lock. */ - if (is_empty) + if (is_empty) { + mutex_unlock(&cpuset_mutex); remove_tasks_in_empty_cpuset(cs); - - mutex_lock(&cpuset_mutex); + mutex_lock(&cpuset_mutex); + } } static void @@ -3691,6 +3753,7 @@ unlock: /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset + * @work: unused * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. 
The top_cpuset is always @@ -4073,6 +4136,7 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask) /** * cpuset_spread_node() - On which node to begin search for a page + * @rotor: round robin rotor * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index ae2f4dd47508..79a3717a5803 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -14,7 +14,7 @@ #include <linux/misc_cgroup.h> #define MAX_STR "max" -#define MAX_NUM ULONG_MAX +#define MAX_NUM U64_MAX /* Miscellaneous res name, keep it in sync with enum misc_res_type */ static const char *const misc_res_name[] = { @@ -37,7 +37,7 @@ static struct misc_cg root_cg; * more than the actual capacity. We are using Limits resource distribution * model of cgroup for miscellaneous controller. */ -static unsigned long misc_res_capacity[MISC_CG_RES_TYPES]; +static u64 misc_res_capacity[MISC_CG_RES_TYPES]; /** * parent_misc() - Get the parent of the passed misc cgroup. @@ -74,10 +74,10 @@ static inline bool valid_type(enum misc_res_type type) * Context: Any context. * Return: Current total usage of the resource. */ -unsigned long misc_cg_res_total_usage(enum misc_res_type type) +u64 misc_cg_res_total_usage(enum misc_res_type type) { if (valid_type(type)) - return atomic_long_read(&root_cg.res[type].usage); + return atomic64_read(&root_cg.res[type].usage); return 0; } @@ -95,7 +95,7 @@ EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); * * %0 - Successfully registered the capacity. * * %-EINVAL - If @type is invalid. */ -int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity) +int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { if (!valid_type(type)) return -EINVAL; @@ -114,9 +114,9 @@ EXPORT_SYMBOL_GPL(misc_cg_set_capacity); * Context: Any context. 
*/ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) + u64 amount) { - WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage), + WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage), "misc cgroup resource %s became less than 0", misc_res_name[type]); } @@ -137,13 +137,12 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, * * -EBUSY - If max limit will be crossed or total usage will be more than the * capacity. */ -int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i, *j; int ret; struct misc_res *res; - int new_usage; + u64 new_usage; if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type]))) return -EINVAL; @@ -154,7 +153,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, for (i = cg; i; i = parent_misc(i)) { res = &i->res[type]; - new_usage = atomic_long_add_return(amount, &res->usage); + new_usage = atomic64_add_return(amount, &res->usage); if (new_usage > READ_ONCE(res->max) || new_usage > READ_ONCE(misc_res_capacity[type])) { ret = -EBUSY; @@ -165,7 +164,7 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, err_charge: for (j = i; j; j = parent_misc(j)) { - atomic_long_inc(&j->res[type].events); + atomic64_inc(&j->res[type].events); cgroup_file_notify(&j->events_file); } @@ -184,8 +183,7 @@ EXPORT_SYMBOL_GPL(misc_cg_try_charge); * * Context: Any context. 
*/ -void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, - unsigned long amount) +void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount) { struct misc_cg *i; @@ -209,7 +207,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) { int i; struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long max; + u64 max; for (i = 0; i < MISC_CG_RES_TYPES; i++) { if (READ_ONCE(misc_res_capacity[i])) { @@ -217,7 +215,7 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) if (max == MAX_NUM) seq_printf(sf, "%s max\n", misc_res_name[i]); else - seq_printf(sf, "%s %lu\n", misc_res_name[i], + seq_printf(sf, "%s %llu\n", misc_res_name[i], max); } } @@ -241,13 +239,13 @@ static int misc_cg_max_show(struct seq_file *sf, void *v) * Return: * * >= 0 - Number of bytes processed in the input. * * -EINVAL - If buf is not valid. - * * -ERANGE - If number is bigger than the unsigned long capacity. + * * -ERANGE - If number is bigger than the u64 capacity. */ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct misc_cg *cg; - unsigned long max; + u64 max; int ret = 0, i; enum misc_res_type type = MISC_CG_RES_TYPES; char *token; @@ -271,7 +269,7 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, if (!strcmp(MAX_STR, buf)) { max = MAX_NUM; } else { - ret = kstrtoul(buf, 0, &max); + ret = kstrtou64(buf, 0, &max); if (ret) return ret; } @@ -297,13 +295,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, static int misc_cg_current_show(struct seq_file *sf, void *v) { int i; - unsigned long usage; + u64 usage; struct misc_cg *cg = css_misc(seq_css(sf)); for (i = 0; i < MISC_CG_RES_TYPES; i++) { - usage = atomic_long_read(&cg->res[i].usage); + usage = atomic64_read(&cg->res[i].usage); if (READ_ONCE(misc_res_capacity[i]) || usage) - seq_printf(sf, "%s %lu\n", misc_res_name[i], usage); + seq_printf(sf, "%s %llu\n", misc_res_name[i], usage); 
} return 0; @@ -322,12 +320,12 @@ static int misc_cg_current_show(struct seq_file *sf, void *v) static int misc_cg_capacity_show(struct seq_file *sf, void *v) { int i; - unsigned long cap; + u64 cap; for (i = 0; i < MISC_CG_RES_TYPES; i++) { cap = READ_ONCE(misc_res_capacity[i]); if (cap) - seq_printf(sf, "%s %lu\n", misc_res_name[i], cap); + seq_printf(sf, "%s %llu\n", misc_res_name[i], cap); } return 0; @@ -336,12 +334,13 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v) static int misc_events_show(struct seq_file *sf, void *v) { struct misc_cg *cg = css_misc(seq_css(sf)); - unsigned long events, i; + u64 events; + int i; for (i = 0; i < MISC_CG_RES_TYPES; i++) { - events = atomic_long_read(&cg->res[i].events); + events = atomic64_read(&cg->res[i].events); if (READ_ONCE(misc_res_capacity[i]) || events) - seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events); + seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events); } return 0; } @@ -397,7 +396,7 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css) for (i = 0; i < MISC_CG_RES_TYPES; i++) { WRITE_ONCE(cg->res[i].max, MAX_NUM); - atomic_long_set(&cg->res[i].usage, 0); + atomic64_set(&cg->res[i].usage, 0); } return &cg->css; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 0d5c29879a50..144a464e45c6 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -149,9 +149,3 @@ const struct proc_ns_operations cgroupns_operations = { .install = cgroupns_install, .owner = cgroupns_owner, }; - -static __init int cgroup_namespaces_init(void) -{ - return 0; -} -subsys_initcall(cgroup_namespaces_init); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 2542c21b6b6d..d80d7a608141 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -344,6 +344,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); + struct 
cgroup_rstat_cpu *prstatc; struct cgroup_base_stat delta; unsigned seq; @@ -357,17 +358,24 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) delta = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* propagate percpu delta to global */ + /* propagate per-cpu delta to cgroup and per-cpu global statistics */ cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); + cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); - /* propagate global delta to parent (unless that's root) */ + /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); + + delta = rstatc->subtree_bstat; + prstatc = cgroup_rstat_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); + cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); } } diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e8db8d938661..4722b998a324 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -1,3 +1,5 @@ +# Help: Debugging for CI systems and finding regressions +# # The config is based on running daily CI for enterprise Linux distros to # seek regressions on linux-next builds on different bare-metal and virtual # platforms. 
It can be used for example, diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 208481d91090..d0877063d925 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -1,3 +1,4 @@ +# Help: Bootable as a KVM guest CONFIG_NET=y CONFIG_NET_CORE=y CONFIG_NETDEVICES=y diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config index 81ff07863576..ebfdc3d8aa9a 100644 --- a/kernel/configs/nopm.config +++ b/kernel/configs/nopm.config @@ -1,3 +1,5 @@ +# Help: Disable Power Management + CONFIG_PM=n CONFIG_SUSPEND=n CONFIG_HIBERNATION=n diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config index 38a7c5362c9c..2c6e001a7284 100644 --- a/kernel/configs/rust.config +++ b/kernel/configs/rust.config @@ -1 +1,2 @@ +# Help: Enable Rust CONFIG_RUST=y diff --git a/kernel/configs/tiny-base.config b/kernel/configs/tiny-base.config index 2f0e6bf6db2c..ffb9dcafca26 100644 --- a/kernel/configs/tiny-base.config +++ b/kernel/configs/tiny-base.config @@ -1 +1 @@ -CONFIG_EMBEDDED=y +CONFIG_EXPERT=y diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config index 6fac5b405334..35f48671b8d5 100644 --- a/kernel/configs/x86_debug.config +++ b/kernel/configs/x86_debug.config @@ -1,3 +1,4 @@ +# Help: Debugging options for tip tree testing CONFIG_X86_DEBUG_FPU=y CONFIG_LOCK_STAT=y CONFIG_DEBUG_VM=y diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config index 436f806aa1ed..6878b9a49be8 100644 --- a/kernel/configs/xen.config +++ b/kernel/configs/xen.config @@ -1,3 +1,5 @@ +# Help: Bootable as a Xen guest +# # global stuff - these enable us to allow some # of the not so generic stuff below for xen CONFIG_PARAVIRT=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 88a7ede322bd..6de7c6bb74ee 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -592,7 +592,10 @@ static void lockdep_release_cpus_lock(void) void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT + enum cpuhp_smt_control 
cpu_smt_control __read_mostly = CPU_SMT_ENABLED; +static unsigned int cpu_smt_max_threads __ro_after_init; +unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX; void __init cpu_smt_disable(bool force) { @@ -606,16 +609,33 @@ void __init cpu_smt_disable(bool force) pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } + cpu_smt_num_threads = 1; } /* * The decision whether SMT is supported can only be done after the full * CPU identification. Called from architecture code. */ -void __init cpu_smt_check_topology(void) +void __init cpu_smt_set_num_threads(unsigned int num_threads, + unsigned int max_threads) { - if (!topology_smt_supported()) + WARN_ON(!num_threads || (num_threads > max_threads)); + + if (max_threads == 1) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; + + cpu_smt_max_threads = max_threads; + + /* + * If SMT has been disabled via the kernel command line or SMT is + * not supported, set cpu_smt_num_threads to 1 for consistency. + * If enabled, take the architecture requested number of threads + * to bring up into account. + */ + if (cpu_smt_control != CPU_SMT_ENABLED) + cpu_smt_num_threads = 1; + else if (num_threads < cpu_smt_num_threads) + cpu_smt_num_threads = num_threads; } static int __init smt_cmdline_disable(char *str) @@ -625,9 +645,23 @@ static int __init smt_cmdline_disable(char *str) } early_param("nosmt", smt_cmdline_disable); +/* + * For architectures supporting partial SMT states check if the thread is allowed. + * Otherwise this has already been checked through cpu_smt_max_threads when + * setting the SMT level. 
+ */ +static inline bool cpu_smt_thread_allowed(unsigned int cpu) +{ +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC + return topology_smt_thread_allowed(cpu); +#else + return true; +#endif +} + static inline bool cpu_smt_allowed(unsigned int cpu) { - if (cpu_smt_control == CPU_SMT_ENABLED) + if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; if (topology_is_primary_thread(cpu)) @@ -642,7 +676,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } -/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */ +/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */ bool cpu_smt_possible(void) { return cpu_smt_control != CPU_SMT_FORCE_DISABLED && @@ -650,22 +684,8 @@ bool cpu_smt_possible(void) } EXPORT_SYMBOL_GPL(cpu_smt_possible); -static inline bool cpuhp_smt_aware(void) -{ - return topology_smt_supported(); -} - -static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) -{ - return cpu_primary_thread_mask; -} #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } -static inline bool cpuhp_smt_aware(void) { return false; } -static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) -{ - return cpu_present_mask; -} #endif static inline enum cpuhp_state @@ -1467,8 +1487,22 @@ out: return ret; } +struct cpu_down_work { + unsigned int cpu; + enum cpuhp_state target; +}; + +static long __cpu_down_maps_locked(void *arg) +{ + struct cpu_down_work *work = arg; + + return _cpu_down(work->cpu, 0, work->target); +} + static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) { + struct cpu_down_work work = { .cpu = cpu, .target = target, }; + /* * If the platform does not support hotplug, report it explicitly to * differentiate it from a transient offlining failure. 
@@ -1477,7 +1511,15 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) return -EOPNOTSUPP; if (cpu_hotplug_disabled) return -EBUSY; - return _cpu_down(cpu, 0, target); + + /* + * Ensure that the control task does not run on the to be offlined + * CPU to prevent a deadlock against cfs_b->period_timer. + */ + cpu = cpumask_any_but(cpu_online_mask, cpu); + if (cpu >= nr_cpu_ids) + return -EBUSY; + return work_on_cpu(cpu, __cpu_down_maps_locked, &work); } static int cpu_down(unsigned int cpu, enum cpuhp_state target) @@ -1793,6 +1835,16 @@ static int __init parallel_bringup_parse_param(char *arg) } early_param("cpuhp.parallel", parallel_bringup_parse_param); +static inline bool cpuhp_smt_aware(void) +{ + return cpu_smt_max_threads > 1; +} + +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_primary_thread_mask; +} + /* * On architectures which have enabled parallel bringup this invokes all BP * prepare states for each of the to be onlined APs first. The last state @@ -2626,6 +2678,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) for_each_online_cpu(cpu) { if (topology_is_primary_thread(cpu)) continue; + /* + * Disable can be called with CPU_SMT_ENABLED when changing + * from a higher to lower number of SMT threads per core. 
+ */ + if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) + continue; ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (ret) break; @@ -2660,6 +2718,8 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; + if (!cpu_smt_thread_allowed(cpu)) + continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) break; @@ -2838,20 +2898,19 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT +static bool cpu_smt_num_threads_valid(unsigned int threads) +{ + if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC)) + return threads >= 1 && threads <= cpu_smt_max_threads; + return threads == 1 || threads == cpu_smt_max_threads; +} + static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int ctrlval, ret; - - if (sysfs_streq(buf, "on")) - ctrlval = CPU_SMT_ENABLED; - else if (sysfs_streq(buf, "off")) - ctrlval = CPU_SMT_DISABLED; - else if (sysfs_streq(buf, "forceoff")) - ctrlval = CPU_SMT_FORCE_DISABLED; - else - return -EINVAL; + int ctrlval, ret, num_threads, orig_threads; + bool force_off; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) return -EPERM; @@ -2859,21 +2918,39 @@ __store_smt_control(struct device *dev, struct device_attribute *attr, if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return -ENODEV; + if (sysfs_streq(buf, "on")) { + ctrlval = CPU_SMT_ENABLED; + num_threads = cpu_smt_max_threads; + } else if (sysfs_streq(buf, "off")) { + ctrlval = CPU_SMT_DISABLED; + num_threads = 1; + } else if (sysfs_streq(buf, "forceoff")) { + ctrlval = CPU_SMT_FORCE_DISABLED; + num_threads = 1; + } else if (kstrtoint(buf, 10, &num_threads) == 0) { + if (num_threads == 1) + ctrlval = CPU_SMT_DISABLED; + else if (cpu_smt_num_threads_valid(num_threads)) + ctrlval = CPU_SMT_ENABLED; + else + return -EINVAL; + } else { + return -EINVAL; + } + ret = lock_device_hotplug_sysfs(); if (ret) return 
ret; - if (ctrlval != cpu_smt_control) { - switch (ctrlval) { - case CPU_SMT_ENABLED: - ret = cpuhp_smt_enable(); - break; - case CPU_SMT_DISABLED: - case CPU_SMT_FORCE_DISABLED: - ret = cpuhp_smt_disable(ctrlval); - break; - } - } + orig_threads = cpu_smt_num_threads; + cpu_smt_num_threads = num_threads; + + force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED; + + if (num_threads > orig_threads) + ret = cpuhp_smt_enable(); + else if (num_threads < orig_threads || force_off) + ret = cpuhp_smt_disable(ctrlval); unlock_device_hotplug(); return ret ? ret : count; @@ -2901,6 +2978,17 @@ static ssize_t control_show(struct device *dev, { const char *state = smt_states[cpu_smt_control]; +#ifdef CONFIG_HOTPLUG_SMT + /* + * If SMT is enabled but not all threads are enabled then show the + * number of threads. If all threads are enabled show "on". Otherwise + * show the state name. + */ + if (cpu_smt_control == CPU_SMT_ENABLED && + cpu_smt_num_threads != cpu_smt_max_threads) + return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); +#endif + return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 90ce1dfd591c..03a7932cde0a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -10,6 +10,9 @@ #include <linux/utsname.h> #include <linux/vmalloc.h> #include <linux/sizes.h> +#include <linux/kexec.h> +#include <linux/memory.h> +#include <linux/cpuhotplug.h> #include <asm/page.h> #include <asm/sections.h> @@ -17,6 +20,10 @@ #include <crypto/sha1.h> #include "kallsyms_internal.h" +#include "kexec_internal.h" + +/* Per cpu memory for storing cpu states in case of system crash. 
*/ +note_buf_t __percpu *crash_notes; /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; @@ -314,6 +321,187 @@ static int __init parse_crashkernel_dummy(char *arg) } early_param("crashkernel", parse_crashkernel_dummy); +int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf; + unsigned int cpu, i; + unsigned long long notes_addr; + unsigned long mstart, mend; + + /* extra phdr for vmcoreinfo ELF note */ + nr_phdr = nr_cpus + 1; + nr_phdr += mem->nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two ELF headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. 
+ */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + ehdr = (Elf64_Ehdr *)buf; + phdr = (Elf64_Phdr *)(ehdr + 1); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each possible CPU */ + for_each_possible_cpu(cpu) { + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + phdr++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; + (ehdr->e_phnum)++; + phdr++; + + /* Prepare PT_LOAD type program header for kernel text region */ + if (need_kernel_map) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (unsigned long) _text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + ehdr->e_phnum++; + phdr++; + } + + /* Go through all the ranges in mem->ranges[] and prepare phdr */ + for (i = 0; i < mem->nr_ranges; i++) { + mstart = mem->ranges[i].start; + mend = mem->ranges[i].end; + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + 
phdr->p_align = 0; + ehdr->e_phnum++; + pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + phdr++; + } + + *addr = buf; + *sz = elf_sz; + return 0; +} + +int crash_exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end, p_start, p_end; + struct range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + p_start = mstart; + p_end = mend; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + p_start = start; + if (mend > end) + p_end = end; + + /* Found completely overlapping range */ + if (p_start == start && p_end == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + + /* + * Continue to check if there are another overlapping ranges + * from the current position because of shifting the above + * mem ranges. 
+ */ + i--; + mem->nr_ranges--; + continue; + } + mem->nr_ranges--; + return 0; + } + + if (p_start > start && p_end < end) { + /* Split original range */ + mem->ranges[i].end = p_start - 1; + temp_range.start = p_end + 1; + temp_range.end = end; + } else if (p_start != start) + mem->ranges[i].end = p_start - 1; + else + mem->ranges[i].start = p_end + 1; + break; + } + + /* If a split happened, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == mem->max_nr_ranges - 1) + return -ENOMEM; + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { @@ -455,8 +643,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(folio, _folio_dtor); - VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); @@ -490,7 +676,7 @@ static int __init crash_save_vmcoreinfo_init(void) #define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); + VMCOREINFO_NUMBER(PG_hugetlb); #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif @@ -515,3 +701,206 @@ static int __init crash_save_vmcoreinfo_init(void) } subsys_initcall(crash_save_vmcoreinfo_init); + +static int __init crash_notes_memory_init(void) +{ + /* Allocate memory for saving cpu registers. 
*/ + size_t size, align; + + /* + * crash_notes could be allocated across 2 vmalloc pages when percpu + * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc + * pages are also on 2 continuous physical pages. In this case the + * 2nd part of crash_notes in 2nd page could be lost since only the + * starting address and size of crash_notes are exported through sysfs. + * Here round up the size of crash_notes to the nearest power of two + * and pass it to __alloc_percpu as align value. This can make sure + * crash_notes is allocated inside one physical page. + */ + size = sizeof(note_buf_t); + align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE); + + /* + * Break compile if size is bigger than PAGE_SIZE since crash_notes + * definitely will be in 2 pages with that. + */ + BUILD_BUG_ON(size > PAGE_SIZE); + + crash_notes = __alloc_percpu(size, align); + if (!crash_notes) { + pr_warn("Memory allocation for saving cpu register states failed\n"); + return -ENOMEM; + } + return 0; +} +subsys_initcall(crash_notes_memory_init); + +#ifdef CONFIG_CRASH_HOTPLUG +#undef pr_fmt +#define pr_fmt(fmt) "crash hp: " fmt + +/* + * This routine utilized when the crash_hotplug sysfs node is read. + * It reflects the kernel's ability/permission to update the crash + * elfcorehdr directly. 
+ */ +int crash_check_update_elfcorehdr(void) +{ + int rc = 0; + + /* Obtain lock while reading crash information */ + if (!kexec_trylock()) { + pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n"); + return 0; + } + if (kexec_crash_image) { + if (kexec_crash_image->file_mode) + rc = 1; + else + rc = kexec_crash_image->update_elfcorehdr; + } + /* Release lock now that update complete */ + kexec_unlock(); + + return rc; +} + +/* + * To accurately reflect hot un/plug changes of cpu and memory resources + * (including onling and offlining of those resources), the elfcorehdr + * (which is passed to the crash kernel via the elfcorehdr= parameter) + * must be updated with the new list of CPUs and memories. + * + * In order to make changes to elfcorehdr, two conditions are needed: + * First, the segment containing the elfcorehdr must be large enough + * to permit a growing number of resources; the elfcorehdr memory size + * is based on NR_CPUS_DEFAULT and CRASH_MAX_MEMORY_RANGES. + * Second, purgatory must explicitly exclude the elfcorehdr from the + * list of segments it checks (since the elfcorehdr changes and thus + * would require an update to purgatory itself to update the digest). + */ +static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) +{ + struct kimage *image; + + /* Obtain lock while changing crash information */ + if (!kexec_trylock()) { + pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n"); + return; + } + + /* Check kdump is not loaded */ + if (!kexec_crash_image) + goto out; + + image = kexec_crash_image; + + /* Check that updating elfcorehdr is permitted */ + if (!(image->file_mode || image->update_elfcorehdr)) + goto out; + + if (hp_action == KEXEC_CRASH_HP_ADD_CPU || + hp_action == KEXEC_CRASH_HP_REMOVE_CPU) + pr_debug("hp_action %u, cpu %u\n", hp_action, cpu); + else + pr_debug("hp_action %u\n", hp_action); + + /* + * The elfcorehdr_index is set to -1 when the struct kimage + * is allocated. 
Find the segment containing the elfcorehdr, + * if not already found. + */ + if (image->elfcorehdr_index < 0) { + unsigned long mem; + unsigned char *ptr; + unsigned int n; + + for (n = 0; n < image->nr_segments; n++) { + mem = image->segment[n].mem; + ptr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT)); + if (ptr) { + /* The segment containing elfcorehdr */ + if (memcmp(ptr, ELFMAG, SELFMAG) == 0) + image->elfcorehdr_index = (int)n; + kunmap_local(ptr); + } + } + } + + if (image->elfcorehdr_index < 0) { + pr_err("unable to locate elfcorehdr segment"); + goto out; + } + + /* Needed in order for the segments to be updated */ + arch_kexec_unprotect_crashkres(); + + /* Differentiate between normal load and hotplug update */ + image->hp_action = hp_action; + + /* Now invoke arch-specific update handler */ + arch_crash_handle_hotplug_event(image); + + /* No longer handling a hotplug event */ + image->hp_action = KEXEC_CRASH_HP_NONE; + image->elfcorehdr_updated = true; + + /* Change back to read-only */ + arch_kexec_protect_crashkres(); + + /* Errors in the callback is not a reason to rollback state */ +out: + /* Release lock now that update complete */ + kexec_unlock(); +} + +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v) +{ + switch (val) { + case MEM_ONLINE: + crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, + KEXEC_CRASH_HP_INVALID_CPU); + break; + + case MEM_OFFLINE: + crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, + KEXEC_CRASH_HP_INVALID_CPU); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block crash_memhp_nb = { + .notifier_call = crash_memhp_notifier, + .priority = 0 +}; + +static int crash_cpuhp_online(unsigned int cpu) +{ + crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu); + return 0; +} + +static int crash_cpuhp_offline(unsigned int cpu) +{ + crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu); + return 0; +} + +static int __init crash_hotplug_init(void) +{ + int result = 
0; + + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + register_memory_notifier(&crash_memhp_nb); + + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { + result = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN, + "crash/cpuhp", crash_cpuhp_online, crash_cpuhp_offline); + } + + return result; +} + +subsys_initcall(crash_hotplug_init); +#endif diff --git a/kernel/cred.c b/kernel/cred.c index 811ad654abd1..98cb4eca23fb 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -4,6 +4,9 @@ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ + +#define pr_fmt(fmt) "CRED: " fmt + #include <linux/export.h> #include <linux/cred.h> #include <linux/slab.h> @@ -835,32 +838,32 @@ EXPORT_SYMBOL(creds_are_invalid); static void dump_invalid_creds(const struct cred *cred, const char *label, const struct task_struct *tsk) { - printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n", + pr_err("%s credentials: %p %s%s%s\n", label, cred, cred == &init_cred ? "[init]" : "", cred == tsk->real_cred ? "[real]" : "", cred == tsk->cred ? 
"[eff]" : ""); - printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n", + pr_err("->magic=%x, put_addr=%p\n", cred->magic, cred->put_addr); - printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n", + pr_err("->usage=%d, subscr=%d\n", atomic_read(&cred->usage), read_cred_subscribers(cred)); - printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", + pr_err("->*uid = { %d,%d,%d,%d }\n", from_kuid_munged(&init_user_ns, cred->uid), from_kuid_munged(&init_user_ns, cred->euid), from_kuid_munged(&init_user_ns, cred->suid), from_kuid_munged(&init_user_ns, cred->fsuid)); - printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", + pr_err("->*gid = { %d,%d,%d,%d }\n", from_kgid_munged(&init_user_ns, cred->gid), from_kgid_munged(&init_user_ns, cred->egid), from_kgid_munged(&init_user_ns, cred->sgid), from_kgid_munged(&init_user_ns, cred->fsgid)); #ifdef CONFIG_SECURITY - printk(KERN_ERR "CRED: ->security is %p\n", cred->security); + pr_err("->security is %p\n", cred->security); if ((unsigned long) cred->security >= PAGE_SIZE && (((unsigned long) cred->security & 0xffffff00) != (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) - printk(KERN_ERR "CRED: ->security {%x, %x}\n", + pr_err("->security {%x, %x}\n", ((u32*)cred->security)[0], ((u32*)cred->security)[1]); #endif @@ -871,8 +874,8 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, */ void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line) { - printk(KERN_ERR "CRED: Invalid credentials\n"); - printk(KERN_ERR "CRED: At %s:%u\n", file, line); + pr_err("Invalid credentials\n"); + pr_err("At %s:%u\n", file, line); dump_invalid_creds(cred, "Specified", current); BUG(); } @@ -898,14 +901,14 @@ void __validate_process_creds(struct task_struct *tsk, return; invalid_creds: - printk(KERN_ERR "CRED: Invalid process credentials\n"); - printk(KERN_ERR "CRED: At %s:%u\n", file, line); + pr_err("Invalid process credentials\n"); + pr_err("At %s:%u\n", file, line); 
dump_invalid_creds(tsk->real_cred, "Real", tsk); if (tsk->cred != tsk->real_cred) dump_invalid_creds(tsk->cred, "Effective", tsk); else - printk(KERN_ERR "CRED: Effective creds == Real creds\n"); + pr_err("Effective creds == Real creds\n"); BUG(); } EXPORT_SYMBOL(__validate_process_creds); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index d5e9ccde3ab8..621037a0aa87 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -968,7 +968,7 @@ static int __init opt_kgdb_con(char *str) early_param("kgdbcon", opt_kgdb_con); #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_dbg(int key) +static void sysrq_handle_dbg(u8 key) { if (!dbg_io_ops) { pr_crit("ERROR: No KGDB I/O module available\n"); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 813cb6cf72d6..9443bc63c5a2 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -590,6 +590,8 @@ static void kdb_msg_write(const char *msg, int msg_len) continue; if (c == dbg_io_ops->cons) continue; + if (!c->write) + continue; /* * Set oops_in_progress to encourage the console drivers to * disregard their internal spin locks: in the current calling diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 11d077003205..f488997b0717 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -90,6 +90,19 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +config SWIOTLB_DYNAMIC + bool "Dynamic allocation of DMA bounce buffers" + default n + depends on SWIOTLB + help + This enables dynamic resizing of the software IO TLB. The kernel + starts with one memory pool at boot and it will allocate additional + pools as needed. To reduce run-time kernel memory requirements, you + may have to specify a smaller size of the initial pool using + "swiotlb=" on the kernel command line. + + If unsure, say N. 
+ config DMA_BOUNCE_UNALIGNED_KMALLOC bool depends on SWIOTLB @@ -145,15 +158,16 @@ config DMA_CMA if DMA_CMA -config DMA_PERNUMA_CMA - bool "Enable separate DMA Contiguous Memory Area for each NUMA Node" - default NUMA && ARM64 +config DMA_NUMA_CMA + bool "Enable separate DMA Contiguous Memory Area for NUMA Node" + depends on NUMA help - Enable this option to get pernuma CMA areas so that devices like - ARM64 SMMU can get local memory by DMA coherent APIs. + Enable this option to get numa CMA areas so that NUMA devices + can get local memory by DMA coherent APIs. You can set the size of pernuma CMA by specifying "cma_pernuma=size" - on the kernel's command line. + or set the node id and its size of CMA by specifying "numa_cma= + <node>:size[,<node>:size]" on the kernel's command line. comment "Default contiguous memory area size:" diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 6ea80ae42622..f005c66f378c 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -50,6 +50,7 @@ #include <linux/sizes.h> #include <linux/dma-map-ops.h> #include <linux/cma.h> +#include <linux/nospec.h> #ifdef CONFIG_CMA_SIZE_MBYTES #define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES @@ -96,11 +97,44 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA +static struct cma *dma_contiguous_numa_area[MAX_NUMNODES]; +static phys_addr_t numa_cma_size[MAX_NUMNODES] __initdata; static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES]; static phys_addr_t pernuma_size_bytes __initdata; +static int __init early_numa_cma(char *p) +{ + int nid, count = 0; + unsigned long tmp; + char *s = p; + + while (*s) { + if (sscanf(s, "%lu%n", &tmp, &count) != 1) + break; + + if (s[count] == ':') { + if (tmp >= MAX_NUMNODES) + break; + nid = array_index_nospec(tmp, MAX_NUMNODES); + + s += count + 1; + tmp = memparse(s, &s); + numa_cma_size[nid] = tmp; + + if (*s == ',') + s++; + else + break; + } else + 
break; + } + + return 0; +} +early_param("numa_cma", early_numa_cma); + static int __init early_cma_pernuma(char *p) { pernuma_size_bytes = memparse(p, &p); @@ -127,32 +161,49 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) #endif -#ifdef CONFIG_DMA_PERNUMA_CMA -void __init dma_pernuma_cma_reserve(void) +#ifdef CONFIG_DMA_NUMA_CMA +static void __init dma_numa_cma_reserve(void) { int nid; - if (!pernuma_size_bytes) - return; - - for_each_online_node(nid) { + for_each_node(nid) { int ret; char name[CMA_MAX_NAME]; - struct cma **cma = &dma_contiguous_pernuma_area[nid]; - - snprintf(name, sizeof(name), "pernuma%d", nid); - ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, - 0, false, name, cma, nid); - if (ret) { - pr_warn("%s: reservation failed: err %d, node %d", __func__, - ret, nid); + struct cma **cma; + + if (!node_online(nid)) { + if (pernuma_size_bytes || numa_cma_size[nid]) + pr_warn("invalid node %d specified\n", nid); continue; } - pr_debug("%s: reserved %llu MiB on node %d\n", __func__, - (unsigned long long)pernuma_size_bytes / SZ_1M, nid); + if (pernuma_size_bytes) { + + cma = &dma_contiguous_pernuma_area[nid]; + snprintf(name, sizeof(name), "pernuma%d", nid); + ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, + 0, false, name, cma, nid); + if (ret) + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + } + + if (numa_cma_size[nid]) { + + cma = &dma_contiguous_numa_area[nid]; + snprintf(name, sizeof(name), "numa%d", nid); + ret = cma_declare_contiguous_nid(0, numa_cma_size[nid], 0, 0, 0, false, + name, cma, nid); + if (ret) + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + } } } +#else +static inline void __init dma_numa_cma_reserve(void) +{ +} #endif /** @@ -171,6 +222,8 @@ void __init dma_contiguous_reserve(phys_addr_t limit) phys_addr_t selected_limit = limit; bool fixed = false; + dma_numa_cma_reserve(); + pr_debug("%s(limit %08lx)\n", __func__, 
(unsigned long)limit); if (size_cmdline != -1) { @@ -303,7 +356,7 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA int nid = dev_to_node(dev); #endif @@ -315,7 +368,7 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return NULL; -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) { struct cma *cma = dma_contiguous_pernuma_area[nid]; struct page *page; @@ -325,6 +378,13 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) if (page) return page; } + + cma = dma_contiguous_numa_area[nid]; + if (cma) { + page = cma_alloc_aligned(cma, size, gfp); + if (page) + return page; + } } #endif if (!dma_contiguous_default_area) @@ -356,10 +416,13 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) /* * otherwise, page is from either per-numa cma or default cma */ -#ifdef CONFIG_DMA_PERNUMA_CMA +#ifdef CONFIG_DMA_NUMA_CMA if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)], page, count)) return; + if (cma_release(dma_contiguous_numa_area[page_to_nid(page)], + page, count)) + return; #endif if (cma_release(dma_contiguous_default_area, page, count)) return; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index f190651bcadd..06366acd27b0 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -637,15 +637,19 @@ static struct dma_debug_entry *__dma_entry_alloc(void) return entry; } -static void __dma_entry_alloc_check_leak(void) +/* + * This should be called outside of free_entries_lock scope to avoid potential + * deadlocks with serial consoles that use DMA. 
+ */ +static void __dma_entry_alloc_check_leak(u32 nr_entries) { - u32 tmp = nr_total_entries % nr_prealloc_entries; + u32 tmp = nr_entries % nr_prealloc_entries; /* Shout each time we tick over some multiple of the initial pool */ if (tmp < DMA_DEBUG_DYNAMIC_ENTRIES) { pr_info("dma_debug_entry pool grown to %u (%u00%%)\n", - nr_total_entries, - (nr_total_entries / nr_prealloc_entries)); + nr_entries, + (nr_entries / nr_prealloc_entries)); } } @@ -656,8 +660,10 @@ static void __dma_entry_alloc_check_leak(void) */ static struct dma_debug_entry *dma_entry_alloc(void) { + bool alloc_check_leak = false; struct dma_debug_entry *entry; unsigned long flags; + u32 nr_entries; spin_lock_irqsave(&free_entries_lock, flags); if (num_free_entries == 0) { @@ -667,13 +673,17 @@ static struct dma_debug_entry *dma_entry_alloc(void) pr_err("debugging out of memory - disabling\n"); return NULL; } - __dma_entry_alloc_check_leak(); + alloc_check_leak = true; + nr_entries = nr_total_entries; } entry = __dma_entry_alloc(); spin_unlock_irqrestore(&free_entries_lock, flags); + if (alloc_check_leak) + __dma_entry_alloc_check_leak(nr_entries); + #ifdef CONFIG_STACKTRACE entry->stack_len = stack_trace_save(entry->stack_entries, ARRAY_SIZE(entry->stack_entries), diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index d29cade048db..9596ae1aa0da 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -66,7 +66,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) return 0; } -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { dma_addr_t dma_addr = phys_to_dma_direct(dev, phys); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9a4db5cce600..e323ca48f7f2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -760,12 +760,6 @@ bool dma_pci_p2pdma_supported(struct device *dev) } EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported); -#ifdef 
CONFIG_ARCH_HAS_DMA_SET_MASK -void arch_dma_set_mask(struct device *dev, u64 mask); -#else -#define arch_dma_set_mask(dev, mask) do { } while (0) -#endif - int dma_set_mask(struct device *dev, u64 mask) { /* diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 1acec2e22827..b481c48a31a6 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -135,9 +135,9 @@ encrypt_mapping: remove_mapping: #ifdef CONFIG_DMA_DIRECT_REMAP dma_common_free_remap(addr, pool_size); -#endif -free_page: __maybe_unused +free_page: __free_pages(page, order); +#endif out: return ret; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2b83e3ad9dca..394494a6b1f3 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -35,6 +35,7 @@ #include <linux/memblock.h> #include <linux/mm.h> #include <linux/pfn.h> +#include <linux/rculist.h> #include <linux/scatterlist.h> #include <linux/set_memory.h> #include <linux/spinlock.h> @@ -62,6 +63,13 @@ #define INVALID_PHYS_ADDR (~(phys_addr_t)0) +/** + * struct io_tlb_slot - IO TLB slot descriptor + * @orig_addr: The original address corresponding to a mapped entry. + * @alloc_size: Size of the allocated buffer. + * @list: The free list describing the number of free entries available + * from each index. 
+ */ struct io_tlb_slot { phys_addr_t orig_addr; size_t alloc_size; @@ -71,7 +79,22 @@ struct io_tlb_slot { static bool swiotlb_force_bounce; static bool swiotlb_force_disable; -struct io_tlb_mem io_tlb_default_mem; +#ifdef CONFIG_SWIOTLB_DYNAMIC + +static void swiotlb_dyn_alloc(struct work_struct *work); + +static struct io_tlb_mem io_tlb_default_mem = { + .lock = __SPIN_LOCK_UNLOCKED(io_tlb_default_mem.lock), + .pools = LIST_HEAD_INIT(io_tlb_default_mem.pools), + .dyn_alloc = __WORK_INITIALIZER(io_tlb_default_mem.dyn_alloc, + swiotlb_dyn_alloc), +}; + +#else /* !CONFIG_SWIOTLB_DYNAMIC */ + +static struct io_tlb_mem io_tlb_default_mem; + +#endif /* CONFIG_SWIOTLB_DYNAMIC */ static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; static unsigned long default_nareas; @@ -202,7 +225,7 @@ void __init swiotlb_adjust_size(unsigned long size) void swiotlb_print_info(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; if (!mem->nslabs) { pr_warn("No low mem\n"); @@ -231,7 +254,7 @@ static inline unsigned long nr_slots(u64 val) */ void __init swiotlb_update_mem_attributes(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long bytes; if (!mem->nslabs || mem->late_alloc) @@ -240,9 +263,8 @@ void __init swiotlb_update_mem_attributes(void) set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT); } -static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, - unsigned long nslabs, unsigned int flags, - bool late_alloc, unsigned int nareas) +static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start, + unsigned long nslabs, bool late_alloc, unsigned int nareas) { void *vaddr = phys_to_virt(start); unsigned long bytes = nslabs << IO_TLB_SHIFT, i; @@ -254,8 +276,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->nareas = nareas; 
mem->area_nslabs = nslabs / mem->nareas; - mem->force_bounce = swiotlb_force_bounce || (flags & SWIOTLB_FORCE); - for (i = 0; i < mem->nareas; i++) { spin_lock_init(&mem->areas[i].lock); mem->areas[i].index = 0; @@ -273,6 +293,23 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, return; } +/** + * add_mem_pool() - add a memory pool to the allocator + * @mem: Software IO TLB allocator. + * @pool: Memory pool to be added. + */ +static void add_mem_pool(struct io_tlb_mem *mem, struct io_tlb_pool *pool) +{ +#ifdef CONFIG_SWIOTLB_DYNAMIC + spin_lock(&mem->lock); + list_add_rcu(&pool->node, &mem->pools); + mem->nslabs += pool->nslabs; + spin_unlock(&mem->lock); +#else + mem->nslabs = pool->nslabs; +#endif +} + static void __init *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) @@ -312,7 +349,7 @@ static void __init *swiotlb_memblock_alloc(unsigned long nslabs, void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, int (*remap)(void *tlb, unsigned long nslabs)) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs; unsigned int nareas; size_t alloc_size; @@ -323,6 +360,18 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, if (swiotlb_force_disable) return; + io_tlb_default_mem.force_bounce = + swiotlb_force_bounce || (flags & SWIOTLB_FORCE); + +#ifdef CONFIG_SWIOTLB_DYNAMIC + if (!remap) + io_tlb_default_mem.can_grow = true; + if (flags & SWIOTLB_ANY) + io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); + else + io_tlb_default_mem.phys_limit = ARCH_LOW_ADDRESS_LIMIT; +#endif + if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); @@ -356,8 +405,9 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, return; } - swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false, - default_nareas); + 
swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, + default_nareas); + add_mem_pool(&io_tlb_default_mem, mem); if (flags & SWIOTLB_VERBOSE) swiotlb_print_info(); @@ -376,7 +426,7 @@ void __init swiotlb_init(bool addressing_limit, unsigned int flags) int swiotlb_init_late(size_t size, gfp_t gfp_mask, int (*remap)(void *tlb, unsigned long nslabs)) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); unsigned int nareas; unsigned char *vstart = NULL; @@ -384,9 +434,25 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, bool retried = false; int rc = 0; + if (io_tlb_default_mem.nslabs) + return 0; + if (swiotlb_force_disable) return 0; + io_tlb_default_mem.force_bounce = swiotlb_force_bounce; + +#ifdef CONFIG_SWIOTLB_DYNAMIC + if (!remap) + io_tlb_default_mem.can_grow = true; + if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) + io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); + else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) + io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); + else + io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); +#endif + if (!default_nareas) swiotlb_adjust_nareas(num_possible_cpus()); @@ -438,8 +504,9 @@ retry: set_memory_decrypted((unsigned long)vstart, (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, virt_to_phys(vstart), nslabs, 0, true, - nareas); + swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true, + nareas); + add_mem_pool(&io_tlb_default_mem, mem); swiotlb_print_info(); return 0; @@ -453,7 +520,7 @@ error_area: void __init swiotlb_exit(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_pool *mem = &io_tlb_default_mem.defpool; unsigned long tbl_vaddr; size_t tbl_size, slots_size; unsigned int area_order; @@ -486,6 +553,265 @@ void __init swiotlb_exit(void) memset(mem, 0, sizeof(*mem)); } +#ifdef 
CONFIG_SWIOTLB_DYNAMIC + +/** + * alloc_dma_pages() - allocate pages to be used for DMA + * @gfp: GFP flags for the allocation. + * @bytes: Size of the buffer. + * + * Allocate pages from the buddy allocator. If successful, make the allocated + * pages decrypted that they can be used for DMA. + * + * Return: Decrypted pages, or %NULL on failure. + */ +static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes) +{ + unsigned int order = get_order(bytes); + struct page *page; + void *vaddr; + + page = alloc_pages(gfp, order); + if (!page) + return NULL; + + vaddr = page_address(page); + if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes))) + goto error; + return page; + +error: + __free_pages(page, order); + return NULL; +} + +/** + * swiotlb_alloc_tlb() - allocate a dynamic IO TLB buffer + * @dev: Device for which a memory pool is allocated. + * @bytes: Size of the buffer. + * @phys_limit: Maximum allowed physical address of the buffer. + * @gfp: GFP flags for the allocation. + * + * Return: Allocated pages, or %NULL on allocation failure. + */ +static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, + u64 phys_limit, gfp_t gfp) +{ + struct page *page; + + /* + * Allocate from the atomic pools if memory is encrypted and + * the allocation is atomic, because decrypting may block. 
+ */ + if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) { + void *vaddr; + + if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) + return NULL; + + return dma_alloc_from_pool(dev, bytes, &vaddr, gfp, + dma_coherent_ok); + } + + gfp &= ~GFP_ZONEMASK; + if (phys_limit <= DMA_BIT_MASK(zone_dma_bits)) + gfp |= __GFP_DMA; + else if (phys_limit <= DMA_BIT_MASK(32)) + gfp |= __GFP_DMA32; + + while ((page = alloc_dma_pages(gfp, bytes)) && + page_to_phys(page) + bytes - 1 > phys_limit) { + /* allocated, but too high */ + __free_pages(page, get_order(bytes)); + + if (IS_ENABLED(CONFIG_ZONE_DMA32) && + phys_limit < DMA_BIT_MASK(64) && + !(gfp & (__GFP_DMA32 | __GFP_DMA))) + gfp |= __GFP_DMA32; + else if (IS_ENABLED(CONFIG_ZONE_DMA) && + !(gfp & __GFP_DMA)) + gfp = (gfp & ~__GFP_DMA32) | __GFP_DMA; + else + return NULL; + } + + return page; +} + +/** + * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer + * @vaddr: Virtual address of the buffer. + * @bytes: Size of the buffer. + */ +static void swiotlb_free_tlb(void *vaddr, size_t bytes) +{ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + dma_free_from_pool(NULL, vaddr, bytes)) + return; + + /* Intentional leak if pages cannot be encrypted again. */ + if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes))) + __free_pages(virt_to_page(vaddr), get_order(bytes)); +} + +/** + * swiotlb_alloc_pool() - allocate a new IO TLB memory pool + * @dev: Device for which a memory pool is allocated. + * @minslabs: Minimum number of slabs. + * @nslabs: Desired (maximum) number of slabs. + * @nareas: Number of areas. + * @phys_limit: Maximum DMA buffer physical address. + * @gfp: GFP flags for the allocations. + * + * Allocate and initialize a new IO TLB memory pool. The actual number of + * slabs may be reduced if allocation of @nslabs fails. If even + * @minslabs cannot be allocated, this function fails. + * + * Return: New memory pool, or %NULL on allocation failure. 
+ */ +static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, + unsigned long minslabs, unsigned long nslabs, + unsigned int nareas, u64 phys_limit, gfp_t gfp) +{ + struct io_tlb_pool *pool; + unsigned int slot_order; + struct page *tlb; + size_t pool_size; + size_t tlb_size; + + pool_size = sizeof(*pool) + array_size(sizeof(*pool->areas), nareas); + pool = kzalloc(pool_size, gfp); + if (!pool) + goto error; + pool->areas = (void *)pool + sizeof(*pool); + + tlb_size = nslabs << IO_TLB_SHIFT; + while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) { + if (nslabs <= minslabs) + goto error_tlb; + nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); + nareas = limit_nareas(nareas, nslabs); + tlb_size = nslabs << IO_TLB_SHIFT; + } + + slot_order = get_order(array_size(sizeof(*pool->slots), nslabs)); + pool->slots = (struct io_tlb_slot *) + __get_free_pages(gfp, slot_order); + if (!pool->slots) + goto error_slots; + + swiotlb_init_io_tlb_pool(pool, page_to_phys(tlb), nslabs, true, nareas); + return pool; + +error_slots: + swiotlb_free_tlb(page_address(tlb), tlb_size); +error_tlb: + kfree(pool); +error: + return NULL; +} + +/** + * swiotlb_dyn_alloc() - dynamic memory pool allocation worker + * @work: Pointer to dyn_alloc in struct io_tlb_mem. + */ +static void swiotlb_dyn_alloc(struct work_struct *work) +{ + struct io_tlb_mem *mem = + container_of(work, struct io_tlb_mem, dyn_alloc); + struct io_tlb_pool *pool; + + pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs, + default_nareas, mem->phys_limit, GFP_KERNEL); + if (!pool) { + pr_warn_ratelimited("Failed to allocate new pool"); + return; + } + + add_mem_pool(mem, pool); + + /* Pairs with smp_rmb() in is_swiotlb_buffer(). */ + smp_wmb(); +} + +/** + * swiotlb_dyn_free() - RCU callback to free a memory pool + * @rcu: RCU head in the corresponding struct io_tlb_pool. 
+ */ +static void swiotlb_dyn_free(struct rcu_head *rcu) +{ + struct io_tlb_pool *pool = container_of(rcu, struct io_tlb_pool, rcu); + size_t slots_size = array_size(sizeof(*pool->slots), pool->nslabs); + size_t tlb_size = pool->end - pool->start; + + free_pages((unsigned long)pool->slots, get_order(slots_size)); + swiotlb_free_tlb(pool->vaddr, tlb_size); + kfree(pool); +} + +/** + * swiotlb_find_pool() - find the IO TLB pool for a physical address + * @dev: Device which has mapped the DMA buffer. + * @paddr: Physical address within the DMA buffer. + * + * Find the IO TLB memory pool descriptor which contains the given physical + * address, if any. + * + * Return: Memory pool which contains @paddr, or %NULL if none. + */ +struct io_tlb_pool *swiotlb_find_pool(struct device *dev, phys_addr_t paddr) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_pool *pool; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &mem->pools, node) { + if (paddr >= pool->start && paddr < pool->end) + goto out; + } + + list_for_each_entry_rcu(pool, &dev->dma_io_tlb_pools, node) { + if (paddr >= pool->start && paddr < pool->end) + goto out; + } + pool = NULL; +out: + rcu_read_unlock(); + return pool; +} + +/** + * swiotlb_del_pool() - remove an IO TLB pool from a device + * @dev: Owning device. + * @pool: Memory pool to be removed. + */ +static void swiotlb_del_pool(struct device *dev, struct io_tlb_pool *pool) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); + list_del_rcu(&pool->node); + spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); + + call_rcu(&pool->rcu, swiotlb_dyn_free); +} + +#endif /* CONFIG_SWIOTLB_DYNAMIC */ + +/** + * swiotlb_dev_init() - initialize swiotlb fields in &struct device + * @dev: Device to be initialized. 
+ */ +void swiotlb_dev_init(struct device *dev) +{ + dev->dma_io_tlb_mem = &io_tlb_default_mem; +#ifdef CONFIG_SWIOTLB_DYNAMIC + INIT_LIST_HEAD(&dev->dma_io_tlb_pools); + spin_lock_init(&dev->dma_io_tlb_lock); + dev->dma_uses_io_tlb = false; +#endif +} + /* * Return the offset into a iotlb slot required to keep the device happy. */ @@ -500,7 +826,7 @@ static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; @@ -577,12 +903,10 @@ static inline phys_addr_t slot_addr(phys_addr_t start, phys_addr_t idx) */ static inline unsigned long get_max_slots(unsigned long boundary_mask) { - if (boundary_mask == ~0UL) - return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); - return nr_slots(boundary_mask + 1); + return (boundary_mask >> IO_TLB_SHIFT) + 1; } -static unsigned int wrap_area_index(struct io_tlb_mem *mem, unsigned int index) +static unsigned int wrap_area_index(struct io_tlb_pool *mem, unsigned int index) { if (index >= mem->area_nslabs) return 0; @@ -623,19 +947,30 @@ static void dec_used(struct io_tlb_mem *mem, unsigned int nslots) } #endif /* CONFIG_DEBUG_FS */ -/* - * Find a suitable number of IO TLB entries size that will fit this request and - * allocate a buffer from that IO TLB pool. +/** + * swiotlb_area_find_slots() - search for slots in one IO TLB memory area + * @dev: Device which maps the buffer. + * @pool: Memory pool to be searched. + * @area_index: Index of the IO TLB memory area to be searched. + * @orig_addr: Original (non-bounced) IO buffer address. + * @alloc_size: Total requested size of the bounce buffer, + * including initial alignment padding. 
+ * @alloc_align_mask: Required alignment of the allocated buffer. + * + * Find a suitable sequence of IO TLB entries for the request and allocate + * a buffer from the given IO TLB memory area. + * This function takes care of locking. + * + * Return: Index of the first allocated slot, or -1 on error. */ -static int swiotlb_do_find_slots(struct device *dev, int area_index, - phys_addr_t orig_addr, size_t alloc_size, +static int swiotlb_area_find_slots(struct device *dev, struct io_tlb_pool *pool, + int area_index, phys_addr_t orig_addr, size_t alloc_size, unsigned int alloc_align_mask) { - struct io_tlb_mem *mem = dev->dma_io_tlb_mem; - struct io_tlb_area *area = mem->areas + area_index; + struct io_tlb_area *area = pool->areas + area_index; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = - phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; + phys_to_dma_unencrypted(dev, pool->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev) | alloc_align_mask; @@ -647,7 +982,7 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index, unsigned int slot_index; BUG_ON(!nslots); - BUG_ON(area_index >= mem->nareas); + BUG_ON(area_index >= pool->nareas); /* * For allocations of PAGE_SIZE or larger only look for page aligned @@ -664,35 +999,30 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index, stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; spin_lock_irqsave(&area->lock, flags); - if (unlikely(nslots > mem->area_nslabs - area->used)) + if (unlikely(nslots > pool->area_nslabs - area->used)) goto not_found; - slot_base = area_index * mem->area_nslabs; + slot_base = area_index * pool->area_nslabs; index = area->index; - for (slots_checked = 0; slots_checked < mem->area_nslabs; ) { + for (slots_checked = 0; slots_checked < pool->area_nslabs; ) { slot_index = slot_base + index; if (orig_addr && (slot_addr(tbl_dma_addr, 
slot_index) & iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { - index = wrap_area_index(mem, index + 1); + index = wrap_area_index(pool, index + 1); slots_checked++; continue; } - /* - * If we find a slot that indicates we have 'nslots' number of - * contiguous buffers, we allocate the buffers from that slot - * and mark the entries as '0' indicating unavailable. - */ if (!iommu_is_span_boundary(slot_index, nslots, nr_slots(tbl_dma_addr), max_slots)) { - if (mem->slots[slot_index].list >= nslots) + if (pool->slots[slot_index].list >= nslots) goto found; } - index = wrap_area_index(mem, index + stride); + index = wrap_area_index(pool, index + stride); slots_checked += stride; } @@ -701,48 +1031,159 @@ not_found: return -1; found: + /* + * If we find a slot that indicates we have 'nslots' number of + * contiguous buffers, we allocate the buffers from that slot onwards + * and set the list of free entries to '0' indicating unavailable. + */ for (i = slot_index; i < slot_index + nslots; i++) { - mem->slots[i].list = 0; - mem->slots[i].alloc_size = alloc_size - (offset + + pool->slots[i].list = 0; + pool->slots[i].alloc_size = alloc_size - (offset + ((i - slot_index) << IO_TLB_SHIFT)); } for (i = slot_index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - mem->slots[i].list; i--) - mem->slots[i].list = ++count; + pool->slots[i].list; i--) + pool->slots[i].list = ++count; /* * Update the indices to avoid searching in the next round. */ - area->index = wrap_area_index(mem, index + nslots); + area->index = wrap_area_index(pool, index + nslots); area->used += nslots; spin_unlock_irqrestore(&area->lock, flags); - inc_used_and_hiwater(mem, nslots); + inc_used_and_hiwater(dev->dma_io_tlb_mem, nslots); return slot_index; } -static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, - size_t alloc_size, unsigned int alloc_align_mask) +/** + * swiotlb_pool_find_slots() - search for slots in one memory pool + * @dev: Device which maps the buffer. 
+ * @pool: Memory pool to be searched. + * @orig_addr: Original (non-bounced) IO buffer address. + * @alloc_size: Total requested size of the bounce buffer, + * including initial alignment padding. + * @alloc_align_mask: Required alignment of the allocated buffer. + * + * Search through one memory pool to find a sequence of slots that match the + * allocation constraints. + * + * Return: Index of the first allocated slot, or -1 on error. + */ +static int swiotlb_pool_find_slots(struct device *dev, struct io_tlb_pool *pool, + phys_addr_t orig_addr, size_t alloc_size, + unsigned int alloc_align_mask) { - struct io_tlb_mem *mem = dev->dma_io_tlb_mem; - int start = raw_smp_processor_id() & (mem->nareas - 1); + int start = raw_smp_processor_id() & (pool->nareas - 1); int i = start, index; do { - index = swiotlb_do_find_slots(dev, i, orig_addr, alloc_size, - alloc_align_mask); + index = swiotlb_area_find_slots(dev, pool, i, orig_addr, + alloc_size, alloc_align_mask); if (index >= 0) return index; - if (++i >= mem->nareas) + if (++i >= pool->nareas) i = 0; } while (i != start); return -1; } +#ifdef CONFIG_SWIOTLB_DYNAMIC + +/** + * swiotlb_find_slots() - search for slots in the whole swiotlb + * @dev: Device which maps the buffer. + * @orig_addr: Original (non-bounced) IO buffer address. + * @alloc_size: Total requested size of the bounce buffer, + * including initial alignment padding. + * @alloc_align_mask: Required alignment of the allocated buffer. + * @retpool: Used memory pool, updated on return. + * + * Search through the whole software IO TLB to find a sequence of slots that + * match the allocation constraints. + * + * Return: Index of the first allocated slot, or -1 on error. 
+ */ +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size, unsigned int alloc_align_mask, + struct io_tlb_pool **retpool) +{ + struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_pool *pool; + unsigned long nslabs; + unsigned long flags; + u64 phys_limit; + int index; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &mem->pools, node) { + index = swiotlb_pool_find_slots(dev, pool, orig_addr, + alloc_size, alloc_align_mask); + if (index >= 0) { + rcu_read_unlock(); + goto found; + } + } + rcu_read_unlock(); + if (!mem->can_grow) + return -1; + + schedule_work(&mem->dyn_alloc); + + nslabs = nr_slots(alloc_size); + phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit); + pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit, + GFP_NOWAIT | __GFP_NOWARN); + if (!pool) + return -1; + + index = swiotlb_pool_find_slots(dev, pool, orig_addr, + alloc_size, alloc_align_mask); + if (index < 0) { + swiotlb_dyn_free(&pool->rcu); + return -1; + } + + pool->transient = true; + spin_lock_irqsave(&dev->dma_io_tlb_lock, flags); + list_add_rcu(&pool->node, &dev->dma_io_tlb_pools); + spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); + +found: + dev->dma_uses_io_tlb = true; + /* Pairs with smp_rmb() in is_swiotlb_buffer() */ + smp_wmb(); + + *retpool = pool; + return index; +} + +#else /* !CONFIG_SWIOTLB_DYNAMIC */ + +static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size, unsigned int alloc_align_mask, + struct io_tlb_pool **retpool) +{ + *retpool = &dev->dma_io_tlb_mem->defpool; + return swiotlb_pool_find_slots(dev, *retpool, + orig_addr, alloc_size, alloc_align_mask); +} + +#endif /* CONFIG_SWIOTLB_DYNAMIC */ + #ifdef CONFIG_DEBUG_FS +/** + * mem_used() - get number of used slots in an allocator + * @mem: Software IO TLB allocator. + * + * The result is accurate in this version of the function, because an atomic + * counter is available if CONFIG_DEBUG_FS is set. 
+ * + * Return: Number of used slots. + */ static unsigned long mem_used(struct io_tlb_mem *mem) { return atomic_long_read(&mem->total_used); @@ -750,14 +1191,48 @@ static unsigned long mem_used(struct io_tlb_mem *mem) #else /* !CONFIG_DEBUG_FS */ -static unsigned long mem_used(struct io_tlb_mem *mem) +/** + * mem_pool_used() - get number of used slots in a memory pool + * @pool: Software IO TLB memory pool. + * + * The result is not accurate, see mem_used(). + * + * Return: Approximate number of used slots. + */ +static unsigned long mem_pool_used(struct io_tlb_pool *pool) { int i; unsigned long used = 0; - for (i = 0; i < mem->nareas; i++) - used += mem->areas[i].used; + for (i = 0; i < pool->nareas; i++) + used += pool->areas[i].used; + return used; +} + +/** + * mem_used() - get number of used slots in an allocator + * @mem: Software IO TLB allocator. + * + * The result is not accurate, because there is no locking of individual + * areas. + * + * Return: Approximate number of used slots. 
+ */ +static unsigned long mem_used(struct io_tlb_mem *mem) +{ +#ifdef CONFIG_SWIOTLB_DYNAMIC + struct io_tlb_pool *pool; + unsigned long used = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &mem->pools, node) + used += mem_pool_used(pool); + rcu_read_unlock(); + return used; +#else + return mem_pool_used(&mem->defpool); +#endif } #endif /* CONFIG_DEBUG_FS */ @@ -769,6 +1244,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); + struct io_tlb_pool *pool; unsigned int i; int index; phys_addr_t tlb_addr; @@ -789,7 +1265,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, } index = swiotlb_find_slots(dev, orig_addr, - alloc_size + offset, alloc_align_mask); + alloc_size + offset, alloc_align_mask, &pool); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -804,8 +1280,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * needed. 
*/ for (i = 0; i < nr_slots(alloc_size + offset); i++) - mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); - tlb_addr = slot_addr(mem->start, index) + offset; + pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); + tlb_addr = slot_addr(pool->start, index) + offset; /* * When dir == DMA_FROM_DEVICE we could omit the copy from the orig * to the tlb buffer, if we knew for sure the device will @@ -819,7 +1295,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) { - struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr); unsigned long flags; unsigned int offset = swiotlb_align_offset(dev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; @@ -863,9 +1339,44 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr) area->used -= nslots; spin_unlock_irqrestore(&area->lock, flags); - dec_used(mem, nslots); + dec_used(dev->dma_io_tlb_mem, nslots); +} + +#ifdef CONFIG_SWIOTLB_DYNAMIC + +/** + * swiotlb_del_transient() - delete a transient memory pool + * @dev: Device which mapped the buffer. + * @tlb_addr: Physical address within a bounce buffer. + * + * Check whether the address belongs to a transient SWIOTLB memory pool. + * If yes, then delete the pool. + * + * Return: %true if @tlb_addr belonged to a transient pool that was released. 
+ */ +static bool swiotlb_del_transient(struct device *dev, phys_addr_t tlb_addr) +{ + struct io_tlb_pool *pool; + + pool = swiotlb_find_pool(dev, tlb_addr); + if (!pool->transient) + return false; + + dec_used(dev->dma_io_tlb_mem, pool->nslabs); + swiotlb_del_pool(dev, pool); + return true; } +#else /* !CONFIG_SWIOTLB_DYNAMIC */ + +static inline bool swiotlb_del_transient(struct device *dev, + phys_addr_t tlb_addr) +{ + return false; +} + +#endif /* CONFIG_SWIOTLB_DYNAMIC */ + /* * tlb_addr is the physical address of the bounce buffer to unmap. */ @@ -880,6 +1391,8 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE); + if (swiotlb_del_transient(dev, tlb_addr)) + return; swiotlb_release_slots(dev, tlb_addr); } @@ -950,13 +1463,47 @@ size_t swiotlb_max_mapping_size(struct device *dev) return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE - min_align; } +/** + * is_swiotlb_allocated() - check if the default software IO TLB is initialized + */ +bool is_swiotlb_allocated(void) +{ + return io_tlb_default_mem.nslabs; +} + bool is_swiotlb_active(struct device *dev) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; return mem && mem->nslabs; } -EXPORT_SYMBOL_GPL(is_swiotlb_active); + +/** + * default_swiotlb_base() - get the base address of the default SWIOTLB + * + * Get the lowest physical address used by the default software IO TLB pool. + */ +phys_addr_t default_swiotlb_base(void) +{ +#ifdef CONFIG_SWIOTLB_DYNAMIC + io_tlb_default_mem.can_grow = false; +#endif + return io_tlb_default_mem.defpool.start; +} + +/** + * default_swiotlb_limit() - get the address limit of the default SWIOTLB + * + * Get the highest physical address used by the default software IO TLB pool. 
+ */ +phys_addr_t default_swiotlb_limit(void) +{ +#ifdef CONFIG_SWIOTLB_DYNAMIC + return io_tlb_default_mem.phys_limit; +#else + return io_tlb_default_mem.defpool.end - 1; +#endif +} #ifdef CONFIG_DEBUG_FS @@ -1031,17 +1578,18 @@ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; + struct io_tlb_pool *pool; phys_addr_t tlb_addr; int index; if (!mem) return NULL; - index = swiotlb_find_slots(dev, 0, size, 0); + index = swiotlb_find_slots(dev, 0, size, 0, &pool); if (index == -1) return NULL; - tlb_addr = slot_addr(mem->start, index); + tlb_addr = slot_addr(pool->start, index); return pfn_to_page(PFN_DOWN(tlb_addr)); } @@ -1078,29 +1626,37 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, * to it. */ if (!mem) { + struct io_tlb_pool *pool; + mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; + pool = &mem->defpool; - mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL); - if (!mem->slots) { + pool->slots = kcalloc(nslabs, sizeof(*pool->slots), GFP_KERNEL); + if (!pool->slots) { kfree(mem); return -ENOMEM; } - mem->areas = kcalloc(nareas, sizeof(*mem->areas), + pool->areas = kcalloc(nareas, sizeof(*pool->areas), GFP_KERNEL); - if (!mem->areas) { - kfree(mem->slots); + if (!pool->areas) { + kfree(pool->slots); kfree(mem); return -ENOMEM; } set_memory_decrypted((unsigned long)phys_to_virt(rmem->base), rmem->size >> PAGE_SHIFT); - swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, SWIOTLB_FORCE, - false, nareas); + swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs, + false, nareas); + mem->force_bounce = true; mem->for_alloc = true; +#ifdef CONFIG_SWIOTLB_DYNAMIC + spin_lock_init(&mem->lock); +#endif + add_mem_pool(mem, pool); rmem->priv = mem; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index be61332c66b5..d7ee4bc3f2ba 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -205,8 
+205,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) arch_exit_to_user_mode_prepare(regs, ti_work); - /* Ensure that the address limit is intact and no locks are held */ - addr_limit_user_check(); + /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); diff --git a/kernel/events/core.c b/kernel/events/core.c index 78ae7b6f90fd..4c72a41f11af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8249,7 +8249,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strlcpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -8631,7 +8631,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; @@ -8678,33 +8678,22 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } else { - if (vma->vm_ops && vma->vm_ops->name) { + if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); - if (name) - goto cpy_name; + if (!name) + name = (char *)arch_vma_name(vma); + if (!name) { + if (vma_is_initial_heap(vma)) + name = "[heap]"; + else if (vma_is_initial_stack(vma)) + name = "[stack]"; + else + name = "//anon"; } - - name = (char *)arch_vma_name(vma); - if (name) - goto cpy_name; - - if (vma->vm_start <= vma->vm_mm->start_brk && - vma->vm_end >= vma->vm_mm->brk) { - name = "[heap]"; - goto cpy_name; - } - if (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack) { - name = "[stack]"; - goto cpy_name; - } - - name = "//anon"; - goto cpy_name; } cpy_name: - strlcpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name, sizeof(tmp)); name = tmp; got_name: /* @@ 
-9128,7 +9117,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; - strlcpy(name, sym, KSYM_NAME_LEN); + strscpy(name, sym, KSYM_NAME_LEN); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; @@ -9595,16 +9584,16 @@ u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; -again: - old = val = local64_read(&hwc->period_left); - if (val < 0) - return 0; + old = local64_read(&hwc->period_left); + do { + val = old; + if (val < 0) + return 0; - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (local64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + } while (!local64_try_cmpxchg(&hwc->period_left, &old, val)); return nr; } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c3797701339c..6c2cb4e4f48d 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -523,26 +523,6 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int we return 0; } -__weak int arch_reserve_bp_slot(struct perf_event *bp) -{ - return 0; -} - -__weak void arch_release_bp_slot(struct perf_event *bp) -{ -} - -/* - * Function to perform processor-specific cleanup during unregistration - */ -__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) -{ - /* - * A weak stub function here for those archs that don't define - * it inside arch/.../kernel/hw_breakpoint.c - */ -} - /* * Constraints to check before allowing this new breakpoint counter. 
* @@ -594,7 +574,6 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) enum bp_type_idx type; int max_pinned_slots; int weight; - int ret; /* We couldn't initialize breakpoint constraints on boot */ if (!constraints_initialized) @@ -613,10 +592,6 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; - ret = arch_reserve_bp_slot(bp); - if (ret) - return ret; - return toggle_bp_slot(bp, true, type, weight); } @@ -634,8 +609,6 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) enum bp_type_idx type; int weight; - arch_release_bp_slot(bp); - type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); WARN_ON(toggle_bp_slot(bp, false, type, weight)); @@ -645,7 +618,6 @@ void release_bp_slot(struct perf_event *bp) { struct mutex *mtx = bp_constraints_lock(bp); - arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp, bp->attr.bp_type); bp_constraints_unlock(mtx); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a0433f37b024..fb1e180b5f0a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -191,9 +191,10 @@ __perf_output_begin(struct perf_output_handle *handle, perf_output_get_handle(handle); + offset = local_read(&rb->head); do { + head = offset; tail = READ_ONCE(rb->user_page->data_tail); - offset = head = local_read(&rb->head); if (!rb->overwrite) { if (unlikely(!ring_buffer_has_space(head, tail, perf_data_size(rb), @@ -217,7 +218,7 @@ __perf_output_begin(struct perf_output_handle *handle, head += size; else head -= size; - } while (local_cmpxchg(&rb->head, offset, head) != offset); + } while (!local_try_cmpxchg(&rb->head, &offset, head)); if (backward) { offset = head; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f0ac5b874919..3048589e2e85 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -193,7 +193,7 @@ static int __replace_page(struct 
vm_area_struct *vma, unsigned long addr, } flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); - ptep_clear_flush_notify(vma, addr, pvmw.pte); + ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); diff --git a/kernel/fork.c b/kernel/fork.c index d2e12b6d2b18..3b6d20dfb9a8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm) */ void __mmdrop(struct mm_struct *mm) { - int i; - BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); @@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm) put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); + percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); - for (i = 0; i < NR_MM_COUNTERS; i++) - percpu_counter_destroy(&mm->rss_stat[i]); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -985,6 +982,14 @@ void __put_task_struct(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(__put_task_struct); +void __put_task_struct_rcu_cb(struct rcu_head *rhp) +{ + struct task_struct *task = container_of(rhp, struct task_struct, rcu); + + __put_task_struct(task); +} +EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); + void __init __weak arch_task_cache_init(void) { } /* @@ -1252,8 +1257,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { - int i; - mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); @@ -1301,17 +1304,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_cid(mm)) goto fail_cid; - for (i = 0; i < NR_MM_COUNTERS; i++) - if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) - goto fail_pcpu; + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); 
lru_gen_init_mm(mm); return mm; fail_pcpu: - while (i > 0) - percpu_counter_destroy(&mm->rss_stat[--i]); mm_destroy_cid(mm); fail_cid: destroy_context(mm); @@ -1396,8 +1397,8 @@ EXPORT_SYMBOL_GPL(mmput_async); * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main users are mmput() and sys_execve(). Callers prevent concurrent - * invocations: in mmput() nobody alive left, in execve task is single - * threaded. + * invocations: in mmput() nobody alive left, in execve it happens before + * the new mm is made visible to anyone. * * Can only fail if new_exe_file != NULL. */ @@ -1432,9 +1433,7 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) /** * replace_mm_exe_file - replace a reference to the mm's executable file * - * This changes mm's executable file (shown as symlink /proc/[pid]/exe), - * dealing with concurrent invocation and without grabbing the mmap lock in - * write mode. + * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE). */ @@ -1464,22 +1463,20 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) return ret; } - /* set the new file, lockless */ ret = deny_write_access(new_exe_file); if (ret) return -EACCES; get_file(new_exe_file); - old_exe_file = xchg(&mm->exe_file, new_exe_file); + /* set the new file */ + mmap_write_lock(mm); + old_exe_file = rcu_dereference_raw(mm->exe_file); + rcu_assign_pointer(mm->exe_file, new_exe_file); + mmap_write_unlock(mm); + if (old_exe_file) { - /* - * Don't race with dup_mmap() getting the file and disallowing - * write access while someone might open the file writable. 
- */ - mmap_read_lock(mm); allow_write_access(old_exe_file); fput(old_exe_file); - mmap_read_unlock(mm); } return 0; } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index adf7e2c1c8f4..d1d7b3c175a4 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1131,8 +1131,7 @@ static int __init futex_init(void) #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, - futex_hashsize < 256 ? HASH_SMALL : 0, + futex_hashsize, 0, 0, &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 16f8ecc7d882..ccd02afaeffb 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -3,4 +3,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o +CFLAGS_gcc_base.o += -Wno-missing-prototypes -Wno-missing-declarations obj-$(CONFIG_CC_IS_CLANG) += clang.o +CFLAGS_clang.o += -Wno-missing-prototypes -Wno-missing-declarations diff --git a/kernel/iomem.c b/kernel/iomem.c index 62c92e43aa0d..dc2120776e1c 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -3,19 +3,16 @@ #include <linux/types.h> #include <linux/io.h> #include <linux/mm.h> - -#ifndef ioremap_cache -/* temporary while we convert existing ioremap_cache users to memremap */ -__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) -{ - return ioremap(offset, size); -} -#endif +#include <linux/ioremap.h> #ifndef arch_memremap_wb static void *arch_memremap_wb(resource_size_t offset, unsigned long size) { +#ifdef ioremap_cache return (__force void *)ioremap_cache(offset, size); +#else + return (__force void *)ioremap(offset, size); +#endif } #endif diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ee8c0acf39df..dc94e0bf2c94 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -473,11 +473,12 @@ void handle_nested_irq(unsigned int irq) action = desc->action; if 
(unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; - goto out_unlock; + raw_spin_unlock_irq(&desc->lock); + return; } kstat_incr_irqs_this_cpu(desc); - irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + atomic_inc(&desc->threads_active); raw_spin_unlock_irq(&desc->lock); action_ret = IRQ_NONE; @@ -487,11 +488,7 @@ void handle_nested_irq(unsigned int irq) if (!irq_settings_no_debug(desc)) note_interrupt(desc, action_ret); - raw_spin_lock_irq(&desc->lock); - irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - -out_unlock: - raw_spin_unlock_irq(&desc->lock); + wake_threads_waitq(desc); } EXPORT_SYMBOL_GPL(handle_nested_irq); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index bdd35bb9c735..bcc7f21db9ee 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -108,8 +108,6 @@ extern int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); -extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); - irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -121,6 +119,8 @@ void irq_resend_init(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); +void wake_threads_waitq(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d2742af0f0fd..d309ba84e08a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -108,6 +108,16 @@ bool synchronize_hardirq(unsigned int irq) } EXPORT_SYMBOL(synchronize_hardirq); +static void __synchronize_irq(struct irq_desc *desc) +{ + __synchronize_hardirq(desc, true); + /* + * We made sure that no hardirq handler is 
running. Now verify that no + * threaded handlers are active. + */ + wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); +} + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -127,16 +137,8 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) { - __synchronize_hardirq(desc, true); - /* - * We made sure that no hardirq handler is - * running. Now verify that no threaded handlers are - * active. - */ - wait_event(desc->wait_for_threads, - !atomic_read(&desc->threads_active)); - } + if (desc) + __synchronize_irq(desc); } EXPORT_SYMBOL(synchronize_irq); @@ -1216,7 +1218,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, return ret; } -static void wake_threads_waitq(struct irq_desc *desc) +void wake_threads_waitq(struct irq_desc *desc) { if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); @@ -1944,7 +1946,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * supports it also make sure that there is no (not yet serviced) * interrupt in flight at the hardware level. 
*/ - __synchronize_hardirq(desc, true); + __synchronize_irq(desc); #ifdef CONFIG_DEBUG_SHIRQ /* diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index edec335c0a7a..5f2c66860ac6 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -68,11 +68,16 @@ static int irq_sw_resend(struct irq_desc *desc) */ if (!desc->parent_irq) return -EINVAL; + + desc = irq_to_desc(desc->parent_irq); + if (!desc) + return -EINVAL; } /* Add to resend_list and activate the softirq: */ raw_spin_lock(&irq_resend_lock); - hlist_add_head(&desc->resend_node, &irq_resend_list); + if (hlist_unhashed(&desc->resend_node)) + hlist_add_head(&desc->resend_node, &irq_resend_list); raw_spin_unlock(&irq_resend_lock); tasklet_schedule(&resend_tasklet); return 0; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 016d997131d4..18edd57b5fe8 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -163,12 +163,12 @@ unsigned long kallsyms_sym_address(int idx) return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } -static bool cleanup_symbol_name(char *s) +static void cleanup_symbol_name(char *s) { char *res; if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return false; + return; /* * LLVM appends various suffixes for local functions and variables that @@ -178,26 +178,21 @@ static bool cleanup_symbol_name(char *s) * - foo.llvm.[0-9a-f]+ */ res = strstr(s, ".llvm."); - if (res) { + if (res) *res = '\0'; - return true; - } - return false; + return; } static int compare_symbol_name(const char *name, char *namebuf) { - int ret; - - ret = strcmp(name, namebuf); - if (!ret) - return ret; - - if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf)) - return 0; - - return ret; + /* The kallsyms_seqs_of_names is sorted based on names after + * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled. + * To ensure correct bisection in kallsyms_lookup_names(), do + * cleanup_symbol_name(namebuf) before comparing name and namebuf. 
+ */ + cleanup_symbol_name(namebuf); + return strcmp(name, namebuf); } static unsigned int get_symbol_seq(int index) diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index a2e3745d15c4..b4cac76ea5e9 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -196,7 +196,7 @@ static bool match_cleanup_name(const char *s, const char *name) if (!IS_ENABLED(CONFIG_LTO_CLANG)) return false; - p = strchr(s, '.'); + p = strstr(s, ".llvm."); if (!p) return false; @@ -341,30 +341,10 @@ static int test_kallsyms_basic_function(void) ret = lookup_symbol_name(addr, namebuf); if (unlikely(ret)) { namebuf[0] = 0; + pr_info("%d: lookup_symbol_name(%lx) failed\n", i, addr); goto failed; } - /* - * The first '.' may be the initial letter, in which case the - * entire symbol name will be truncated to an empty string in - * cleanup_symbol_name(). Do not test these symbols. - * - * For example: - * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head - * .E_read_words - * .E_leading_bytes - * .E_trailing_bytes - * .E_write_words - * .E_copy - * .str.292.llvm.12122243386960820698 - * .str.24.llvm.12122243386960820698 - * .str.29.llvm.12122243386960820698 - * .str.75.llvm.12122243386960820698 - * .str.99.llvm.12122243386960820698 - */ - if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0]) - continue; - lookup_addr = kallsyms_lookup_name(namebuf); memset(stat, 0, sizeof(*stat)); @@ -388,8 +368,11 @@ static int test_kallsyms_basic_function(void) if (stat->addr != stat2->addr || stat->real_cnt != stat2->real_cnt || memcmp(stat->addrs, stat2->addrs, - stat->save_cnt * sizeof(stat->addrs[0]))) + stat->save_cnt * sizeof(stat->addrs[0]))) { + pr_info("%s: mismatch between kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()\n", + namebuf); goto failed; + } /* * The average of random increments is 128, that is, one of @@ -400,15 +383,23 @@ static int test_kallsyms_basic_function(void) } /* Need to be found at least once */ - if (!stat->real_cnt) + 
if (!stat->real_cnt) { + pr_info("%s: Never found\n", namebuf); goto failed; + } /* * kallsyms_lookup_name() returns the address of the first * symbol found and cannot be NULL. */ - if (!lookup_addr || lookup_addr != stat->addrs[0]) + if (!lookup_addr) { + pr_info("%s: NULL lookup_addr?!\n", namebuf); + goto failed; + } + if (lookup_addr != stat->addrs[0]) { + pr_info("%s: lookup_addr != stat->addrs[0]\n", namebuf); goto failed; + } /* * If the addresses of all matching symbols are recorded, the @@ -420,8 +411,10 @@ static int test_kallsyms_basic_function(void) break; } - if (j == stat->save_cnt) + if (j == stat->save_cnt) { + pr_info("%s: j == save_cnt?!\n", namebuf); goto failed; + } } } diff --git a/kernel/kexec.c b/kernel/kexec.c index 92d301f98776..107f355eac10 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -129,6 +129,11 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; +#ifdef CONFIG_CRASH_HOTPLUG + if (flags & KEXEC_UPDATE_ELFCOREHDR) + image->update_elfcorehdr = 1; +#endif + ret = machine_kexec_prepare(image); if (ret) goto out; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index e2f2574d8b74..9dc728982d79 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -49,9 +49,6 @@ atomic_t __kexec_lock = ATOMIC_INIT(0); -/* Per cpu memory for storing cpu states in case of system crash. 
*/ -note_buf_t __percpu *crash_notes; - /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; @@ -277,6 +274,12 @@ struct kimage *do_kimage_alloc_init(void) /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unusable_pages); +#ifdef CONFIG_CRASH_HOTPLUG + image->hp_action = KEXEC_CRASH_HP_NONE; + image->elfcorehdr_index = -1; + image->elfcorehdr_updated = false; +#endif + return image; } @@ -1218,40 +1221,6 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) final_note(buf); } -static int __init crash_notes_memory_init(void) -{ - /* Allocate memory for saving cpu registers. */ - size_t size, align; - - /* - * crash_notes could be allocated across 2 vmalloc pages when percpu - * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc - * pages are also on 2 continuous physical pages. In this case the - * 2nd part of crash_notes in 2nd page could be lost since only the - * starting address and size of crash_notes are exported through sysfs. - * Here round up the size of crash_notes to the nearest power of two - * and pass it to __alloc_percpu as align value. This can make sure - * crash_notes is allocated inside one physical page. - */ - size = sizeof(note_buf_t); - align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE); - - /* - * Break compile if size is bigger than PAGE_SIZE since crash_notes - * definitely will be in 2 pages with that. - */ - BUILD_BUG_ON(size > PAGE_SIZE); - - crash_notes = __alloc_percpu(size, align); - if (!crash_notes) { - pr_warn("Memory allocation for saving cpu register states failed\n"); - return -ENOMEM; - } - return 0; -} -subsys_initcall(crash_notes_memory_init); - - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 881ba0d1714c..f9a419cd22d4 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -624,7 +624,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. * - * This function assumes that kexec_mutex is held. + * This function assumes that kexec_lock is held. * On successful return, @kbuf->mem will have the physical address of * the buffer in memory. * @@ -685,7 +685,7 @@ static int kexec_calculate_store_digests(struct kimage *image) struct kexec_sha_region *sha_regions; struct purgatory_info *pi = &image->purgatory_info; - if (!IS_ENABLED(CONFIG_ARCH_HAS_KEXEC_PURGATORY)) + if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY)) return 0; zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); @@ -726,6 +726,12 @@ static int kexec_calculate_store_digests(struct kimage *image) for (j = i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; +#ifdef CONFIG_CRASH_HOTPLUG + /* Exclude elfcorehdr segment to allow future changes via hotplug */ + if (j == image->elfcorehdr_index) + continue; +#endif + ksegment = &image->segment[i]; /* * Skip purgatory as it will be modified once we put digest @@ -790,7 +796,7 @@ out: return ret; } -#ifdef CONFIG_ARCH_HAS_KEXEC_PURGATORY +#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY /* * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. * @pi: Purgatory to be loaded. 
@@ -1150,185 +1156,4 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } -#endif /* CONFIG_ARCH_HAS_KEXEC_PURGATORY */ - -int crash_exclude_mem_range(struct crash_mem *mem, - unsigned long long mstart, unsigned long long mend) -{ - int i, j; - unsigned long long start, end, p_start, p_end; - struct range temp_range = {0, 0}; - - for (i = 0; i < mem->nr_ranges; i++) { - start = mem->ranges[i].start; - end = mem->ranges[i].end; - p_start = mstart; - p_end = mend; - - if (mstart > end || mend < start) - continue; - - /* Truncate any area outside of range */ - if (mstart < start) - p_start = start; - if (mend > end) - p_end = end; - - /* Found completely overlapping range */ - if (p_start == start && p_end == end) { - mem->ranges[i].start = 0; - mem->ranges[i].end = 0; - if (i < mem->nr_ranges - 1) { - /* Shift rest of the ranges to left */ - for (j = i; j < mem->nr_ranges - 1; j++) { - mem->ranges[j].start = - mem->ranges[j+1].start; - mem->ranges[j].end = - mem->ranges[j+1].end; - } - - /* - * Continue to check if there are another overlapping ranges - * from the current position because of shifting the above - * mem ranges. 
- */ - i--; - mem->nr_ranges--; - continue; - } - mem->nr_ranges--; - return 0; - } - - if (p_start > start && p_end < end) { - /* Split original range */ - mem->ranges[i].end = p_start - 1; - temp_range.start = p_end + 1; - temp_range.end = end; - } else if (p_start != start) - mem->ranges[i].end = p_start - 1; - else - mem->ranges[i].start = p_end + 1; - break; - } - - /* If a split happened, add the split to array */ - if (!temp_range.end) - return 0; - - /* Split happened */ - if (i == mem->max_nr_ranges - 1) - return -ENOMEM; - - /* Location where new range should go */ - j = i + 1; - if (j < mem->nr_ranges) { - /* Move over all ranges one slot towards the end */ - for (i = mem->nr_ranges - 1; i >= j; i--) - mem->ranges[i + 1] = mem->ranges[i]; - } - - mem->ranges[j].start = temp_range.start; - mem->ranges[j].end = temp_range.end; - mem->nr_ranges++; - return 0; -} - -int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, - void **addr, unsigned long *sz) -{ - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; - unsigned char *buf; - unsigned int cpu, i; - unsigned long long notes_addr; - unsigned long mstart, mend; - - /* extra phdr for vmcoreinfo ELF note */ - nr_phdr = nr_cpus + 1; - nr_phdr += mem->nr_ranges; - - /* - * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping - * area (for example, ffffffff80000000 - ffffffffa0000000 on x86_64). - * I think this is required by tools like gdb. So same physical - * memory will be mapped in two ELF headers. One will contain kernel - * text virtual addresses and other will have __va(physical) addresses. 
- */ - - nr_phdr++; - elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); - elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); - - buf = vzalloc(elf_sz); - if (!buf) - return -ENOMEM; - - ehdr = (Elf64_Ehdr *)buf; - phdr = (Elf64_Phdr *)(ehdr + 1); - memcpy(ehdr->e_ident, ELFMAG, SELFMAG); - ehdr->e_ident[EI_CLASS] = ELFCLASS64; - ehdr->e_ident[EI_DATA] = ELFDATA2LSB; - ehdr->e_ident[EI_VERSION] = EV_CURRENT; - ehdr->e_ident[EI_OSABI] = ELF_OSABI; - memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); - ehdr->e_type = ET_CORE; - ehdr->e_machine = ELF_ARCH; - ehdr->e_version = EV_CURRENT; - ehdr->e_phoff = sizeof(Elf64_Ehdr); - ehdr->e_ehsize = sizeof(Elf64_Ehdr); - ehdr->e_phentsize = sizeof(Elf64_Phdr); - - /* Prepare one phdr of type PT_NOTE for each present CPU */ - for_each_present_cpu(cpu) { - phdr->p_type = PT_NOTE; - notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); - phdr->p_offset = phdr->p_paddr = notes_addr; - phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); - (ehdr->e_phnum)++; - phdr++; - } - - /* Prepare one PT_NOTE header for vmcoreinfo */ - phdr->p_type = PT_NOTE; - phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; - (ehdr->e_phnum)++; - phdr++; - - /* Prepare PT_LOAD type program header for kernel text region */ - if (need_kernel_map) { - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_vaddr = (unsigned long) _text; - phdr->p_filesz = phdr->p_memsz = _end - _text; - phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); - ehdr->e_phnum++; - phdr++; - } - - /* Go through all the ranges in mem->ranges[] and prepare phdr */ - for (i = 0; i < mem->nr_ranges; i++) { - mstart = mem->ranges[i].start; - mend = mem->ranges[i].end; - - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = mstart; - - phdr->p_paddr = mstart; - phdr->p_vaddr = (unsigned long) __va(mstart); - phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; - 
phdr->p_align = 0; - ehdr->e_phnum++; - pr_debug("Crash PT_LOAD ELF header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", - phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, - ehdr->e_phnum, phdr->p_offset); - phdr++; - } - - *addr = buf; - *sz = elf_sz; - return 0; -} +#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fc6095d502d..0c6185aefaef 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1545,6 +1545,17 @@ static int check_ftrace_location(struct kprobe *p) return 0; } +static bool is_cfi_preamble_symbol(unsigned long addr) +{ + char symbuf[KSYM_NAME_LEN]; + + if (lookup_symbol_name(addr, symbuf)) + return false; + + return str_has_prefix("__cfi_", symbuf) || + str_has_prefix("__pfx_", symbuf); +} + static int check_kprobe_address_safe(struct kprobe *p, struct module **probed_mod) { @@ -1563,7 +1574,8 @@ static int check_kprobe_address_safe(struct kprobe *p, within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || - find_bug((unsigned long)p->addr)) { + find_bug((unsigned long)p->addr) || + is_cfi_preamble_symbol((unsigned long)p->addr)) { ret = -EINVAL; goto out; } @@ -2220,8 +2232,7 @@ int register_kretprobe(struct kretprobe *rp) return -ENOMEM; for (i = 0; i < rp->maxactive; i++) { - inst = kzalloc(sizeof(struct kretprobe_instance) + - rp->data_size, GFP_KERNEL); + inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL); if (inst == NULL) { rethook_free(rp->rh); rp->rh = NULL; @@ -2244,8 +2255,7 @@ int register_kretprobe(struct kretprobe *rp) rp->rph->rp = rp; for (i = 0; i < rp->maxactive; i++) { - inst = kzalloc(sizeof(struct kretprobe_instance) + - rp->data_size, GFP_KERNEL); + inst = kzalloc(struct_size(inst, data, rp->data_size), GFP_KERNEL); if (inst == NULL) { refcount_set(&rp->rph->ref, i); free_rp_inst(rp); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c 
index aad7a3bfd846..1d4bc493b2f4 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -165,6 +165,18 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); +#ifdef CONFIG_CRASH_HOTPLUG +static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int sz = crash_get_elfcorehdr_size(); + + return sysfs_emit(buf, "%u\n", sz); +} +KERNEL_ATTR_RO(crash_elfcorehdr_size); + +#endif + #endif /* CONFIG_CRASH_CORE */ /* whether file capabilities are enabled */ @@ -255,6 +267,9 @@ static struct attribute * kernel_attrs[] = { #endif #ifdef CONFIG_CRASH_CORE &vmcoreinfo_attr.attr, +#ifdef CONFIG_CRASH_HOTPLUG + &crash_elfcorehdr_size_attr.attr, +#endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, diff --git a/kernel/kthread.c b/kernel/kthread.c index 4fff7df17a68..1eea53050bab 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -159,11 +159,10 @@ bool kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); -bool __kthread_should_park(struct task_struct *k) +static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } -EXPORT_SYMBOL_GPL(__kthread_should_park); /** * kthread_should_park - should this kthread park now? diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 111607d91489..e85b5ad3e206 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -819,34 +819,26 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ -/* - * Check if an address is part of freed initmem. After initmem is freed, - * memory can be allocated from it, and such allocations would then have - * addresses within the range [_stext, _end]. 
- */ -#ifndef arch_is_kernel_initmem_freed -static int arch_is_kernel_initmem_freed(unsigned long addr) -{ - if (system_state < SYSTEM_FREEING_INITMEM) - return 0; - - return init_section_contains((void *)addr, 1); -} -#endif - static int static_obj(const void *obj) { - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; + unsigned long addr = (unsigned long) obj; - if (arch_is_kernel_initmem_freed(addr)) - return 0; + if (is_kernel_core_data(addr)) + return 1; + + /* + * keys are allowed in the __ro_after_init section. + */ + if (is_kernel_rodata(addr)) + return 1; /* - * static variable? + * in initdata section and used during bootup only? + * NOTE: On some platforms the initdata section is + * outside of the _stext ... _end range. */ - if ((addr >= start) && (addr < end)) + if (system_state < SYSTEM_FREEING_INITMEM && + init_section_contains((void *)addr, 1)) return 1; /* diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 949d3deae506..270c7f80ce84 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -45,6 +45,7 @@ torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(int, rt_boost, 2, "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); +torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" 
errors */ @@ -809,7 +810,8 @@ static int lock_torture_writer(void *arg) bool skip_main_lock; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); - set_user_nice(current, MAX_NICE); + if (!rt_task(current)) + set_user_nice(current, MAX_NICE); do { if ((torture_random(&rand) & 0xfffff) == 0) @@ -1015,8 +1017,7 @@ static void lock_torture_cleanup(void) if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) - torture_stop_kthread(lock_torture_writer, - writer_tasks[i]); + torture_stop_kthread(lock_torture_writer, writer_tasks[i]); kfree(writer_tasks); writer_tasks = NULL; } @@ -1244,8 +1245,9 @@ static int __init lock_torture_init(void) goto create_reader; /* Create writer. */ - firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], - writer_tasks[i]); + firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i], + writer_tasks[i], + writer_fifo ? sched_set_fifo : NULL); if (torture_init_error(firsterr)) goto unwind; diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 6afc249ce697..6a0184e9c234 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -486,6 +486,16 @@ gotlock: } /* + * Include the architecture specific callee-save thunk of the + * __pv_queued_spin_unlock(). This thunk is put together with + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined. + */ +#include <asm/qspinlock_paravirt.h> + +/* * PV versions of the unlock fastpath and slowpath functions to be used * instead of queued_spin_unlock(). */ @@ -533,16 +543,6 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) pv_kick(node->cpu); } -/* - * Include the architecture specific callee-save thunk of the - * __pv_queued_spin_unlock(). 
This thunk is put together with - * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock - * function close to each other sharing consecutive instruction cachelines. - * Alternatively, architecture specific version of __pv_queued_spin_unlock() - * can be defined. - */ -#include <asm/qspinlock_paravirt.h> - #ifndef __pv_queued_spin_unlock __visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 728f434de2bb..21db0df0eb00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -333,21 +333,43 @@ static __always_inline int __waiter_prio(struct task_struct *task) return prio; } +/* + * Update the waiter->tree copy of the sort keys. + */ static __always_inline void waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) { - waiter->prio = __waiter_prio(task); - waiter->deadline = task->dl.deadline; + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry)); + + waiter->tree.prio = __waiter_prio(task); + waiter->tree.deadline = task->dl.deadline; +} + +/* + * Update the waiter->pi_tree copy of the sort keys (from the tree copy). 
+ */ +static __always_inline void +waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert_held(&task->pi_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry)); + + waiter->pi_tree.prio = waiter->tree.prio; + waiter->pi_tree.deadline = waiter->tree.deadline; } /* - * Only use with rt_mutex_waiter_{less,equal}() + * Only use with rt_waiter_node_{less,equal}() */ +#define task_to_waiter_node(p) \ + &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) } -static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio < right->prio) return 1; @@ -364,8 +386,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, return 0; } -static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio != right->prio) return 0; @@ -385,7 +407,7 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *top_waiter) { - if (rt_mutex_waiter_less(waiter, top_waiter)) + if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) return true; #ifdef RT_MUTEX_BUILD_SPINLOCKS @@ -393,30 +415,30 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, * Note that RT tasks are excluded from same priority (lateral) * steals to prevent the introduction of an unbounded latency. 
*/ - if (rt_prio(waiter->prio) || dl_prio(waiter->prio)) + if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) return false; - return rt_mutex_waiter_equal(waiter, top_waiter); + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); #else return false; #endif } #define __node_2_waiter(node) \ - rb_entry((node), struct rt_mutex_waiter, tree_entry) + rb_entry((node), struct rt_mutex_waiter, tree.entry) static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) { struct rt_mutex_waiter *aw = __node_2_waiter(a); struct rt_mutex_waiter *bw = __node_2_waiter(b); - if (rt_mutex_waiter_less(aw, bw)) + if (rt_waiter_node_less(&aw->tree, &bw->tree)) return 1; if (!build_ww_mutex()) return 0; - if (rt_mutex_waiter_less(bw, aw)) + if (rt_waiter_node_less(&bw->tree, &aw->tree)) return 0; /* NOTE: relies on waiter->ww_ctx being set before insertion */ @@ -434,48 +456,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod static __always_inline void rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); + lockdep_assert_held(&lock->wait_lock); + + rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less); } static __always_inline void rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->tree_entry)) + lockdep_assert_held(&lock->wait_lock); + + if (RB_EMPTY_NODE(&waiter->tree.entry)) return; - rb_erase_cached(&waiter->tree_entry, &lock->waiters); - RB_CLEAR_NODE(&waiter->tree_entry); + rb_erase_cached(&waiter->tree.entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree.entry); } -#define __node_2_pi_waiter(node) \ - rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) +#define __node_2_rt_node(node) \ + rb_entry((node), struct rt_waiter_node, entry) -static __always_inline bool -__pi_waiter_less(struct rb_node *a, const struct rb_node *b) +static 
__always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) { - return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); + return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b)); } static __always_inline void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); + lockdep_assert_held(&task->pi_lock); + + rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less); } static __always_inline void rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + lockdep_assert_held(&task->pi_lock); + + if (RB_EMPTY_NODE(&waiter->pi_tree.entry)) return; - rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); - RB_CLEAR_NODE(&waiter->pi_tree_entry); + rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree.entry); } -static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) +static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock, + struct task_struct *p) { struct task_struct *pi_task = NULL; + lockdep_assert_held(&lock->wait_lock); + lockdep_assert(rt_mutex_owner(lock) == p); lockdep_assert_held(&p->pi_lock); if (task_has_pi_waiters(p)) @@ -571,9 +603,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st * Chain walk basics and protection scope * * [R] refcount on task - * [P] task->pi_lock held + * [Pn] task->pi_lock held * [L] rtmutex->wait_lock held * + * Normal locking order: + * + * rtmutex->wait_lock + * task->pi_lock + * * Step Description Protected by * function arguments: * @task [R] @@ -588,27 +625,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st * again: * loop_sanity_check(); * retry: - * [1] lock(task->pi_lock); [R] acquire [P] - * [2] waiter = task->pi_blocked_on; [P] - * [3] 
check_exit_conditions_1(); [P] - * [4] lock = waiter->lock; [P] - * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] - * unlock(task->pi_lock); release [P] + * [1] lock(task->pi_lock); [R] acquire [P1] + * [2] waiter = task->pi_blocked_on; [P1] + * [3] check_exit_conditions_1(); [P1] + * [4] lock = waiter->lock; [P1] + * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L] + * unlock(task->pi_lock); release [P1] * goto retry; * } - * [6] check_exit_conditions_2(); [P] + [L] - * [7] requeue_lock_waiter(lock, waiter); [P] + [L] - * [8] unlock(task->pi_lock); release [P] + * [6] check_exit_conditions_2(); [P1] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P1] + [L] + * [8] unlock(task->pi_lock); release [P1] * put_task_struct(task); release [R] * [9] check_exit_conditions_3(); [L] * [10] task = owner(lock); [L] * get_task_struct(task); [L] acquire [R] - * lock(task->pi_lock); [L] acquire [P] - * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] - * [12] check_exit_conditions_4(); [P] + [L] - * [13] unlock(task->pi_lock); release [P] + * lock(task->pi_lock); [L] acquire [P2] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L] + * [12] check_exit_conditions_4(); [P2] + [L] + * [13] unlock(task->pi_lock); release [P2] * unlock(lock->wait_lock); release [L] * goto again; + * + * Where P1 is the blocking task and P2 is the lock owner; going up one step + * the owner becomes the next blocked task etc.. + * +* */ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, enum rtmutex_chainwalk chwalk, @@ -756,7 +798,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. 
*/ - if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -764,13 +806,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * [4] Get the next lock + * [4] Get the next lock; per holding task->pi_lock we can't unblock + * and guarantee @lock's existence. */ lock = waiter->lock; /* * [5] We need to trylock here as we are holding task->pi_lock, * which is the reverse lock order versus the other rtmutex * operations. + * + * Per the above, holding task->pi_lock guarantees lock exists, so + * inverting this lock order is infeasible from a life-time + * perspective. */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irq(&task->pi_lock); @@ -874,17 +921,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * or * * DL CBS enforcement advancing the effective deadline. - * - * Even though pi_waiters also uses these fields, and that tree is only - * updated in [11], we can do this here, since we hold [L], which - * serializes all pi_waiters access and rb_erase() does not care about - * the values of the node being removed. */ waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); - /* [8] Release the task */ + /* + * [8] Release the (blocking) task in preparation for + * taking the owner task in [10]. + * + * Since we hold lock->waiter_lock, task cannot unblock, even if we + * release task->pi_lock. + */ raw_spin_unlock(&task->pi_lock); put_task_struct(task); @@ -908,7 +956,12 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, return 0; } - /* [10] Grab the next task, i.e. the owner of @lock */ + /* + * [10] Grab the next task, i.e. the owner of @lock + * + * Per holding lock->wait_lock and checking for !owner above, there + * must be an owner and it cannot go away. 
+ */ task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); @@ -921,8 +974,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * and adjust the priority of the owner. */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else if (prerequeue_top_waiter == waiter) { /* @@ -937,8 +991,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else { /* * Nothing changed. No need to do any priority @@ -1154,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, waiter->task = task; waiter->lock = lock; waiter_update_prio(waiter, task); + waiter_clone_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -1187,7 +1243,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1234,6 +1290,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, { struct rt_mutex_waiter *waiter; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(&current->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1246,7 +1304,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, * task unblocks. 
*/ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_adjust_prio(current); + rt_mutex_adjust_prio(lock, current); /* * As we are waking up the top waiter, and the waiter stays @@ -1482,7 +1540,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index cb9fdff76a8a..a6974d044593 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index c47e8361bfb5..1162e07cdaea 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -17,27 +17,44 @@ #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> + +/* + * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two + * separate trees and they need their own copy of the sort keys because of + * different locking requirements. + * + * @entry: rbtree node to enqueue into the waiters tree + * @prio: Priority of the waiter + * @deadline: Deadline of the waiter if applicable + * + * See rt_waiter_node_less() and waiter_*_prio(). + */ +struct rt_waiter_node { + struct rb_node entry; + int prio; + u64 deadline; +}; + /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. 
* - * @tree_entry: pi node to enqueue into the mutex waiters tree - * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree + * @tree: node to enqueue into the mutex waiters tree + * @pi_tree: node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task * @lock: Pointer to the rt_mutex on which the waiter blocks * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) - * @prio: Priority of the waiter - * @deadline: Deadline of the waiter if applicable * @ww_ctx: WW context pointer + * + * @tree is ordered by @lock->wait_lock + * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock */ struct rt_mutex_waiter { - struct rb_node tree_entry; - struct rb_node pi_tree_entry; + struct rt_waiter_node tree; + struct rt_waiter_node pi_tree; struct task_struct *task; struct rt_mutex_base *lock; unsigned int wake_state; - int prio; - u64 deadline; struct ww_acquire_ctx *ww_ctx; }; @@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, { struct rb_node *leftmost = rb_first_cached(&lock->waiters); - return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter; + return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter; } static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) @@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base * struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; + lockdep_assert_held(&lock->wait_lock); + if (leftmost) { - w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry); BUG_ON(w->lock != lock); } return w; @@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p) static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) { + lockdep_assert_held(&p->pi_lock); + return 
rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + pi_tree.entry); } #define RT_MUTEX_HAS_WAITERS 1UL @@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { debug_rt_mutex_init_waiter(waiter); - RB_CLEAR_NODE(&waiter->pi_tree_entry); - RB_CLEAR_NODE(&waiter->tree_entry); + RB_CLEAR_NODE(&waiter->pi_tree.entry); + RB_CLEAR_NODE(&waiter->tree.entry); waiter->wake_state = TASK_NORMAL; waiter->task = NULL; } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 56f139201f24..3ad2cc4823e5 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock) struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * __ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) { - struct rb_node *n = rb_next(&w->tree_entry); + struct rb_node *n = rb_next(&w->tree.entry); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) { - struct rb_node *n = rb_prev(&w->tree_entry); + struct rb_node *n = rb_prev(&w->tree.entry); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * @@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock) struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline void diff --git 
a/kernel/module/decompress.c b/kernel/module/decompress.c index 8a5d6d63b06c..87440f714c0c 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -241,7 +241,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, } wksp_size = zstd_dstream_workspace_bound(header.windowSize); - wksp = kmalloc(wksp_size, GFP_KERNEL); + wksp = vmalloc(wksp_size); if (!wksp) { retval = -ENOMEM; goto out; @@ -284,7 +284,7 @@ static ssize_t module_zstd_decompress(struct load_info *info, retval = new_size; out: - kfree(wksp); + vfree(wksp); return retval; } #else diff --git a/kernel/module/main.c b/kernel/module/main.c index 59b1d067e528..98fedfdb8db5 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1295,12 +1295,20 @@ void *__symbol_get(const char *symbol) }; preempt_disable(); - if (!find_symbol(&fsa) || strong_try_module_get(fsa.owner)) { - preempt_enable(); - return NULL; + if (!find_symbol(&fsa)) + goto fail; + if (fsa.license != GPL_ONLY) { + pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", + symbol); + goto fail; } + if (strong_try_module_get(fsa.owner)) + goto fail; preempt_enable(); return (void *)kernel_symbol_value(fsa.sym); +fail: + preempt_enable(); + return NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -1484,7 +1492,7 @@ long module_get_offset_and_type(struct module *mod, enum mod_mem_type type, return offset | mask; } -static bool module_init_layout_section(const char *sname) +bool module_init_layout_section(const char *sname) { #ifndef CONFIG_MODULE_UNLOAD if (module_exit_section(sname)) diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 80d9c6d77a45..15781acaac1c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -30,7 +30,7 @@ static struct kmem_cache *nsproxy_cachep; struct nsproxy init_nsproxy = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .uts_ns = &init_uts_ns, #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) .ipc_ns = &init_ipc_ns, @@ -55,7 +55,7 @@ static inline struct nsproxy 
*create_nsproxy(void) nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); if (nsproxy) - atomic_set(&nsproxy->count, 1); + refcount_set(&nsproxy->count, 1); return nsproxy; } diff --git a/kernel/panic.c b/kernel/panic.c index 10effe40a3fa..ffa037fa777d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -216,7 +216,7 @@ static void panic_print_sys_info(bool console_flush) show_state(); if (panic_print & PANIC_PRINT_MEM_INFO) - show_mem(0, NULL); + show_mem(); if (panic_print & PANIC_PRINT_TIMER_INFO) sysrq_timer_list_show(); @@ -697,6 +697,7 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, if (!fmt) { __warn(file, line, __builtin_return_address(0), taint, NULL, NULL); + warn_rcu_exit(rcu); return; } diff --git a/kernel/params.c b/kernel/params.c index 07d01f6ce9a2..2d4a0564697e 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -331,7 +331,7 @@ EXPORT_SYMBOL(param_ops_bool); int param_set_bool_enable_only(const char *val, const struct kernel_param *kp) { - int err = 0; + int err; bool new_value; bool orig_value = *(bool *)kp->arg; struct kernel_param dummy_kp = *kp; diff --git a/kernel/pid.c b/kernel/pid.c index 6a1d23a11026..fee14a4486a3 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -83,6 +83,9 @@ struct pid_namespace init_pid_ns = { #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, +#endif }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0bf44afe04dd..619972c78774 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -110,9 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; - - initialize_memfd_noexec_scope(ns); - +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + ns->memfd_noexec_scope = 
pidns_memfd_noexec_scope(parent_pid_ns); +#endif return ns; out_free_idr: diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index b26e027fc9cd..2ee41a3a1dfd 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -5,33 +5,30 @@ #include <linux/pid_namespace.h> #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) -static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) -{ - ns->memfd_noexec_scope = - task_active_pid_ns(current)->memfd_noexec_scope; -} - static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos) { struct pid_namespace *ns = task_active_pid_ns(current); struct ctl_table table_copy; + int err, scope, parent_scope; if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; table_copy = *table; - if (ns != &init_pid_ns) - table_copy.data = &ns->memfd_noexec_scope; - /* - * set minimum to current value, the effect is only bigger - * value is accepted. - */ - if (*(int *)table_copy.data > *(int *)table_copy.extra1) - table_copy.extra1 = table_copy.data; + /* You cannot set a lower enforcement value than your parent. */ + parent_scope = pidns_memfd_noexec_scope(ns->parent); + /* Equivalent to pidns_memfd_noexec_scope(ns). 
*/ + scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope); + + table_copy.data = &scope; + table_copy.extra1 = &parent_scope; - return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); + err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); + if (!err && write) + WRITE_ONCE(ns->memfd_noexec_scope, scope); + return err; } static struct ctl_table pid_ns_ctl_table_vm[] = { @@ -51,7 +48,6 @@ static inline void register_pid_ns_sysctl_table_vm(void) register_sysctl("vm", pid_ns_ctl_table_vm); } #else -static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} static inline void register_pid_ns_sysctl_table_vm(void) {} #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e1b4bfa938dd..8d35b9f9aaa3 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -786,9 +786,9 @@ int hibernate(void) unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); - error = swsusp_check(snapshot_test); + error = swsusp_check(false); if (!error) - error = load_image_and_restore(snapshot_test); + error = load_image_and_restore(false); } thaw_processes(); @@ -945,14 +945,14 @@ static int software_resume(void) pm_pr_dbg("Looking for hibernation image.\n"); mutex_lock(&system_transition_mutex); - error = swsusp_check(false); + error = swsusp_check(true); if (error) goto Unlock; /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(false); + swsusp_close(true); goto Unlock; } @@ -973,7 +973,7 @@ static int software_resume(void) goto Close_Finish; } - error = load_image_and_restore(false); + error = load_image_and_restore(true); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); @@ -987,7 +987,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(false); + swsusp_close(true); goto Finish; } @@ 
-1166,7 +1166,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, int error; if (!hibernation_available()) - return 0; + return n; if (len && buf[len-1] == '\n') len--; diff --git a/kernel/power/power.h b/kernel/power/power.h index 46eb14dc50c3..a98f95e309a3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -168,11 +168,11 @@ extern int swsusp_swap_in_use(void); #define SF_HW_SIG 8 /* kernel/power/hibernate.c */ -int swsusp_check(bool snapshot_test); +int swsusp_check(bool exclusive); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -void swsusp_close(bool snapshot_test); +void swsusp_close(bool exclusive); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); #endif diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 562aa0e450ed..1f306f158696 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -23,7 +23,7 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key) +static void handle_poweroff(u8 key) { /* run sysrq poweroff on boot cpu */ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 782d3b41c1f3..4244b069442e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -220,6 +220,11 @@ static struct pm_qos_constraints cpu_latency_constraints = { .type = PM_QOS_MIN, }; +static inline bool cpu_latency_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + /** * cpu_latency_qos_limit - Return current system-wide CPU latency QoS limit. 
*/ @@ -263,7 +268,7 @@ static void cpu_latency_qos_apply(struct pm_qos_request *req, */ void cpu_latency_qos_add_request(struct pm_qos_request *req, s32 value) { - if (!req) + if (!req || cpu_latency_qos_value_invalid(value)) return; if (cpu_latency_qos_request_active(req)) { @@ -289,7 +294,7 @@ EXPORT_SYMBOL_GPL(cpu_latency_qos_add_request); */ void cpu_latency_qos_update_request(struct pm_qos_request *req, s32 new_value) { - if (!req) + if (!req || cpu_latency_qos_value_invalid(new_value)) return; if (!cpu_latency_qos_request_active(req)) { diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0415d5ecb977..87e9f7e2bdc0 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -404,6 +404,7 @@ struct bm_position { struct mem_zone_bm_rtree *zone; struct rtree_node *node; unsigned long node_pfn; + unsigned long cur_pfn; int node_bit; }; @@ -589,6 +590,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) bm->cur.node = list_entry(bm->cur.zone->leaves.next, struct rtree_node, list); bm->cur.node_pfn = 0; + bm->cur.cur_pfn = BM_END_OF_MAP; bm->cur.node_bit = 0; } @@ -799,6 +801,7 @@ node_found: bm->cur.zone = zone; bm->cur.node = node; bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + bm->cur.cur_pfn = pfn; /* Set return values */ *addr = node->data; @@ -850,6 +853,11 @@ static void memory_bm_clear_current(struct memory_bitmap *bm) clear_bit(bit, bm->cur.node->data); } +static unsigned long memory_bm_get_current(struct memory_bitmap *bm) +{ + return bm->cur.cur_pfn; +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -929,10 +937,12 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) if (bit < bits) { pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; bm->cur.node_bit = bit + 1; + bm->cur.cur_pfn = pfn; return pfn; } } while (rtree_next_node(bm)); + bm->cur.cur_pfn = BM_END_OF_MAP; return BM_END_OF_MAP; } @@ -1423,14 +1433,19 @@ static unsigned int 
count_data_pages(void) /* * This is needed, because copy_page and memcpy are not usable for copying - * task structs. + * task structs. Returns true if the page was filled with only zeros, + * otherwise false. */ -static inline void do_copy_page(long *dst, long *src) +static inline bool do_copy_page(long *dst, long *src) { + long z = 0; int n; - for (n = PAGE_SIZE / sizeof(long); n; n--) + for (n = PAGE_SIZE / sizeof(long); n; n--) { + z |= *src; *dst++ = *src++; + } + return !z; } /** @@ -1439,17 +1454,21 @@ static inline void do_copy_page(long *dst, long *src) * Check if the page we are going to copy is marked as present in the kernel * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() - * always returns 'true'. + * always returns 'true'. Returns true if the page was entirely composed of + * zeros, otherwise it will return false. */ -static void safe_copy_page(void *dst, struct page *s_page) +static bool safe_copy_page(void *dst, struct page *s_page) { + bool zeros_only; + if (kernel_page_present(s_page)) { - do_copy_page(dst, page_address(s_page)); + zeros_only = do_copy_page(dst, page_address(s_page)); } else { hibernate_map_page(s_page); - do_copy_page(dst, page_address(s_page)); + zeros_only = do_copy_page(dst, page_address(s_page)); hibernate_unmap_page(s_page); } + return zeros_only; } #ifdef CONFIG_HIGHMEM @@ -1459,17 +1478,18 @@ static inline struct page *page_is_saveable(struct zone *zone, unsigned long pfn saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } -static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static bool copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { struct page *s_page, *d_page; void *src, *dst; + bool zeros_only; s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { src = kmap_atomic(s_page); dst = kmap_atomic(d_page); - do_copy_page(dst, src); + zeros_only 
= do_copy_page(dst, src); kunmap_atomic(dst); kunmap_atomic(src); } else { @@ -1478,30 +1498,39 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * The page pointed to by src may contain some kernel * data modified by kmap_atomic() */ - safe_copy_page(buffer, s_page); + zeros_only = safe_copy_page(buffer, s_page); dst = kmap_atomic(d_page); copy_page(dst, buffer); kunmap_atomic(dst); } else { - safe_copy_page(page_address(d_page), s_page); + zeros_only = safe_copy_page(page_address(d_page), s_page); } } + return zeros_only; } #else #define page_is_saveable(zone, pfn) saveable_page(zone, pfn) -static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static inline int copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { - safe_copy_page(page_address(pfn_to_page(dst_pfn)), + return safe_copy_page(page_address(pfn_to_page(dst_pfn)), pfn_to_page(src_pfn)); } #endif /* CONFIG_HIGHMEM */ -static void copy_data_pages(struct memory_bitmap *copy_bm, - struct memory_bitmap *orig_bm) +/* + * Copy data pages will copy all pages into pages pulled from the copy_bm. + * If a page was entirely filled with zeros it will be marked in the zero_bm. + * + * Returns the number of pages copied. 
+ */ +static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, + struct memory_bitmap *orig_bm, + struct memory_bitmap *zero_bm) { + unsigned long copied_pages = 0; struct zone *zone; - unsigned long pfn; + unsigned long pfn, copy_pfn; for_each_populated_zone(zone) { unsigned long max_zone_pfn; @@ -1514,18 +1543,29 @@ static void copy_data_pages(struct memory_bitmap *copy_bm, } memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); + copy_pfn = memory_bm_next_pfn(copy_bm); for(;;) { pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; - copy_data_page(memory_bm_next_pfn(copy_bm), pfn); + if (copy_data_page(copy_pfn, pfn)) { + memory_bm_set_bit(zero_bm, pfn); + /* Use this copy_pfn for a page that is not full of zeros */ + continue; + } + copied_pages++; + copy_pfn = memory_bm_next_pfn(copy_bm); } + return copied_pages; } /* Total number of image pages */ static unsigned int nr_copy_pages; /* Number of pages needed for saving the original pfns of the image pages */ static unsigned int nr_meta_pages; +/* Number of zero pages */ +static unsigned int nr_zero_pages; + /* * Numbers of normal and highmem page frames allocated for hibernation image * before suspending devices. @@ -1546,6 +1586,9 @@ static struct memory_bitmap orig_bm; */ static struct memory_bitmap copy_bm; +/* Memory bitmap which tracks which saveable pages were zero filled. */ +static struct memory_bitmap zero_bm; + /** * swsusp_free - Free pages allocated for hibernation image. * @@ -1590,6 +1633,7 @@ loop: out: nr_copy_pages = 0; nr_meta_pages = 0; + nr_zero_pages = 0; restore_pblist = NULL; buffer = NULL; alloc_normal = 0; @@ -1808,8 +1852,15 @@ int hibernate_preallocate_memory(void) goto err_out; } + error = memory_bm_create(&zero_bm, GFP_IMAGE, PG_ANY); + if (error) { + pr_err("Cannot allocate zero bitmap\n"); + goto err_out; + } + alloc_normal = 0; alloc_highmem = 0; + nr_zero_pages = 0; /* Count the number of saveable data pages. 
*/ save_highmem = count_highmem_pages(); @@ -2089,19 +2140,19 @@ asmlinkage __visible int swsusp_save(void) * Kill them. */ drain_local_pages(NULL); - copy_data_pages(&copy_bm, &orig_bm); + nr_copy_pages = copy_data_pages(&copy_bm, &orig_bm, &zero_bm); /* * End of critical section. From now on, we can write to memory, * but we should not touch disk. This specially means we must _not_ * touch swap space! Except we must write out our image of course. */ - nr_pages += nr_highmem; - nr_copy_pages = nr_pages; + /* We don't actually copy the zero pages */ + nr_zero_pages = nr_pages - nr_copy_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Image created (%d pages copied)\n", nr_pages); + pr_info("Image created (%d pages copied, %d zero pages)\n", nr_copy_pages, nr_zero_pages); return 0; } @@ -2146,15 +2197,22 @@ static int init_header(struct swsusp_info *info) return init_header_complete(info); } +#define ENCODED_PFN_ZERO_FLAG ((unsigned long)1 << (BITS_PER_LONG - 1)) +#define ENCODED_PFN_MASK (~ENCODED_PFN_ZERO_FLAG) + /** * pack_pfns - Prepare PFNs for saving. * @bm: Memory bitmap. * @buf: Memory buffer to store the PFNs in. + * @zero_bm: Memory bitmap containing PFNs of zero pages. * * PFNs corresponding to set bits in @bm are stored in the area of memory - * pointed to by @buf (1 page at a time). + * pointed to by @buf (1 page at a time). Pages which were filled with only + * zeros will have the highest bit set in the packed format to distinguish + * them from PFNs which will be contained in the image file. 
*/ -static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { int j; @@ -2162,6 +2220,8 @@ static inline void pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; + if (memory_bm_test_bit(zero_bm, buf[j])) + buf[j] |= ENCODED_PFN_ZERO_FLAG; } } @@ -2203,7 +2263,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(&copy_bm); } else if (handle->cur <= nr_meta_pages) { clear_page(buffer); - pack_pfns(buffer, &orig_bm); + pack_pfns(buffer, &orig_bm, &zero_bm); } else { struct page *page; @@ -2299,24 +2359,35 @@ static int load_header(struct swsusp_info *info) * unpack_orig_pfns - Set bits corresponding to given PFNs in a memory bitmap. * @bm: Memory bitmap. * @buf: Area of memory containing the PFNs. + * @zero_bm: Memory bitmap with the zero PFNs marked. * * For each element of the array pointed to by @buf (1 page at a time), set the - * corresponding bit in @bm. + * corresponding bit in @bm. If the page was originally populated with only + * zeros then a corresponding bit will also be set in @zero_bm. 
*/ -static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { + unsigned long decoded_pfn; + bool zero; int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { if (unlikely(buf[j] == BM_END_OF_MAP)) break; - if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) { - memory_bm_set_bit(bm, buf[j]); + zero = !!(buf[j] & ENCODED_PFN_ZERO_FLAG); + decoded_pfn = buf[j] & ENCODED_PFN_MASK; + if (pfn_valid(decoded_pfn) && memory_bm_pfn_present(bm, decoded_pfn)) { + memory_bm_set_bit(bm, decoded_pfn); + if (zero) { + memory_bm_set_bit(zero_bm, decoded_pfn); + nr_zero_pages++; + } } else { - if (!pfn_valid(buf[j])) + if (!pfn_valid(decoded_pfn)) pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n", - (unsigned long long)PFN_PHYS(buf[j])); + (unsigned long long)PFN_PHYS(decoded_pfn)); return -EFAULT; } } @@ -2538,6 +2609,7 @@ static inline void free_highmem_data(void) {} * prepare_image - Make room for loading hibernation image. * @new_bm: Uninitialized memory bitmap structure. * @bm: Memory bitmap with unsafe pages marked. + * @zero_bm: Memory bitmap containing the zero pages. * * Use @bm to mark the pages that will be overwritten in the process of * restoring the system memory state from the suspend image ("unsafe" pages) @@ -2548,10 +2620,15 @@ static inline void free_highmem_data(void) {} * pages will be used for just yet. Instead, we mark them all as allocated and * create a lists of "safe" pages to be used later. On systems with high * memory a list of "safe" highmem pages is created too. + * + * Because it was not known which pages were unsafe when @zero_bm was created, + * make a copy of it and recreate it within safe pages. 
*/ -static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) +static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, + struct memory_bitmap *zero_bm) { unsigned int nr_pages, nr_highmem; + struct memory_bitmap tmp; struct linked_page *lp; int error; @@ -2568,6 +2645,24 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) duplicate_memory_bitmap(new_bm, bm); memory_bm_free(bm, PG_UNSAFE_KEEP); + + /* Make a copy of zero_bm so it can be created in safe pages */ + error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY); + if (error) + goto Free; + + duplicate_memory_bitmap(&tmp, zero_bm); + memory_bm_free(zero_bm, PG_UNSAFE_KEEP); + + /* Recreate zero_bm in safe pages */ + error = memory_bm_create(zero_bm, GFP_ATOMIC, PG_SAFE); + if (error) + goto Free; + + duplicate_memory_bitmap(zero_bm, &tmp); + memory_bm_free(&tmp, PG_UNSAFE_KEEP); + /* At this point zero_bm is in safe pages and it can be used for restoring. */ + if (nr_highmem > 0) { error = prepare_highmem_image(bm, &nr_highmem); if (error) @@ -2582,7 +2677,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) * * nr_copy_pages cannot be less than allocated_unsafe_pages too. 
*/ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); while (nr_pages > 0) { lp = get_image_page(GFP_ATOMIC, PG_SAFE); @@ -2595,7 +2690,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) nr_pages--; } /* Preallocate memory for the image */ - nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; + nr_pages = (nr_zero_pages + nr_copy_pages) - nr_highmem - allocated_unsafe_pages; while (nr_pages > 0) { lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); if (!lp) { @@ -2683,8 +2778,9 @@ int snapshot_write_next(struct snapshot_handle *handle) static struct chain_allocator ca; int error = 0; +next: /* Check if we have already loaded the entire image */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) return 0; handle->sync_read = 1; @@ -2709,19 +2805,26 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + error = memory_bm_create(&zero_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; + + nr_zero_pages = 0; + hibernate_restore_protection_begin(); } else if (handle->cur <= nr_meta_pages + 1) { - error = unpack_orig_pfns(buffer, &copy_bm); + error = unpack_orig_pfns(buffer, &copy_bm, &zero_bm); if (error) return error; if (handle->cur == nr_meta_pages + 1) { - error = prepare_image(&orig_bm, &copy_bm); + error = prepare_image(&orig_bm, &copy_bm, &zero_bm); if (error) return error; chain_init(&ca, GFP_ATOMIC, PG_SAFE); memory_bm_position_reset(&orig_bm); + memory_bm_position_reset(&zero_bm); restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; @@ -2738,6 +2841,14 @@ int snapshot_write_next(struct snapshot_handle *handle) handle->sync_read = 0; } handle->cur++; + + /* Zero pages were not included in the image, 
memset it and move on. */ + if (handle->cur > nr_meta_pages + 1 && + memory_bm_test_bit(&zero_bm, memory_bm_get_current(&orig_bm))) { + memset(handle->buffer, 0, PAGE_SIZE); + goto next; + } + return PAGE_SIZE; } @@ -2754,7 +2865,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) copy_last_highmem_page(); hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ - if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages + nr_zero_pages) { memory_bm_recycle(&orig_bm); free_highmem_data(); } @@ -2763,7 +2874,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) int snapshot_image_loaded(struct snapshot_handle *handle) { return !(!nr_copy_pages || !last_highmem_page_copied() || - handle->cur <= nr_meta_pages + nr_copy_pages); + handle->cur <= nr_meta_pages + nr_copy_pages + nr_zero_pages); } #ifdef CONFIG_HIGHMEM diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f6ebcd00c410..74edbce2320b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1513,12 +1513,13 @@ end: static void *swsusp_holder; /** - * swsusp_check - Check for swsusp signature in the resume device + * swsusp_check - Check for swsusp signature in the resume device + * @exclusive: Open the resume device exclusively. */ -int swsusp_check(bool snapshot_test) +int swsusp_check(bool exclusive) { - void *holder = snapshot_test ? &swsusp_holder : NULL; + void *holder = exclusive ? &swsusp_holder : NULL; int error; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ, @@ -1563,17 +1564,18 @@ put: } /** - * swsusp_close - close swap device. + * swsusp_close - close swap device. + * @exclusive: Close the resume device which is exclusively opened. 
*/ -void swsusp_close(bool snapshot_test) +void swsusp_close(bool exclusive) { if (IS_ERR(hib_resume_bdev)) { pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL); + blkdev_put(hib_resume_bdev, exclusive ? &swsusp_holder : NULL); } /** diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2a17704136f1..7d4979d5c3ce 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -103,3 +103,5 @@ struct printk_message { u64 seq; unsigned long dropped; }; + +bool other_cpu_in_panic(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 357a4d18f638..7e0b4dd02398 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -88,7 +88,7 @@ EXPORT_SYMBOL(oops_in_progress); static DEFINE_MUTEX(console_mutex); /* - * console_sem protects updates to console->seq and console_suspended, + * console_sem protects updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); @@ -361,7 +361,7 @@ static bool panic_in_progress(void) * paths in the console code where we end up in places I want * locked without the console semaphore held). 
*/ -static int console_locked, console_suspended; +static int console_locked; /* * Array of consoles built from command line options (console=) @@ -2308,7 +2308,11 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - wake_up_klogd(); + if (in_sched) + defer_console_output(); + else + wake_up_klogd(); + return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2547,22 +2551,46 @@ MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to hig */ void suspend_console(void) { + struct console *con; + if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); - console_lock(); - console_suspended = 1; - up_console_sem(); + + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags | CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see that they are suspended so that it + * is guaranteed that all printing has stopped when this function + * completes. + */ + synchronize_srcu(&console_srcu); } void resume_console(void) { + struct console *con; + if (!console_suspend_enabled) return; - down_console_sem(); - console_suspended = 0; - console_unlock(); + + console_list_lock(); + for_each_console(con) + console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); + console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. All printing + * contexts must be able to see they are no longer suspended so + * that they are guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + pr_flush(1000, true); } @@ -2585,6 +2613,26 @@ static int console_cpu_notify(unsigned int cpu) return 0; } +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. 
+ */ +bool other_cpu_in_panic(void) +{ + if (!panic_in_progress()) + return false; + + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return atomic_read(&panic_cpu) != raw_smp_processor_id(); +} + /** * console_lock - block the console subsystem from printing * @@ -2597,9 +2645,11 @@ void console_lock(void) { might_sleep(); + /* On panic, the console_lock must be left to the panic cpu. */ + while (other_cpu_in_panic()) + msleep(1000); + down_console_sem(); - if (console_suspended) - return; console_locked = 1; console_may_schedule = 1; } @@ -2615,12 +2665,11 @@ EXPORT_SYMBOL(console_lock); */ int console_trylock(void) { - if (down_trylock_console_sem()) + /* On panic, the console_lock must be left to the panic cpu. */ + if (other_cpu_in_panic()) return 0; - if (console_suspended) { - up_console_sem(); + if (down_trylock_console_sem()) return 0; - } console_locked = 1; console_may_schedule = 0; return 1; @@ -2634,25 +2683,6 @@ int is_console_locked(void) EXPORT_SYMBOL(is_console_locked); /* - * Return true when this CPU should unlock console_sem without pushing all - * messages to the console. This reduces the chance that the console is - * locked when the panic CPU tries to use it. - */ -static bool abandon_console_lock_in_panic(void) -{ - if (!panic_in_progress()) - return false; - - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return atomic_read(&panic_cpu) != raw_smp_processor_id(); -} - -/* * Check if the given console is currently capable and allowed to print * records. 
* @@ -2665,6 +2695,9 @@ static inline bool console_is_usable(struct console *con) if (!(flags & CON_ENABLED)) return false; + if ((flags & CON_SUSPENDED)) + return false; + if (!con->write) return false; @@ -2948,7 +2981,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ - if (abandon_console_lock_in_panic()) + if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) @@ -2983,11 +3016,6 @@ void console_unlock(void) bool flushed; u64 next_seq; - if (console_suspended) { - up_console_sem(); - return; - } - /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -3045,10 +3073,28 @@ EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { + bool found_unblank = false; struct console *c; int cookie; /* + * First check if there are any consoles implementing the unblank() + * callback. If not, there is no reason to continue and take the + * console lock, which in particular can be dangerous if + * @oops_in_progress is set. + */ + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { + found_unblank = true; + break; + } + } + console_srcu_read_unlock(cookie); + if (!found_unblank) + return; + + /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * @@ -3056,6 +3102,16 @@ void console_unblank(void) * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { + /* Semaphores are not NMI-safe. */ + if (in_nmi()) + return; + + /* + * Attempting to trylock the console lock can deadlock + * if another CPU was stopped while modifying the + * semaphore. "Hope and pray" that this is not the + * current situation. 
+ */ if (down_trylock_console_sem() != 0) return; } else @@ -3085,14 +3141,24 @@ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { + bool handover; + u64 next_seq; + /* - * If someone else is holding the console lock, trylock will fail - * and may_schedule may be set. Ignore and proceed to unlock so - * that messages are flushed out. As this can be called from any - * context and we don't want to get preempted while flushing, - * ensure may_schedule is cleared. + * Ignore the console lock and flush out the messages. Attempting a + * trylock would not be useful because: + * + * - if it is contended, it must be ignored anyway + * - console_lock() and console_trylock() block and fail + * respectively in panic for non-panic CPUs + * - semaphores are not NMI-safe + */ + + /* + * If another context is holding the console lock, + * @console_may_schedule might be set. Clear it so that + * this context does not call cond_resched() while flushing. */ - console_trylock(); console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) { @@ -3105,15 +3171,15 @@ void console_flush_on_panic(enum con_flush_mode mode) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { /* - * If the above console_trylock() failed, this is an - * unsynchronized assignment. But in that case, the + * This is an unsynchronized assignment, but the * kernel is in "hope and pray" mode anyway. */ c->seq = seq; } console_srcu_read_unlock(cookie); } - console_unlock(); + + console_flush_all(false, &next_seq, &handover); } /* @@ -3679,8 +3745,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre /* * Hold the console_lock to guarantee safe access to - * console->seq and to prevent changes to @console_suspended - * until all consoles have been processed. + * console->seq. 
*/ console_lock(); @@ -3688,6 +3753,11 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre for_each_console_srcu(c) { if (con && con != c) continue; + /* + * If consoles are not usable, it cannot be expected + * that they make forward progress, so only increment + * @diff for usable consoles. + */ if (!console_is_usable(c)) continue; printk_seq = c->seq; @@ -3696,18 +3766,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre } console_srcu_read_unlock(cookie); - /* - * If consoles are suspended, it cannot be expected that they - * make forward progress, so timeout immediately. @diff is - * still used to return a valid flush status. - */ - if (console_suspended) - remaining = 0; - else if (diff != last_diff && reset_on_progress) + if (diff != last_diff && reset_on_progress) remaining = timeout_ms; console_unlock(); + /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining == 0) break; @@ -3741,7 +3805,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. - * Return: true if all enabled printers are caught up. + * Return: true if all usable printers are caught up. */ static bool pr_flush(int timeout_ms, bool reset_on_progress) { @@ -3798,11 +3862,33 @@ static void __wake_up_klogd(int val) preempt_enable(); } +/** + * wake_up_klogd - Wake kernel logging daemon + * + * Use this function when new records have been added to the ringbuffer + * and the console printing of those records has already occurred or is + * known to be handled by some other context. This function will only + * wake the logging daemon. + * + * Context: Any context. 
+ */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } +/** + * defer_console_output - Wake kernel logging daemon and trigger + * console printing in a deferred context + * + * Use this function when new records have been added to the ringbuffer, + * this context is responsible for console printing those records, but + * the current context is not allowed to perform the console printing. + * Trigger an irq_work context to perform the console printing. This + * function also wakes the logging daemon. + * + * Context: Any context. + */ void defer_console_output(void) { /* @@ -3819,12 +3905,7 @@ void printk_trigger_flush(void) int vprintk_deferred(const char *fmt, va_list args) { - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); - defer_console_output(); - - return r; + return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 2dc4d5a1f1ff..fde338606ce8 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1735,7 +1735,7 @@ static bool copy_data(struct prb_data_ring *data_ring, if (!buf || !buf_size) return true; - data_size = min_t(u16, buf_size, len); + data_size = min_t(unsigned int, buf_size, len); memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index ef0f9a2044da..6d10927a07d8 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -38,13 +38,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. 
*/ - if (this_cpu_read(printk_context) || in_nmi()) { - int len; - - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - defer_console_output(); - return len; - } + if (this_cpu_read(printk_context) || in_nmi()) + return vprintk_deferred(fmt, args); /* No obstacles. */ return vprintk_default(fmt, args); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98c1544cf572..98e13be411af 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -493,7 +493,6 @@ static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } static inline void rcu_async_hurry(void) { } static inline void rcu_async_relax(void) { } -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ @@ -508,9 +507,16 @@ void show_rcu_tasks_gp_kthreads(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void show_rcu_tasks_gp_kthreads(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ -void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TASKS_RCU +struct task_struct *get_rcu_tasks_gp_kthread(void); +#endif // # ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_RUDE_RCU +struct task_struct *get_rcu_tasks_rude_gp_kthread(void); +#endif // # ifdef CONFIG_TASKS_RUDE_RCU + #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index d1221731c7cf..ffdb30495e3c 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -84,15 +84,17 @@ MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.ibm.com>"); #endif torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); -torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, minruntime, 0, "Minimum run time (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, "Shutdown at end of scalability tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?"); @@ -139,6 +141,7 @@ struct rcu_scale_ops { void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); + struct task_struct *(*rso_gp_kthread)(void); const char *name; }; @@ -295,6 +298,7 @@ static struct rcu_scale_ops tasks_ops = { .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, + .rso_gp_kthread = get_rcu_tasks_gp_kthread, .name = "tasks" }; @@ -306,6 +310,44 @@ static struct rcu_scale_ops tasks_ops = { #endif // #else // #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_RUDE_RCU + +/* + * Definitions for RCU-tasks-rude scalability testing. 
+ */ + +static int tasks_rude_scale_read_lock(void) +{ + return 0; +} + +static void tasks_rude_scale_read_unlock(int idx) +{ +} + +static struct rcu_scale_ops tasks_rude_ops = { + .ptype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_rude_scale_read_lock, + .readunlock = tasks_rude_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks_rude, + .gp_barrier = rcu_barrier_tasks_rude, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .name = "tasks-rude" +}; + +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU + #ifdef CONFIG_TASKS_TRACE_RCU /* @@ -334,6 +376,7 @@ static struct rcu_scale_ops tasks_tracing_ops = { .gp_barrier = rcu_barrier_tasks_trace, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, + .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, .name = "tasks-tracing" }; @@ -410,10 +453,12 @@ rcu_scale_writer(void *arg) { int i = 0; int i_max; + unsigned long jdone; long me = (long)arg; struct rcu_head *rhp = NULL; bool started = false, done = false, alldone = false; u64 t; + DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; @@ -424,7 +469,7 @@ rcu_scale_writer(void *arg) sched_set_fifo_low(current); if (holdoff) - schedule_timeout_uninterruptible(holdoff * HZ); + schedule_timeout_idle(holdoff * HZ); /* * Wait until rcu_end_inkernel_boot() is called for normal GP tests @@ -445,9 +490,12 @@ rcu_scale_writer(void *arg) } } + jdone = jiffies + minruntime * HZ; do { if (writer_holdoff) udelay(writer_holdoff); + if (writer_holdoff_jiffies) + schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async) { @@ -475,7 +523,7 @@ retry: if (!started && 
atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) started = true; - if (!done && i >= MIN_MEAS) { + if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { done = true; sched_set_normal(current, 0); pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", @@ -518,8 +566,8 @@ static void rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); + "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n", + scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown); } /* @@ -556,6 +604,8 @@ static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; static atomic_t n_kfree_scale_thread_started; static atomic_t n_kfree_scale_thread_ended; +static struct task_struct *kthread_tp; +static u64 kthread_stime; struct kfree_obj { char kfree_obj[8]; @@ -701,6 +751,10 @@ kfree_scale_init(void) unsigned long jif_start; unsigned long orig_jif; + pr_alert("%s" SCALE_FLAG + "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n", + scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single); + // Also, do a quick self-test to ensure laziness is as much as // expected. if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) { @@ -797,6 +851,18 @@ rcu_scale_cleanup(void) if (gp_exp && gp_async) SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + // If built-in, just report all of the GP kthread's CPU time. 
+ if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread) + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) { + u32 ns; + u64 us; + + kthread_stime = kthread_tp->stime - kthread_stime; + us = div_u64_rem(kthread_stime, 1000, &ns); + pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns); + show_rcu_gp_kthreads(); + } if (kfree_rcu_test) { kfree_scale_cleanup(); return; @@ -885,7 +951,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) @@ -910,6 +976,11 @@ rcu_scale_init(void) if (cur_ops->init) cur_ops->init(); + if (cur_ops->rso_gp_kthread) { + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) + kthread_stime = kthread_tp->stime; + } if (kfree_rcu_test) return kfree_scale_init(); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 147551c23baf..ade42d6a9d9b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1581,6 +1581,7 @@ rcu_torture_writer(void *arg) rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); + show_rcu_gp_kthreads(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); } @@ -1876,7 +1877,7 @@ static int rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) { int mask = rcutorture_extend_mask_max(); - unsigned long randmask1 = torture_random(trsp) >> 8; + unsigned long randmask1 = torture_random(trsp); unsigned long randmask2 = randmask1 >> 3; unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; @@ -1935,7 +1936,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, if (!((mask - 1) & mask)) return rtrsp; /* Current RCU 
reader not extendable. */ /* Bias towards larger numbers of loops. */ - i = (torture_random(trsp) >> 3); + i = torture_random(trsp); i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); @@ -2136,7 +2137,7 @@ static int rcu_nocb_toggle(void *arg) toggle_fuzz = NSEC_PER_USEC; do { r = torture_random(&rand); - cpu = (r >> 4) % (maxcpu + 1); + cpu = (r >> 1) % (maxcpu + 1); if (r & 0x1) { rcu_nocb_cpu_offload(cpu); atomic_long_inc(&n_nocb_offload); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1970ce5f22d4..91a0fd0d4d9a 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -528,6 +528,38 @@ static struct ref_scale_ops clock_ops = { .name = "clock" }; +static void ref_jiffies_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += jiffies; + preempt_enable(); + stopopts = x; +} + +static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += jiffies; + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static struct ref_scale_ops jiffies_ops = { + .readsection = ref_jiffies_section, + .delaysection = ref_jiffies_delay_section, + .name = "jiffies" +}; + //////////////////////////////////////////////////////////////////////// // // Methods leveraging SLAB_TYPESAFE_BY_RCU. 
@@ -1047,7 +1079,7 @@ ref_scale_init(void) int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, + &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; @@ -1107,12 +1139,11 @@ ref_scale_init(void) VERBOSE_SCALEOUT("Starting %d reader threads", nreaders); for (i = 0; i < nreaders; i++) { + init_waitqueue_head(&reader_tasks[i].wq); firsterr = torture_create_kthread(ref_scale_reader, (void *)i, reader_tasks[i].task); if (torture_init_error(firsterr)) goto unwind; - - init_waitqueue_head(&(reader_tasks[i].wq)); } // Main Task diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b770add3f843..8d65f7d576a3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -25,6 +25,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @cblist: Callback list. * @lock: Lock protecting per-CPU callback list. * @rtp_jiffies: Jiffies counter value for statistics. + * @lazy_timer: Timer to unlazify callbacks. + * @urgent_gp: Number of additional non-lazy grace periods. * @rtp_n_lock_retries: Rough lock-contention statistic. * @rtp_work: Work queue for invoking callbacks. * @rtp_irq_work: IRQ work queue for deferred wakeups. @@ -38,6 +40,8 @@ struct rcu_tasks_percpu { raw_spinlock_t __private lock; unsigned long rtp_jiffies; unsigned long rtp_n_lock_retries; + struct timer_list lazy_timer; + unsigned int urgent_gp; struct work_struct rtp_work; struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; @@ -51,7 +55,6 @@ struct rcu_tasks_percpu { * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone. - * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. 
* @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping. @@ -61,6 +64,8 @@ struct rcu_tasks_percpu { * @tasks_gp_seq: Number of grace periods completed since boot. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -92,6 +97,7 @@ struct rcu_tasks { unsigned long n_ipis; unsigned long n_ipis_fails; struct task_struct *kthread_ptr; + unsigned long lazy_jiffies; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; pertask_func_t pertask_func; @@ -127,6 +133,7 @@ static struct rcu_tasks rt_name = \ .gp_func = gp, \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ + .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \ .name = n, \ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ @@ -139,9 +146,7 @@ static struct rcu_tasks rt_name = \ #ifdef CONFIG_TASKS_RCU /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); -#endif -#ifdef CONFIG_TASKS_RCU /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). 
*/ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); @@ -171,6 +176,8 @@ static int rcu_task_contend_lim __read_mostly = 100; module_param(rcu_task_contend_lim, int, 0444); static int rcu_task_collapse_lim __read_mostly = 10; module_param(rcu_task_collapse_lim, int, 0444); +static int rcu_task_lazy_lim __read_mostly = 32; +module_param(rcu_task_lazy_lim, int, 0444); /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 @@ -229,7 +236,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) #endif /* #ifndef CONFIG_TINY_RCU */ // Initialize per-CPU callback lists for the specified flavor of -// Tasks RCU. +// Tasks RCU. Do not enqueue callbacks before this function is invoked. static void cblist_init_generic(struct rcu_tasks *rtp) { int cpu; @@ -237,7 +244,6 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int lim; int shift; - raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; rcu_task_cb_adjust = true; @@ -260,22 +266,48 @@ static void cblist_init_generic(struct rcu_tasks *rtp) WARN_ON_ONCE(!rtpcp); if (cpu) raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + local_irq_save(flags); // serialize initialization if (rcu_segcblist_empty(&rtpcp->cblist)) rcu_segcblist_init(&rtpcp->cblist); + local_irq_restore(flags); INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); - raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } - raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); } +// Compute wakeup time for lazy callback timer. 
+static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp) +{ + return jiffies + rtp->lazy_jiffies; +} + +// Timer handler that unlazifies lazy callbacks. +static void call_rcu_tasks_generic_timer(struct timer_list *tlp) +{ + unsigned long flags; + bool needwake = false; + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer); + + rtp = rtpcp->rtpp; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) { + if (!rtpcp->urgent_gp) + rtpcp->urgent_gp = 1; + needwake = true; + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (needwake) + rcuwait_wake_up(&rtp->cbs_wait); +} + // IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) { @@ -292,6 +324,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, { int chosen_cpu; unsigned long flags; + bool havekthread = smp_load_acquire(&rtp->kthread_ptr); int ideal_cpu; unsigned long j; bool needadjust = false; @@ -316,12 +349,19 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) needadjust = true; // Defer adjustment to avoid deadlock. } - if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { - raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. - cblist_init_generic(rtp); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + // Queuing callbacks before initialization not yet supported. 
+ if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist))) + rcu_segcblist_init(&rtpcp->cblist); + needwake = (func == wakeme_after_rcu) || + (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim); + if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) { + if (rtp->lazy_jiffies) + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + else + needwake = rcu_segcblist_empty(&rtpcp->cblist); } - needwake = rcu_segcblist_empty(&rtpcp->cblist); + if (needwake) + rtpcp->urgent_gp = 3; rcu_segcblist_enqueue(&rtpcp->cblist, rhp); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); if (unlikely(needadjust)) { @@ -415,9 +455,14 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) } rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) + if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) { + if (rtp->lazy_jiffies) + rtpcp->urgent_gp--; needgpcb |= 0x3; - if (!rcu_segcblist_empty(&rtpcp->cblist)) + } else if (rcu_segcblist_empty(&rtpcp->cblist)) { + rtpcp->urgent_gp = 0; + } + if (rcu_segcblist_ready_cbs(&rtpcp->cblist)) needgpcb |= 0x1; raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } @@ -525,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) if (unlikely(midboot)) { needgpcb = 0x2; } else { + mutex_unlock(&rtp->tasks_gp_mutex); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); rcuwait_wait_event(&rtp->cbs_wait, (needgpcb = rcu_tasks_need_gpcb(rtp)), TASK_IDLE); + mutex_lock(&rtp->tasks_gp_mutex); } if (needgpcb & 0x2) { @@ -549,11 +596,19 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) // RCU-tasks kthread that detects grace periods and invokes callbacks. 
static int __noreturn rcu_tasks_kthread(void *arg) { + int cpu; struct rcu_tasks *rtp = arg; + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0); + rtpcp->urgent_gp = 1; + } + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_TYPE_RCU); - WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! + smp_store_release(&rtp->kthread_ptr, current); // Let GPs start! /* * Each pass through the following loop makes one check for @@ -635,16 +690,22 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { int cpu; bool havecbs = false; + bool haveurgent = false; + bool haveurgentcbs = false; for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); - if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) havecbs = true; + if (data_race(rtpcp->urgent_gp)) + haveurgent = true; + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp)) + haveurgentcbs = true; + if (havecbs && haveurgent && haveurgentcbs) break; - } } - pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), @@ -652,6 +713,9 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[havecbs], + ".u"[haveurgent], + ".U"[haveurgentcbs], + rtp->lazy_jiffies, s); } #endif // #ifndef CONFIG_TINY_RCU @@ -1020,11 +1084,16 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); +int rcu_tasks_lazy_ms = -1; +module_param(rcu_tasks_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_kthread(void) { 
cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; + if (rcu_tasks_lazy_ms >= 0) + rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms); rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; @@ -1042,6 +1111,12 @@ void show_rcu_tasks_classic_gp_kthread(void) EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) +struct task_struct *get_rcu_tasks_gp_kthread(void) +{ + return rcu_tasks.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); + /* * Contribute to protect against tasklist scan blind spot while the * task is exiting and may be removed from the tasklist. See @@ -1173,10 +1248,15 @@ void rcu_barrier_tasks_rude(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); +int rcu_tasks_rude_lazy_ms = -1; +module_param(rcu_tasks_rude_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_rude_kthread(void) { cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; + if (rcu_tasks_rude_lazy_ms >= 0) + rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -1188,6 +1268,13 @@ void show_rcu_tasks_rude_gp_kthread(void) } EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) + +struct task_struct *get_rcu_tasks_rude_gp_kthread(void) +{ + return rcu_tasks_rude.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread); + #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// @@ -1793,6 +1880,9 @@ void rcu_barrier_tasks_trace(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); +int rcu_tasks_trace_lazy_ms = -1; +module_param(rcu_tasks_trace_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_trace_kthread(void) { cblist_init_generic(&rcu_tasks_trace); @@ -1807,6 +1897,8 @@ static int __init 
rcu_spawn_tasks_trace_kthread(void) if (rcu_tasks_trace.init_fract <= 0) rcu_tasks_trace.init_fract = 1; } + if (rcu_tasks_trace_lazy_ms >= 0) + rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms); rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; @@ -1830,6 +1922,12 @@ void show_rcu_tasks_trace_gp_kthread(void) EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) +struct task_struct *get_rcu_tasks_trace_gp_kthread(void) +{ + return rcu_tasks_trace.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); + #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1449cb69a0e0..cb1caefa8bd0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -632,7 +632,7 @@ void __rcu_irq_enter_check_tick(void) // prevents self-deadlock. So we can safely recheck under the lock. // Note that the nohz_full state currently cannot change. raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { // A nohz_full CPU is in the kernel and RCU needs a // quiescent state. Turn on the tick! WRITE_ONCE(rdp->rcu_forced_tick, true); @@ -677,12 +677,16 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } /** - * rcu_is_watching - see if RCU thinks that the current CPU is not idle + * rcu_is_watching - RCU read-side critical sections permitted on current CPU? * - * Return true if RCU is watching the running CPU, which means that this - * CPU can safely enter RCU read-side critical sections. In other words, - * if the current CPU is not in its idle loop or is in an interrupt or - * NMI handler, return true. 
+ * Return @true if RCU is watching the running CPU and @false otherwise. + * An @true return means that this CPU can safely enter RCU read-side + * critical sections. + * + * Although calls to rcu_is_watching() from most parts of the kernel + * will return @true, there are important exceptions. For example, if the + * current CPU is deep within its idle loop, in kernel entry/exit code, + * or offline, rcu_is_watching() will return @false. * * Make notrace because it can be called by the internal functions of * ftrace, and making this notrace removes unnecessary recursion calls. diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 43229d2b0c44..5598212d1f27 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -77,9 +77,9 @@ __setup("rcu_nocbs", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { rcu_nocb_poll = true; - return 0; + return 1; } -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +__setup("rcu_nocb_poll", parse_rcu_nocb_poll); /* * Don't bother bypassing ->cblist if the call_rcu() rate is low. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b10b8349bb2a..6f06dc12904a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -1035,7 +1035,7 @@ static bool sysrq_rcu; module_param(sysrq_rcu, bool, 0444); /* Dump grace-period-request information due to commandeered sysrq. 
*/ -static void sysrq_show_rcu(int key) +static void sysrq_show_rcu(u8 key) { show_rcu_gp_kthreads(); } diff --git a/kernel/relay.c b/kernel/relay.c index a80fa01042e9..83fe0325cde1 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -375,7 +375,7 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, */ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) { - struct rchan_buf *buf = NULL; + struct rchan_buf *buf; struct dentry *dentry; if (chan->is_global) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5d113aa59e77..59032aaccd18 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -171,7 +171,8 @@ static void scf_torture_stats_print(void) scfs.n_all_wait += scf_stats_p[i].n_all_wait; } if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || - atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) + atomic_read(&n_mb_out_errs) || + (!IS_ENABLED(CONFIG_KASAN) && atomic_read(&n_alloc_errs))) bangstr = "!!! "; pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ", SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched, @@ -312,6 +313,7 @@ static void scf_handler_1(void *scfc_in) // Randomly do an smp_call_function*() invocation. 
static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) { + bool allocfail = false; uintptr_t cpu; int ret = 0; struct scf_check *scfcp = NULL; @@ -323,8 +325,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); - if (WARN_ON_ONCE(!scfcp)) { + if (!scfcp) { + WARN_ON_ONCE(!IS_ENABLED(CONFIG_KASAN)); atomic_inc(&n_alloc_errs); + allocfail = true; } else { scfcp->scfc_cpu = -1; scfcp->scfc_wait = scfsp->scfs_wait; @@ -431,7 +435,9 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra cpus_read_unlock(); else preempt_enable(); - if (!(torture_random(trsp) & 0xfff)) + if (allocfail) + schedule_timeout_idle((1 + longwait) * HZ); // Let no-wait handlers complete. + else if (!(torture_random(trsp) & 0xfff)) schedule_timeout_uninterruptible(1); } diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index d57a5c1c1cd9..3561ab533dd4 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,23 @@ * Waiting for completion is a typically sync point, but not an exclusion point. 
*/ +static void complete_with_flags(struct completion *x, int wake_flags) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&x->wait.lock, flags); + + if (x->done != UINT_MAX) + x->done++; + swake_up_locked(&x->wait, wake_flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); +} + +void complete_on_current_cpu(struct completion *x) +{ + return complete_with_flags(x, WF_CURRENT_CPU); +} + /** * complete: - signals a single thread waiting on this completion * @x: holds the state of this particular completion @@ -27,14 +44,7 @@ */ void complete(struct completion *x) { - unsigned long flags; - - raw_spin_lock_irqsave(&x->wait.lock, flags); - - if (x->done != UINT_MAX) - x->done++; - swake_up_locked(&x->wait); - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + complete_with_flags(x, 0); } EXPORT_SYMBOL(complete); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c52c2eba7c73..2299a5cfbfb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1097,25 +1097,22 @@ int get_nohz_timer_target(void) hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); - rcu_read_lock(); + guard(rcu)(); + for_each_domain(cpu, sd) { for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; - if (!idle_cpu(i)) { - cpu = i; - goto unlock; - } + if (!idle_cpu(i)) + return i; } } if (default_cpu == -1) default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); - cpu = default_cpu; -unlock: - rcu_read_unlock(); - return cpu; + + return default_cpu; } /* @@ -1194,6 +1191,20 @@ static void nohz_csd_func(void *info) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL +static inline bool __need_bw_check(struct rq *rq, struct task_struct *p) +{ + if (rq->nr_running != 1) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + if (!task_on_rq_queued(p)) + return false; + + return true; +} + bool sched_can_stop_tick(struct rq *rq) { int fifo_nr_running; @@ -1229,6 +1240,18 @@ bool sched_can_stop_tick(struct rq *rq) if (rq->nr_running > 1) 
return false; + /* + * If there is one task and it has CFS runtime bandwidth constraints + * and it's on the cpu now we don't want to stop the tick. + * This check prevents clearing the bit if a newly enqueued task here is + * dequeued by migrating while the constrained task continues to run. + * E.g. going from 2->1 without going through pick_next_task(). + */ + if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { + if (cfs_task_bw_constrained(rq->curr)) + return false; + } + return true; } #endif /* CONFIG_NO_HZ_FULL */ @@ -1804,7 +1827,8 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, int old_min, old_max, old_min_rt; int result; - mutex_lock(&uclamp_mutex); + guard(mutex)(&uclamp_mutex); + old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; old_min_rt = sysctl_sched_uclamp_util_min_rt_default; @@ -1813,7 +1837,7 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (result) goto undo; if (!write) - goto done; + return 0; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || @@ -1849,16 +1873,12 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, * Otherwise, keep it simple and do just a lazy update at each next * task enqueue time. 
*/ - - goto done; + return 0; undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; sysctl_sched_uclamp_util_min_rt_default = old_min_rt; -done: - mutex_unlock(&uclamp_mutex); - return result; } #endif @@ -3413,7 +3433,6 @@ static int migrate_swap_stop(void *data) { struct migration_swap_arg *arg = data; struct rq *src_rq, *dst_rq; - int ret = -EAGAIN; if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) return -EAGAIN; @@ -3421,33 +3440,25 @@ static int migrate_swap_stop(void *data) src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); - double_raw_lock(&arg->src_task->pi_lock, - &arg->dst_task->pi_lock); - double_rq_lock(src_rq, dst_rq); + guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); + guard(double_rq_lock)(src_rq, dst_rq); if (task_cpu(arg->dst_task) != arg->dst_cpu) - goto unlock; + return -EAGAIN; if (task_cpu(arg->src_task) != arg->src_cpu) - goto unlock; + return -EAGAIN; if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) - goto unlock; + return -EAGAIN; if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) - goto unlock; + return -EAGAIN; __migrate_swap_task(arg->src_task, arg->dst_cpu); __migrate_swap_task(arg->dst_task, arg->src_cpu); - ret = 0; - -unlock: - double_rq_unlock(src_rq, dst_rq); - raw_spin_unlock(&arg->dst_task->pi_lock); - raw_spin_unlock(&arg->src_task->pi_lock); - - return ret; + return 0; } /* @@ -3722,14 +3733,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) struct sched_domain *sd; __schedstat_inc(p->stats.nr_wakeups_remote); - rcu_read_lock(); + + guard(rcu)(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { __schedstat_inc(sd->ttwu_wake_remote); break; } } - rcu_read_unlock(); } if (wake_flags & WF_MIGRATED) @@ -3928,21 +3939,13 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags 
rf; - - rcu_read_lock(); - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; - - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - resched_curr(rq); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - -out: - rcu_read_unlock(); + guard(rcu)(); + if (is_idle_task(rcu_dereference(rq->curr))) { + guard(rq_lock_irqsave)(rq); + if (is_idle_task(rq->curr)) + resched_curr(rq); + } } bool cpus_share_cache(int this_cpu, int that_cpu) @@ -4193,13 +4196,11 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - unsigned long flags; + guard(preempt)(); int cpu, success = 0; - preempt_disable(); if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4226,129 +4227,127 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with smp_store_mb() * in set_current_state() that the waiting thread does. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - smp_mb__after_spinlock(); - if (!ttwu_state_match(p, state, &success)) - goto unlock; + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + smp_mb__after_spinlock(); + if (!ttwu_state_match(p, state, &success)) + break; - trace_sched_waking(p); + trace_sched_waking(p); - /* - * Ensure we load p->on_rq _after_ p->state, otherwise it would - * be possible to, falsely, observe p->on_rq == 0 and get stuck - * in smp_cond_load_acquire() below. 
- * - * sched_ttwu_pending() try_to_wake_up() - * STORE p->on_rq = 1 LOAD p->state - * UNLOCK rq->lock - * - * __schedule() (switch to task 'p') - * LOCK rq->lock smp_rmb(); - * smp_mb__after_spinlock(); - * UNLOCK rq->lock - * - * [task p] - * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq - * - * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in - * __schedule(). See the comment for smp_mb__after_spinlock(). - * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). - */ - smp_rmb(); - if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) - goto unlock; + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * STORE p->on_rq = 1 LOAD p->state + * UNLOCK rq->lock + * + * __schedule() (switch to task 'p') + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * UNLOCK rq->lock + * + * [task p] + * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + */ + smp_rmb(); + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) + break; #ifdef CONFIG_SMP - /* - * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be - * possible to, falsely, observe p->on_cpu == 0. - * - * One must be running (->on_cpu == 1) in order to remove oneself - * from the runqueue. - * - * __schedule() (switch to task 'p') try_to_wake_up() - * STORE p->on_cpu = 1 LOAD p->on_rq - * UNLOCK rq->lock - * - * __schedule() (put 'p' to sleep) - * LOCK rq->lock smp_rmb(); - * smp_mb__after_spinlock(); - * STORE p->on_rq = 0 LOAD p->on_cpu - * - * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in - * __schedule(). See the comment for smp_mb__after_spinlock(). 
- * - * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer - * care about it's own p->state. See the comment in __schedule(). - */ - smp_acquire__after_ctrl_dep(); + /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * __schedule() (switch to task 'p') try_to_wake_up() + * STORE p->on_cpu = 1 LOAD p->on_rq + * UNLOCK rq->lock + * + * __schedule() (put 'p' to sleep) + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * STORE p->on_rq = 0 LOAD p->on_cpu + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). + */ + smp_acquire__after_ctrl_dep(); - /* - * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq - * == 0), which means we need to do an enqueue, change p->state to - * TASK_WAKING such that we can unlock p->pi_lock before doing the - * enqueue, such as ttwu_queue_wakelist(). - */ - WRITE_ONCE(p->__state, TASK_WAKING); + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + WRITE_ONCE(p->__state, TASK_WAKING); - /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, considering queueing p on the remote CPUs wake_list - * which potentially sends an IPI instead of spinning on p->on_cpu to - * let the waker make forward progress. 
This is safe because IRQs are - * disabled and the IPI will deliver after on_cpu is cleared. - * - * Ensure we load task_cpu(p) after p->on_cpu: - * - * set_task_cpu(p, cpu); - * STORE p->cpu = @cpu - * __schedule() (switch to task 'p') - * LOCK rq->lock - * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) - * STORE p->on_cpu = 1 LOAD p->cpu - * - * to ensure we observe the correct CPU on which the task is currently - * scheduling. - */ - if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) - goto unlock; + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. + */ + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) + break; - /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, wait until it's done referencing the task. - * - * Pairs with the smp_store_release() in finish_task(). - * - * This ensures that tasks getting woken will be fully ordered against - * their previous state and preserve Program Order. - */ - smp_cond_load_acquire(&p->on_cpu, !VAL); + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, wait until it's done referencing the task. + * + * Pairs with the smp_store_release() in finish_task(). 
+ * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); - if (task_cpu(p) != cpu) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } - wake_flags |= WF_MIGRATED; - psi_ttwu_dequeue(p); - set_task_cpu(p, cpu); - } + wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); + set_task_cpu(p, cpu); + } #else - cpu = task_cpu(p); + cpu = task_cpu(p); #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu, wake_flags); -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + ttwu_queue(p, cpu, wake_flags); + } out: if (success) ttwu_stat(p, task_cpu(p), wake_flags); - preempt_enable(); return success; } @@ -4501,6 +4500,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + p->se.vlag = 0; + p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -5496,23 +5497,20 @@ unsigned int nr_iowait(void) void sched_exec(void) { struct task_struct *p = current; - unsigned long flags; + struct migration_arg arg; int dest_cpu; - raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); - if (dest_cpu == smp_processor_id()) - goto unlock; + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); + if (dest_cpu == smp_processor_id()) + return; - if (likely(cpu_active(dest_cpu))) { - struct migration_arg arg = { p, dest_cpu }; + if (unlikely(!cpu_active(dest_cpu))) + return; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - 
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - return; + arg = (struct migration_arg){ p, dest_cpu }; } -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } #endif @@ -5722,9 +5720,6 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); - struct task_struct *curr; - struct rq_flags rf; - u64 delta; int os; /* @@ -5734,30 +5729,26 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!tick_nohz_tick_stopped_cpu(cpu)) - goto out_requeue; + if (tick_nohz_tick_stopped_cpu(cpu)) { + guard(rq_lock_irq)(rq); + struct task_struct *curr = rq->curr; - rq_lock_irq(rq, &rf); - curr = rq->curr; - if (cpu_is_offline(cpu)) - goto out_unlock; + if (cpu_online(cpu)) { + update_rq_clock(rq); - update_rq_clock(rq); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a + * reasonable amount of time. + */ + u64 delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } + curr->sched_class->task_tick(rq, curr, 0); - if (!is_idle_task(curr)) { - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + calc_load_nohz_remote(rq); + } } - curr->sched_class->task_tick(rq, curr, 0); - - calc_load_nohz_remote(rq); -out_unlock: - rq_unlock_irq(rq, &rf); -out_requeue: /* * Run the remote tick once per second (1Hz). 
This arbitrary @@ -6306,19 +6297,19 @@ static bool try_steal_cookie(int this, int that) unsigned long cookie; bool success = false; - local_irq_disable(); - double_rq_lock(dst, src); + guard(irq)(); + guard(double_rq_lock)(dst, src); cookie = dst->core->core_cookie; if (!cookie) - goto unlock; + return false; if (dst->curr != dst->idle) - goto unlock; + return false; p = sched_core_find(src, cookie); if (!p) - goto unlock; + return false; do { if (p == src->core_pick || p == src->curr) @@ -6330,9 +6321,10 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; /* - * sched_core_find() and sched_core_next() will ensure that task @p - * is not throttled now, we also need to check whether the runqueue - * of the destination CPU is being throttled. + * sched_core_find() and sched_core_next() will ensure + * that task @p is not throttled now, we also need to + * check whether the runqueue of the destination CPU is + * being throttled. 
*/ if (sched_task_is_throttled(p, this)) goto next; @@ -6350,10 +6342,6 @@ next: p = sched_core_next(p, cookie); } while (p); -unlock: - double_rq_unlock(dst, src); - local_irq_enable(); - return success; } @@ -6411,20 +6399,24 @@ static void queue_core_balance(struct rq *rq) queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); } +DEFINE_LOCK_GUARD_1(core_lock, int, + sched_core_lock(*_T->lock, &_T->flags), + sched_core_unlock(*_T->lock, &_T->flags), + unsigned long flags) + static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; - unsigned long flags; int t; - sched_core_lock(cpu, &flags); + guard(core_lock)(&cpu); WARN_ON_ONCE(rq->core != rq); /* if we're the first, we'll be our own leader */ if (cpumask_weight(smt_mask) == 1) - goto unlock; + return; /* find the leader */ for_each_cpu(t, smt_mask) { @@ -6438,7 +6430,7 @@ static void sched_core_cpu_starting(unsigned int cpu) } if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ - goto unlock; + return; /* install and validate core_rq */ for_each_cpu(t, smt_mask) { @@ -6449,29 +6441,25 @@ static void sched_core_cpu_starting(unsigned int cpu) WARN_ON_ONCE(rq->core != core_rq); } - -unlock: - sched_core_unlock(cpu, &flags); } static void sched_core_cpu_deactivate(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; - unsigned long flags; int t; - sched_core_lock(cpu, &flags); + guard(core_lock)(&cpu); /* if we're the last man standing, nothing to do */ if (cpumask_weight(smt_mask) == 1) { WARN_ON_ONCE(rq->core != rq); - goto unlock; + return; } /* if we're not the leader, nothing to do */ if (rq->core != rq) - goto unlock; + return; /* find a new leader */ for_each_cpu(t, smt_mask) { @@ -6482,7 +6470,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu) } if (WARN_ON_ONCE(!core_rq)) /* impossible */ - goto unlock; + return; 
/* copy the shared state to the new leader */ core_rq->core_task_seq = rq->core_task_seq; @@ -6504,9 +6492,6 @@ static void sched_core_cpu_deactivate(unsigned int cpu) rq = cpu_rq(t); rq->core = core_rq; } - -unlock: - sched_core_unlock(cpu, &flags); } static inline void sched_core_cpu_dying(unsigned int cpu) @@ -7030,7 +7015,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -7383,6 +7368,19 @@ struct task_struct *idle_task(int cpu) return cpu_rq(cpu)->idle; } +#ifdef CONFIG_SCHED_CORE +int sched_core_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (sched_core_enabled(rq) && rq->curr == rq->idle) + return 1; + + return idle_cpu(cpu); +} + +#endif + #ifdef CONFIG_SMP /* * This function computes an effective utilization for the given CPU, to be @@ -9940,7 +9938,7 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -11074,11 +11072,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) /* * Ensure max(child_quota) <= parent_quota. On cgroup2, - * always take the min. On cgroup1, only inherit when no - * limit is set: + * always take the non-RUNTIME_INF min. On cgroup1, only + * inherit when no limit is set. In both cases this is used + * by the scheduler to determine if a given CFS task has a + * bandwidth constraint at some higher level. 
*/ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { - quota = min(quota, parent_quota); + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF) + quota = min(quota, parent_quota); } else { if (quota == RUNTIME_INF) quota = parent_quota; @@ -11139,6 +11142,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) return 0; } + +static u64 throttled_time_self(struct task_group *tg) +{ + int i; + u64 total = 0; + + for_each_possible_cpu(i) { + total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); + } + + return total; +} + +static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg)); + + return 0; +} #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -11215,6 +11239,10 @@ static struct cftype cpu_legacy_files[] = { .name = "stat", .seq_show = cpu_cfs_stat_show, }, + { + .name = "stat.local", + .seq_show = cpu_cfs_local_stat_show, + }, #endif #ifdef CONFIG_RT_GROUP_SCHED { @@ -11271,6 +11299,24 @@ static int cpu_extra_stat_show(struct seq_file *sf, return 0; } +static int cpu_local_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) +{ +#ifdef CONFIG_CFS_BANDWIDTH + { + struct task_group *tg = css_tg(css); + u64 throttled_self_usec; + + throttled_self_usec = throttled_time_self(tg); + do_div(throttled_self_usec, NSEC_PER_USEC); + + seq_printf(sf, "throttled_usec %llu\n", + throttled_self_usec); + } +#endif + return 0; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -11449,6 +11495,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, + .css_local_stat_show = cpu_local_stat_show, #ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, #endif diff --git a/kernel/sched/debug.c 
b/kernel/sched/debug.c index 066ff1c8ae4e..4c3d0d9f3db6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -347,10 +347,7 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif - debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); - debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); - debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); - debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); @@ -427,6 +424,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) #undef SDM debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); + debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); } void update_sched_domain_debugfs(void) @@ -581,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), &p->se) ? 
'E' : 'N', + SPLIT_NS(p->se.deadline), + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), (long long)(p->nvcsw + p->nivcsw), p->prio); @@ -626,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; + s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; + struct sched_entity *last, *first; struct rq *rq = cpu_rq(cpu); - struct sched_entity *last; unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -643,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); - if (rb_first_cached(&cfs_rq->tasks_timeline)) - MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; + first = __pick_first_entity(cfs_rq); + if (first) + left_vruntime = first->vruntime; last = __pick_last_entity(cfs_rq); if (last) - max_vruntime = last->vruntime; + right_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", - SPLIT_NS(MIN_vruntime)); + + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", + SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", SPLIT_NS(min_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", - SPLIT_NS(max_vruntime)); - spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", - SPLIT_NS(spread)); - spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", - SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", + SPLIT_NS(avg_vruntime(cfs_rq))); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", + SPLIT_NS(right_vruntime)); + spread = right_vruntime - left_vruntime; + 
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); @@ -863,10 +863,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - PN(sysctl_sched_latency); - PN(sysctl_sched_min_granularity); - PN(sysctl_sched_idle_min_granularity); - PN(sysctl_sched_wakeup_granularity); + PN(sysctl_sched_base_slice); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3e25be58e2b..cb225921bbca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ #include <linux/psi.h> #include <linux/ratelimit.h> #include <linux/task_work.h> +#include <linux/rbtree_augmented.h> #include <asm/switch_to.h> @@ -57,22 +58,6 @@ #include "autogroup.h" /* - * Targeted preemption latency for CPU-bound tasks: - * - * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length - * and have no persistent notion like in traditional, time-slice - * based scheduling concepts. 
- * - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches (cs) field) - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_latency = 6000000ULL; -static unsigned int normalized_sysctl_sched_latency = 6000000ULL; - -/* * The initial- and re-scaling of tunables is configurable * * Options are: @@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; - -/* - * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. - * Applies only when SCHED_IDLE tasks compete with normal tasks. - * - * (default: 0.75 msec) - */ -unsigned int sysctl_sched_idle_min_granularity = 750000ULL; - -/* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -static unsigned int sched_nr_latency = 8; +unsigned int sysctl_sched_base_slice = 750000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; /* * After fork, child runs first. If set to 0 (default) then @@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8; */ unsigned int sysctl_sched_child_runs_first __read_mostly; -/* - * SCHED_OTHER wake-up granularity. - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. 
- * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; int sched_thermal_decay_shift; @@ -277,9 +237,7 @@ static void update_sysctl(void) #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } @@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +/* + * delta /= w + */ +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) +{ + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); + + return delta; +} const struct sched_class fair_sched_class; @@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return (s64)(se->vruntime - cfs_rq->min_vruntime); +} + #define __node_2_se(node) \ rb_entry((node), struct sched_entity, run_node) +/* + * Compute virtual time from the per-task service numbers: + * + * Fair schedulers conserve lag: + * + * \Sum lag_i = 0 + * + * Where lag_i is given by: + * + * lag_i = S - s_i = w_i * (V - v_i) + * + * Where S is the ideal service time and V is its virtual time counterpart.
+ * Therefore: + * + * \Sum lag_i = 0 + * \Sum w_i * (V - v_i) = 0 + * \Sum w_i * V - w_i * v_i = 0 + * + * From which we can solve an expression for V in v_i (which we have in + * se->vruntime): + * + * \Sum v_i * w_i \Sum v_i * w_i + * V = -------------- = -------------- + * \Sum w_i W + * + * Specifically, this is the weighted average of all entity virtual runtimes. + * + * [[ NOTE: this is only equal to the ideal scheduler under the condition + * that join/leave operations happen at lag_i = 0, otherwise the + * virtual time has non-contiguous motion equivalent to: + * + * V +-= lag_i / W + * + * Also see the comment in place_entity() that deals with this. ]] + * + * However, since v_i is u64, and the multiplication could easily overflow + * transform it into a relative form that uses smaller quantities: + * + * Substitute: v_i == (v_i - v0) + v0 + * + * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i + * V = ---------------------------- = --------------------- + v0 + * W W + * + * Which we track using: + * + * v0 := cfs_rq->min_vruntime + * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime + * \Sum w_i := cfs_rq->avg_load + * + * Since min_vruntime is a monotonic increasing variable that closely tracks + * the per-task service, these deltas: (v_i - v), will be in the order of the + * maximal (virtual) lag induced in the system due to quantisation. + * + * Also, we use scale_load_down() to reduce the size. + * + * As measured, the max (key * weight) value was ~44 bits for a kernel build.
+ */ +static void +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime += key * weight; + cfs_rq->avg_load += weight; +} + +static void +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime -= key * weight; + cfs_rq->avg_load -= weight; +} + +static inline +void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +{ + /* + * v' = v + d ==> avg_vruntime' = avg_vruntime - d*avg_load + */ + cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; +} + +u64 avg_vruntime(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { + unsigned long weight = scale_load_down(curr->load.weight); + + avg += entity_key(cfs_rq, curr) * weight; + load += weight; + } + + if (load) + avg = div_s64(avg, load); + + return cfs_rq->min_vruntime + avg; +} + +/* + * lag_i = S - s_i = w_i * (V - v_i) + * + * However, since V is approximated by the weighted average of all entities it + * is possible -- by addition/removal/reweight to the tree -- to move V around + * and end up with a larger lag than we started with. + * + * Limit this to either double the slice length with a minimum of TICK_NSEC + * since that is the timing granularity. + * + * EEVDF gives the following limit for a steady state system: + * + * -r_max < lag < max(r_max, q) + * + * XXX could add max_slice to the augmented data to track this.
+ */ +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 lag, limit; + + SCHED_WARN_ON(!se->on_rq); + lag = avg_vruntime(cfs_rq) - se->vruntime; + + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + se->vlag = clamp(lag, -limit, limit); +} + +/* + * Entity is eligible once it received less service than it ought to have, + * eg. lag >= 0. + * + * lag_i = S - s_i = w_i*(V - v_i) + * + * lag_i >= 0 -> V >= v_i + * + * \Sum (v_i - v)*w_i + * V = ------------------ + v + * \Sum w_i + * + * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) + * + * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due + * to the loss in precision caused by the division. + */ +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { + unsigned long weight = scale_load_down(curr->load.weight); + + avg += entity_key(cfs_rq, curr) * weight; + load += weight; + } + + return avg >= entity_key(cfs_rq, se) * load; +} + +static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +{ + u64 min_vruntime = cfs_rq->min_vruntime; + /* + * open coded max_vruntime() to allow updating avg_vruntime + */ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) { + avg_vruntime_update(cfs_rq, delta); + min_vruntime = vruntime; + } + return min_vruntime; +} + static void update_min_vruntime(struct cfs_rq *cfs_rq) { + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; - struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); u64 vruntime = cfs_rq->min_vruntime; @@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (leftmost) { /* non-empty tree */ - struct sched_entity *se = __node_2_se(leftmost); - + if (se) { if (!curr) vruntime = se->vruntime; else @@ -629,7 +780,7 @@ static
void update_min_vruntime(struct cfs_rq *cfs_rq) /* ensure we never gain time by being placed backwards. */ u64_u32_store(cfs_rq->min_vruntime, - max_vruntime(cfs_rq->min_vruntime, vruntime)); + __update_min_vruntime(cfs_rq, vruntime)); } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) @@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } +#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) + +static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (deadline_gt(min_deadline, se, rse)) + se->min_deadline = rse->min_deadline; + } +} + +/* + * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) + */ +static inline bool min_deadline_update(struct sched_entity *se, bool exit) +{ + u64 old_min_deadline = se->min_deadline; + struct rb_node *node = &se->run_node; + + se->min_deadline = se->deadline; + __update_min_deadline(se, node->rb_right); + __update_min_deadline(se, node->rb_left); + + return se->min_deadline == old_min_deadline; +} + +RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, + run_node, min_deadline, min_deadline_update); + /* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); + avg_vruntime_add(cfs_rq, se); + se->min_deadline = se->deadline; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_deadline_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + &min_deadline_cb); + avg_vruntime_sub(cfs_rq, se); } struct sched_entity 
*__pick_first_entity(struct cfs_rq *cfs_rq) @@ -660,14 +845,88 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) return __node_2_se(left); } -static struct sched_entity *__pick_next_entity(struct sched_entity *se) +/* + * Earliest Eligible Virtual Deadline First + * + * In order to provide latency guarantees for different request sizes + * EEVDF selects the best runnable task from two criteria: + * + * 1) the task must be eligible (must be owed service) + * + * 2) from those tasks that meet 1), we select the one + * with the earliest virtual deadline. + * + * We can do this in O(log n) time due to an augmented RB-tree. The + * tree keeps the entries sorted on service, but also functions as a + * heap based on the deadline by keeping: + * + * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) + * + * Which allows an EDF like search on (sub)trees. + */ +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { - struct rb_node *next = rb_next(&se->run_node); + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *curr = cfs_rq->curr; + struct sched_entity *best = NULL; - if (!next) - return NULL; + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) + curr = NULL; + + /* + * Once selected, run a task until it either becomes non-eligible or + * until it gets a new slice. See the HACK in set_next_entity(). + */ + if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + return curr; + + while (node) { + struct sched_entity *se = __node_2_se(node); + + /* + * If this entity is not eligible, try the left subtree. + */ + if (!entity_eligible(cfs_rq, se)) { + node = node->rb_left; + continue; + } + + /* + * If this entity has an earlier deadline than the previous + * best, take this one. If it also has the earliest deadline + * of its subtree, we're done. 
+ */ + if (!best || deadline_gt(deadline, best, se)) { + best = se; + if (best->deadline == best->min_deadline) + break; + } + + /* + * If the earliest deadline in this subtree is in the fully + * eligible left half of our space, go there. + */ + if (node->rb_left && + __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { + node = node->rb_left; + continue; + } - return __node_2_se(next); + node = node->rb_right; + } + + if (!best || (curr && deadline_gt(deadline, best, curr))) + best = curr; + + if (unlikely(!best)) { + struct sched_entity *left = __pick_first_entity(cfs_rq); + if (left) { + pr_err("EEVDF scheduling fail, picking leftmost\n"); + return left; + } + } + + return best; } #ifdef CONFIG_SCHED_DEBUG @@ -684,109 +943,51 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ - +#ifdef CONFIG_SMP int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, - sysctl_sched_min_granularity); - #define WRT_SYSCTL(name) \ (normalized_sysctl_##name = sysctl_##name / (factor)) - WRT_SYSCTL(sched_min_granularity); - WRT_SYSCTL(sched_latency); - WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_base_slice); #undef WRT_SYSCTL return 0; } #endif +#endif -/* - * delta /= w - */ -static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) -{ - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = __calc_delta(delta, NICE_0_LOAD, &se->load); - - return delta; -} - -/* - * The idea is to set a period in which each task runs once. - * - * When there are too many tasks (sched_nr_latency) we have to stretch - * this period because otherwise the slices get too small. - * - * p = (nr <= nl) ?
l : l*nr/nl - */ -static u64 __sched_period(unsigned long nr_running) -{ - if (unlikely(nr_running > sched_nr_latency)) - return nr_running * sysctl_sched_min_granularity; - else - return sysctl_sched_latency; -} - -static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); /* - * We calculate the wall-time slice from the period by taking a part - * proportional to the weight. - * - * s = p*P[w/rw] + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i + * this is probably good enough. */ -static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned int nr_running = cfs_rq->nr_running; - struct sched_entity *init_se = se; - unsigned int min_gran; - u64 slice; - - if (sched_feat(ALT_PERIOD)) - nr_running = rq_of(cfs_rq)->cfs.h_nr_running; - - slice = __sched_period(nr_running + !se->on_rq); - - for_each_sched_entity(se) { - struct load_weight *load; - struct load_weight lw; - struct cfs_rq *qcfs_rq; - - qcfs_rq = cfs_rq_of(se); - load = &qcfs_rq->load; - - if (unlikely(!se->on_rq)) { - lw = qcfs_rq->load; + if ((s64)(se->vruntime - se->deadline) < 0) + return; - update_load_add(&lw, se->load.weight); - load = &lw; - } - slice = __calc_delta(slice, se->load.weight, load); - } + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. + */ + se->slice = sysctl_sched_base_slice; - if (sched_feat(BASE_SLICE)) { - if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) - min_gran = sysctl_sched_idle_min_granularity; - else - min_gran = sysctl_sched_min_granularity; + /* + * EEVDF: vd_i = ve_i + r_i / w_i + */ + se->deadline = se->vruntime + calc_delta_fair(se->slice, se); - slice = max_t(u64, slice, min_gran); + /* + * The task has consumed its request, reschedule. 
+ */ + if (cfs_rq->nr_running > 1) { + resched_curr(rq_of(cfs_rq)); + clear_buddies(cfs_rq, se); } - - return slice; -} - -/* - * We calculate the vruntime slice of a to-be-inserted task. - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); } #include "pelt.h" @@ -921,6 +1122,7 @@ static void update_curr(struct cfs_rq *cfs_rq) schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); + update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { @@ -3375,16 +3577,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { + unsigned long old_weight = se->load.weight; + if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); + else + avg_vruntime_sub(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); update_load_set(&se->load, weight); + if (!se->on_rq) { + /* + * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), + * we need to scale se->vlag when w_i changes. + */ + se->vlag = div_s64(se->vlag * old_weight, weight); + } else { + s64 deadline = se->deadline - se->vruntime; + /* + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. 
+ */ + deadline = div_s64(deadline * old_weight, weight); + se->deadline = se->vruntime + deadline; + } + #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); @@ -3394,9 +3616,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) + if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - + if (cfs_rq->curr != se) + avg_vruntime_add(cfs_rq, se); + } } void reweight_task(struct task_struct *p, int prio) @@ -4692,159 +4916,125 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ -static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; - - if (d < 0) - d = -d; - - if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq->nr_spread_over); -#endif -} - -static inline bool entity_is_long_sleeper(struct sched_entity *se) +static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - u64 sleep_time; + u64 vslice = calc_delta_fair(se->slice, se); + u64 vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; - if (se->exec_start == 0) - return false; - - cfs_rq = cfs_rq_of(se); - - sleep_time = rq_clock_task(rq_of(cfs_rq)); + /* + * Due to how V is constructed as the weighted average of entities, + * adding tasks with positive lag, or removing tasks with negative lag + * will move 'time' backwards, this can screw around with the lag of + * other tasks. 
+ * + * EEVDF: placement strategy #1 / #2 + */ + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; - /* Happen while migrating because of clock task divergence */ - if (sleep_time <= se->exec_start) - return false; + lag = se->vlag; - sleep_time -= se->exec_start; - if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) - return true; + /* + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted + * average and compensate for this, otherwise lag can quickly + * evaporate. + * + * Lag is defined as: + * + * lag_i = S - s_i = w_i * (V - v_i) + * + * To avoid the 'w_i' term all over the place, we only track + * the virtual lag: + * + * vl_i = V - v_i <=> v_i = V - vl_i + * + * And we take V to be the weighted average of all v: + * + * V = (\Sum w_j*v_j) / W + * + * Where W is: \Sum w_j + * + * Then, the weighted average after adding an entity with lag + * vl_i is given by: + * + * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) + * = (W*V + w_i*(V - vl_i)) / (W + w_i) + * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) + * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) + * = V - w_i*vl_i / (W + w_i) + * + * And the actual lag after adding an entity with vl_i is: + * + * vl'_i = V' - v_i + * = V - w_i*vl_i / (W + w_i) - (V - vl_i) + * = vl_i - w_i*vl_i / (W + w_i) + * + * Which is strictly less than vl_i. So in order to preserve lag + * we should inflate the lag before placement such that the + * effective lag after placement comes out right. + * + * As such, invert the above relation for vl'_i to get the vl_i + * we need to use such that the lag after placement is the lag + * we computed before dequeue. 
+ * + * vl'_i = vl_i - w_i*vl_i / (W + w_i) + * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) + * + * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i + * = W*vl_i + * + * vl_i = (W + w_i)*vl'_i / W + */ + load = cfs_rq->avg_load; + if (curr && curr->on_rq) + load += scale_load_down(curr->load.weight); - return false; -} + lag *= load + scale_load_down(se->load.weight); + if (WARN_ON_ONCE(!load)) + load = 1; + lag = div_s64(lag, load); + } -static void -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -{ - u64 vruntime = cfs_rq->min_vruntime; + se->vruntime = vruntime - lag; /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks + * off with half a slice to ease into the competition. */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); - - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh; - - if (se_is_idle(se)) - thresh = sysctl_sched_min_granularity; - else - thresh = sysctl_sched_latency; + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) + vslice /= 2; - /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: - */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - - vruntime -= thresh; - } - - /* - * Pull vruntime of the entity being placed to the base level of - * cfs_rq, to prevent boosting it if placed backwards. - * However, min_vruntime can advance much faster than real time, with - * the extreme being when an entity with the minimal weight always runs - * on the cfs_rq. 
If the waking entity slept for a long time, its - * vruntime difference from min_vruntime may overflow s64 and their - * comparison may get inversed, so ignore the entity's original - * vruntime in that case. - * The maximal vruntime speedup is given by the ratio of normal to - * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES. - * When placing a migrated waking entity, its exec_start has been set - * from a different rq. In order to take into account a possible - * divergence between new and prev rq's clocks task because of irq and - * stolen time, we take an additional margin. - * So, cutting off on the sleep time of - * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days - * should be safe. - */ - if (entity_is_long_sleeper(se)) - se->vruntime = vruntime; - else - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ + se->deadline = se->vruntime + vslice; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); -/* - * MIGRATION - * - * dequeue - * update_curr() - * update_min_vruntime() - * vruntime -= min_vruntime - * - * enqueue - * update_curr() - * update_min_vruntime() - * vruntime += min_vruntime - * - * this way the vruntime transition between RQs is done when both - * min_vruntime are up-to-date. - * - * WAKEUP (remote) - * - * ->migrate_task_rq_fair() (p->state == TASK_WAKING) - * vruntime -= min_vruntime - * - * enqueue - * update_curr() - * update_min_vruntime() - * vruntime += min_vruntime - * - * this way we don't have the most up-to-date min_vruntime on the originating - * CPU and an up-to-date min_vruntime on the destination CPU. 
- */ - static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; /* * If we're the current task, we must renormalise before calling * update_curr(). */ - if (renorm && curr) - se->vruntime += cfs_rq->min_vruntime; + if (curr) + place_entity(cfs_rq, se, flags); update_curr(cfs_rq); /* - * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. Being - * placed in the past could significantly boost this task to the - * fairness detriment of existing tasks. - */ - if (renorm && !curr) - se->vruntime += cfs_rq->min_vruntime; - - /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new @@ -4855,37 +5045,46 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); se_update_runnable(se); + /* + * XXX update_load_avg() above will have attached us to the pelt sum; + * but update_cfs_group() here will re-adjust the weight and have to + * undo/redo all that. Seems wasteful. + */ update_cfs_group(se); + + /* + * XXX now that the entity has been re-weighted, and its lag adjusted, + * we can place the entity. 
+ */ + if (!curr) + place_entity(cfs_rq, se, flags); + account_entity_enqueue(cfs_rq, se); - if (flags & ENQUEUE_WAKEUP) - place_entity(cfs_rq, se, 0); /* Entity has migrated, no longer consider this task hot */ if (flags & ENQUEUE_MIGRATED) se->exec_start = 0; check_schedstat_required(); update_stats_enqueue_fair(cfs_rq, se, flags); - check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; if (cfs_rq->nr_running == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) + if (!throttled_hierarchy(cfs_rq)) { list_add_leaf_cfs_rq(cfs_rq); - } -} - -static void __clear_buddies_last(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last != se) - break; + } else { +#ifdef CONFIG_CFS_BANDWIDTH + struct rq *rq = rq_of(cfs_rq); - cfs_rq->last = NULL; + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +#endif + } } } @@ -4900,27 +5099,10 @@ static void __clear_buddies_next(struct sched_entity *se) } } -static void __clear_buddies_skip(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip != se) - break; - - cfs_rq->skip = NULL; - } -} - static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->last == se) - __clear_buddies_last(se); - if (cfs_rq->next == se) __clear_buddies_next(se); - - if (cfs_rq->skip == se) - __clear_buddies_skip(se); } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -4954,20 +5136,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); + update_entity_lag(cfs_rq, se); if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; account_entity_dequeue(cfs_rq, se); - /* - * Normalize after update_curr(); which will also have moved - * 
min_vruntime if @se is the one holding it back. But before doing - * update_min_vruntime() again, which will discount @se's position and - * can move min_vruntime forward still more. - */ - if (!(flags & DEQUEUE_SLEEP)) - se->vruntime -= cfs_rq->min_vruntime; - /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); @@ -4986,52 +5160,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_idle_cfs_rq_clock_pelt(cfs_rq); } -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long ideal_runtime, delta_exec; - struct sched_entity *se; - s64 delta; - - /* - * When many tasks blow up the sched_period; it is possible that - * sched_slice() reports unusually large results (when many tasks are - * very light for example). Therefore impose a maximum. - */ - ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); - - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); - return; - } - - /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. 
- */ - if (delta_exec < sysctl_sched_min_granularity) - return; - - se = __pick_first_entity(cfs_rq); - delta = curr->vruntime - se->vruntime; - - if (delta < 0) - return; - - if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); -} - static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -5047,6 +5175,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); + /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ + se->vlag = se->deadline; } update_stats_curr_start(cfs_rq, se); @@ -5070,9 +5203,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups @@ -5083,50 +5213,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *left = __pick_first_entity(cfs_rq); - struct sched_entity *se; - /* - * If curr is set we have to see if its left of the leftmost entity - * still in the tree, provided there was anything in the tree at all. + * Enabling NEXT_BUDDY will affect latency but not fairness. */ - if (!left || (curr && entity_before(curr, left))) - left = curr; - - se = left; /* ideally we run the leftmost entity */ - - /* - * Avoid running the skip buddy, if running something else can - * be done without getting too unfair. 
- */ - if (cfs_rq->skip && cfs_rq->skip == se) { - struct sched_entity *second; - - if (se == curr) { - second = __pick_first_entity(cfs_rq); - } else { - second = __pick_next_entity(se); - if (!second || (curr && entity_before(curr, second))) - second = curr; - } + if (sched_feat(NEXT_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) + return cfs_rq->next; - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; - } - - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - se = cfs_rq->next; - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - se = cfs_rq->last; - } - - return se; + return pick_eevdf(cfs_rq); } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -5143,8 +5237,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); - if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. 
*/ @@ -5185,9 +5277,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif - - if (cfs_rq->nr_running > 1) - check_preempt_tick(cfs_rq, curr); } @@ -5377,6 +5466,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) /* Add cfs_rq with load or one or more already running entities to the list */ if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); + + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + + cfs_rq->throttled_clock_self = 0; + + if (SCHED_WARN_ON((s64)delta < 0)) + delta = 0; + + cfs_rq->throttled_clock_self_time += delta; + } } return 0; @@ -5391,6 +5491,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) if (!cfs_rq->throttle_count) { cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); + + SCHED_WARN_ON(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_running) + cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -5480,7 +5584,9 @@ done: * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); + SCHED_WARN_ON(cfs_rq->throttled_clock); + if (cfs_rq->nr_running) + cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5498,7 +5604,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + if (cfs_rq->throttled_clock) { + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + cfs_rq->throttled_clock = 0; + } list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); @@ -6014,13 +6123,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; } -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) { raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); cfs_b->burst = 0; + cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); @@ -6157,6 +6267,46 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) rq_clock_stop_loop_update(rq); } +bool cfs_task_bw_constrained(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + if (!cfs_bandwidth_used()) + return false; + + if (cfs_rq->runtime_enabled || + tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF) + return true; + + return false; +} + +#ifdef CONFIG_NO_HZ_FULL +/* called from pick_next_task_fair() */ +static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq); + + if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) + return; + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (rq->nr_running != 1) + return; + + /* + * We know there is only one task runnable and we've just picked it. The + * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will + * be otherwise able to stop the tick. Just need to check if we are using + * bandwidth control. 
+ */ + if (cfs_task_bw_constrained(p)) + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#endif + #else /* CONFIG_CFS_BANDWIDTH */ static inline bool cfs_bandwidth_used(void) @@ -6186,9 +6336,8 @@ static inline int throttled_lb_pair(struct task_group *tg, return 0; } -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} - #ifdef CONFIG_FAIR_GROUP_SCHED +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif @@ -6199,9 +6348,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} - +#ifdef CONFIG_CGROUP_SCHED +bool cfs_task_bw_constrained(struct task_struct *p) +{ + return false; +} +#endif #endif /* CONFIG_CFS_BANDWIDTH */ +#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) +static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} +#endif + /************************************************** * CFS operations on tasks: */ @@ -6210,13 +6368,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); SCHED_WARN_ON(task_rq(p) != rq); if (rq->cfs.h_nr_running > 1) { - u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + u64 slice = se->slice; s64 delta = slice - ran; if (delta < 0) { @@ -6240,8 +6397,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, curr); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -6282,17 
+6438,6 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -/* - * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use - * of idle_nr_running, which does not consider idle descendants of normal - * entities. - */ -static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) -{ - return cfs_rq->nr_running && - cfs_rq->nr_running == cfs_rq->idle_nr_running; -} - #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { @@ -6474,6 +6619,7 @@ dequeue_throttle: /* Working cpumask for: load_balance, load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); +static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); #ifdef CONFIG_NO_HZ_COMMON @@ -7065,7 +7211,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); - for_each_cpu_wrap(cpu, cpus, target + 1) { + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) @@ -7289,9 +7435,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); - if (boost) - util_est = max(util_est, runnable); - /* * During wake-up @p isn't enqueued yet and doesn't contribute * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. 
@@ -7741,6 +7884,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (wake_flags & WF_TTWU) { record_wakee(p); + if ((wake_flags & WF_CURRENT_CPU) && + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) @@ -7798,18 +7945,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { struct sched_entity *se = &p->se; - /* - * As blocked tasks retain absolute vruntime the migration needs to - * deal with this by subtracting the old and adding the new - * min_vruntime -- the latter is done by enqueue_entity() when placing - * the task on the new runqueue. - */ - if (READ_ONCE(p->__state) == TASK_WAKING) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); - } - if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); @@ -7847,66 +7982,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ -static unsigned long wakeup_gran(struct sched_entity *se) -{ - unsigned long gran = sysctl_sched_wakeup_granularity; - - /* - * Since its curr running now, convert the gran from real-time - * to virtual-time in his units. - * - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - return calc_delta_fair(gran, se); -} - -/* - * Should 'se' preempt 'curr'. 
- * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff <= 0) - return -1; - - gran = wakeup_gran(se); - if (vdiff > gran) - return 1; - - return 0; -} - -static void set_last_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) - return; - if (se_is_idle(se)) - return; - cfs_rq_of(se)->last = se; - } -} - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -7918,12 +7993,6 @@ static void set_next_buddy(struct sched_entity *se) } } -static void set_skip_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) - cfs_rq_of(se)->skip = se; -} - /* * Preempt the current task with a newly woken task if needed: */ @@ -7932,7 +8001,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; @@ -7948,7 +8016,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { set_next_buddy(pse); next_buddy_marked = 1; } @@ -7993,35 +8061,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (cse_is_idle != pse_is_idle) return; - update_curr(cfs_rq_of(se)); - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is - * triggering this preemption. 
- */ - if (!next_buddy_marked) - set_next_buddy(pse); + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + + /* + * XXX pick_eevdf(cfs_rq) != se ? + */ + if (pick_eevdf(cfs_rq) == pse) goto preempt; - } return; preempt: resched_curr(rq); - /* - * Only set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. - */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; - - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); } #ifdef CONFIG_SMP @@ -8172,6 +8224,7 @@ done: __maybe_unused; hrtick_start_fair(rq, p); update_misfit_status(p, rq); + sched_fair_update_stop_tick(rq, p); return p; @@ -8222,8 +8275,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple - * - * The magic of dealing with the ->skip buddy is in pick_next_entity. */ static void yield_task_fair(struct rq *rq) { @@ -8239,21 +8290,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); - if (curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - /* - * Tell update_rq_clock() that we've just updated, - * so we don't do microscopic update in schedule() - * and double the fastpath cost. - */ - rq_clock_skip_update(rq); - } + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. 
+ */ + rq_clock_skip_update(rq); - set_skip_buddy(se); + se->deadline += calc_delta_fair(se->slice, se); } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -8416,6 +8465,11 @@ enum group_type { */ group_misfit_task, /* + * Balance SMT group that's fully busy. Can benefit from migration + * a task on SMT with busy sibling to another CPU on idle core. + */ + group_smt_balance, + /* * SD_ASYM_PACKING only: One local CPU with higher capacity is available, * and the task should be migrated to it instead of running on the * current CPU. @@ -8496,8 +8550,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) + (&p->se == cfs_rq_of(&p->se)->next)) return 1; if (sysctl_sched_migration_cost == -1) @@ -9123,6 +9176,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -9396,6 +9450,9 @@ group_type group_classify(unsigned int imbalance_pct, if (sgs->group_asym_packing) return group_asym_packing; + if (sgs->group_smt_balance) + return group_smt_balance; + if (sgs->group_misfit_task_load) return group_misfit_task; @@ -9465,6 +9522,71 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); } +/* One group has more than one SMT CPU while the other group does not */ +static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, + struct sched_group *sg2) +{ + if (!sg1 || !sg2) + return false; + + return (sg1->flags & SD_SHARE_CPUCAPACITY) != + (sg2->flags & 
SD_SHARE_CPUCAPACITY); +} + +static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + if (env->idle == CPU_NOT_IDLE) + return false; + + /* + * For SMT source group, it is better to move a task + * to a CPU that doesn't have multiple tasks sharing its CPU capacity. + * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY + * will not be on. + */ + if (group->flags & SD_SHARE_CPUCAPACITY && + sgs->sum_h_nr_running > 1) + return true; + + return false; +} + +static inline long sibling_imbalance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sg_lb_stats *busiest, + struct sg_lb_stats *local) +{ + int ncores_busiest, ncores_local; + long imbalance; + + if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + return 0; + + ncores_busiest = sds->busiest->cores; + ncores_local = sds->local->cores; + + if (ncores_busiest == ncores_local) { + imbalance = busiest->sum_nr_running; + lsub_positive(&imbalance, local->sum_nr_running); + return imbalance; + } + + /* Balance such that nr_running/ncores ratio are same on both groups */ + imbalance = ncores_local * busiest->sum_nr_running; + lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); + /* Normalize imbalance and do rounding on normalization */ + imbalance = 2 * imbalance + ncores_local + ncores_busiest; + imbalance /= ncores_local + ncores_busiest; + + /* Take advantage of resource in an empty sched group */ + if (imbalance <= 1 && local->sum_nr_running == 0 && + busiest->sum_nr_running > 1) + imbalance = 2; + + return imbalance; +} + static inline bool sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) { @@ -9557,6 +9679,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_asym_packing = 1; } + /* Check for loaded SMT group to be balanced to dst CPU */ + if (!local_group && smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); 
/* Computing avg_load makes sense only when group is overloaded */ @@ -9641,6 +9767,16 @@ static bool update_sd_pick_busiest(struct lb_env *env, return false; break; + case group_smt_balance: + /* + * Check if we have spare CPUs on either SMT group to + * choose has spare or fully busy handling. + */ + if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0) + goto has_spare; + + fallthrough; + case group_fully_busy: /* * Select the fully busy group with highest avg_load. In @@ -9670,6 +9806,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_has_spare: /* + * Do not pick sg with SMT CPUs over sg with pure CPUs, + * as we do not want to pull task off SMT core with one task + * and make the core idle. + */ + if (smt_vs_nonsmt_groups(sds->busiest, sg)) { + if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) + return false; + else + return true; + } +has_spare: + + /* * Select not overloaded group with lowest number of idle cpus * and highest number of running tasks. 
We could also compare * the spare capacity which is more stable but it can end up @@ -9865,6 +10014,7 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_imbalanced: case group_asym_packing: + case group_smt_balance: /* Those types are not used in the slow wakeup path */ return false; @@ -9996,6 +10146,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) case group_imbalanced: case group_asym_packing: + case group_smt_balance: /* Those type are not used in the slow wakeup path */ return NULL; @@ -10250,6 +10401,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } + if (busiest->group_type == group_smt_balance) { + /* Reduce number of tasks sharing CPU capacity */ + env->migration_type = migrate_task; + env->imbalance = 1; + return; + } + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -10297,14 +10455,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } if (busiest->group_weight == 1 || sds->prefer_sibling) { - unsigned int nr_diff = busiest->sum_nr_running; /* * When prefer sibling, evenly spread running tasks on * groups. */ env->migration_type = migrate_task; - lsub_positive(&nr_diff, local->sum_nr_running); - env->imbalance = nr_diff; + env->imbalance = sibling_imbalance(env, sds, busiest, local); } else { /* @@ -10501,20 +10657,27 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * group's child domain. 
*/ if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_nr_running > local->sum_nr_running + 1) + sibling_imbalance(env, &sds, busiest, local) > 1) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) + if (env->idle == CPU_NOT_IDLE) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already * busy, let another idle CPU try to pull task. */ goto out_balanced; + } + + if (busiest->group_type == group_smt_balance && + smt_vs_nonsmt_groups(sds.local, sds.busiest)) { + /* Let non SMT CPU pull from SMT CPU sharing with sibling */ + goto force_balance; + } if (busiest->group_weight > 1 && - local->idle_cpus <= (busiest->idle_cpus + 1)) + local->idle_cpus <= (busiest->idle_cpus + 1)) { /* * If the busiest group is not overloaded * and there is no imbalance between this and busiest @@ -10525,12 +10688,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * there is more than 1 CPU per group. 
*/ goto out_balanced; + } - if (busiest->sum_h_nr_running == 1) + if (busiest->sum_h_nr_running == 1) { /* * busiest doesn't have any tasks waiting to run */ goto out_balanced; + } } force_balance: @@ -10763,8 +10928,9 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { + struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask); struct sched_group *sg = env->sd->groups; - int cpu; + int cpu, idle_smt = -1; /* * Ensure the balancing environment is consistent; can happen @@ -10786,15 +10952,38 @@ static int should_we_balance(struct lb_env *env) return 1; } + cpumask_copy(swb_cpus, group_balance_mask(sg)); /* Try to find first idle CPU */ - for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { + for_each_cpu_and(cpu, swb_cpus, env->cpus) { if (!idle_cpu(cpu)) continue; + /* + * Don't balance to idle SMT in busy core right away when + * balancing cores, but remember the first idle SMT CPU for + * later consideration. Find CPU on an idle core first. + */ + if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { + if (idle_smt == -1) + idle_smt = cpu; + /* + * If the core is not idle, and first SMT sibling which is + * idle has been found, then its not needed to check other + * SMT siblings for idleness: + */ +#ifdef CONFIG_SCHED_SMT + cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); +#endif + continue; + } + /* Are we the first idle CPU? */ return cpu == env->dst_cpu; } + if (idle_smt == env->dst_cpu) + return true; + /* Are we the first CPU of this group ? 
*/ return group_balance_cpu(sg) == env->dst_cpu; } @@ -12007,8 +12196,8 @@ static void rq_offline_fair(struct rq *rq) static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { - u64 slice = sched_slice(cfs_rq_of(se), se); u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; + u64 slice = se->slice; return (rtime * min_nr_tasks > slice); } @@ -12164,8 +12353,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; + struct cfs_rq *cfs_rq; struct rq *rq = this_rq(); struct rq_flags rf; @@ -12174,22 +12363,9 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (curr) { + if (curr) update_curr(cfs_rq); - se->vruntime = curr->vruntime; - } - place_entity(cfs_rq, se, 1); - - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { - /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - swap(curr->vruntime, se->vruntime); - resched_curr(rq); - } - - se->vruntime -= cfs_rq->min_vruntime; + place_entity(cfs_rq, se, ENQUEUE_INITIAL); rq_unlock(rq, &rf); } @@ -12218,34 +12394,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static inline bool vruntime_normalized(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - - /* - * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, - * the dequeue_entity(.flags=0) will already have normalized the - * vruntime. - */ - if (p->on_rq) - return true; - - /* - * When !on_rq, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - A forked child which is waiting for being woken up by - * wake_up_new_task(). 
- * - A task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - */ - if (!se->sum_exec_runtime || - (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) - return true; - - return false; -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* * Propagate the changes of the sched_entity across the tg tree to make it @@ -12316,16 +12464,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } detach_entity_cfs_rq(se); } @@ -12333,12 +12471,8 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); attach_entity_cfs_rq(se); - - if (!vruntime_normalized(p)) - se->vruntime += cfs_rq->min_vruntime; } static void switched_from_fair(struct rq *rq, struct task_struct *p) @@ -12450,7 +12584,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), @@ -12703,7 +12837,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + rr_interval = NS_TO_JIFFIES(se->slice); return rr_interval; } @@ -12805,6 +12939,8 @@ __init void init_sched_fair_class(void) for_each_possible_cpu(i) { zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); 
zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i), + GFP_KERNEL, cpu_to_node(i)); #ifdef CONFIG_CFS_BANDWIDTH INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index ee7f23c76bd3..f770168230ae 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -1,16 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * Only give sleepers 50% of their service deficit. This allows - * them to run sooner, but does not allow tons of sleepers to - * rip the spread apart. - */ -SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) /* - * Place new tasks ahead so that they do not starve already running - * tasks + * Using the avg_vruntime, do the right thing and preserve lag across + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. */ -SCHED_FEAT(START_DEBIT, true) +SCHED_FEAT(PLACE_LAG, true) +SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) +SCHED_FEAT(RUN_TO_PARITY, true) /* * Prefer to schedule the task we woke last (assuming it failed @@ -20,13 +16,6 @@ SCHED_FEAT(START_DEBIT, true) SCHED_FEAT(NEXT_BUDDY, false) /* - * Prefer to schedule the task that ran last (when we did - * wake-preempt) as that likely will touch the same data, increases - * cache locality. - */ -SCHED_FEAT(LAST_BUDDY, true) - -/* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. 
*/ @@ -99,5 +88,4 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) -SCHED_FEAT(ALT_PERIOD, true) -SCHED_FEAT(BASE_SLICE, true) +SCHED_FEAT(HZ_BW, true) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9bb3f2b3ccfc..1d0f634725a6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,7 +140,7 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); -DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); +static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED static bool psi_enable; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 00e0e5074115..0597ba0f85ff 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -25,7 +25,7 @@ unsigned int sysctl_sched_rt_period = 1000000; int sysctl_sched_rt_runtime = 950000; #ifdef CONFIG_SYSCTL -static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ; static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, @@ -3062,6 +3062,9 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, sched_rr_timeslice = sysctl_sched_rr_timeslice <= 0 ? 
RR_TIMESLICE : msecs_to_jiffies(sysctl_sched_rr_timeslice); + + if (sysctl_sched_rr_timeslice <= 0) + sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE); } mutex_unlock(&mutex); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e93e006a942b..04846272409c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -454,11 +454,12 @@ extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); -extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); +extern bool cfs_task_bw_constrained(struct task_struct *p); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, @@ -494,6 +495,7 @@ static inline void set_task_rq_fair(struct sched_entity *se, #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; +static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } #endif /* CONFIG_CGROUP_SCHED */ @@ -548,6 +550,9 @@ struct cfs_rq { unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + s64 avg_vruntime; + u64 avg_load; + u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE @@ -567,8 +572,6 @@ struct cfs_rq { */ struct sched_entity *curr; struct sched_entity *next; - struct sched_entity *last; - struct sched_entity *skip; #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; @@ -636,6 +639,8 @@ struct cfs_rq { u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; + u64 throttled_clock_self; + u64 throttled_clock_self_time; int throttled; int throttle_count; struct list_head 
throttled_list; @@ -1245,6 +1250,7 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi); +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -1700,6 +1706,21 @@ rq_unlock(struct rq *rq, struct rq_flags *rf) raw_spin_rq_unlock(rq); } +DEFINE_LOCK_GUARD_1(rq_lock, struct rq, + rq_lock(_T->lock, &_T->rf), + rq_unlock(_T->lock, &_T->rf), + struct rq_flags rf) + +DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq, + rq_lock_irq(_T->lock, &_T->rf), + rq_unlock_irq(_T->lock, &_T->rf), + struct rq_flags rf) + +DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, + rq_lock_irqsave(_T->lock, &_T->rf), + rq_unlock_irqrestore(_T->lock, &_T->rf), + struct rq_flags rf) + static inline struct rq * this_rq_lock_irq(struct rq_flags *rf) __acquires(rq->lock) @@ -1882,6 +1903,7 @@ struct sched_group { atomic_t ref; unsigned int group_weight; + unsigned int cores; struct sched_group_capacity *sgc; int asym_prefer_cpu; /* CPU of highest priority in group */ int flags; @@ -2131,12 +2153,13 @@ static inline int task_on_rq_migrating(struct task_struct *p) } /* Wake flags. 
The first three directly map to some SD flag value */ -#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ -#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ -#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ -#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ -#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2195,6 +2218,7 @@ extern const u32 sched_prio_to_wmult[40]; #else #define ENQUEUE_MIGRATED 0x00 #endif +#define ENQUEUE_INITIAL 0x80 #define RETRY_TASK ((void *)-1UL) @@ -2398,6 +2422,7 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) #endif extern void schedule_idle(void); +asmlinkage void schedule_user(void); extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); @@ -2499,11 +2524,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_base_slice; + #ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_idle_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; @@ -2609,6 +2632,12 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) static inline void 
double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} #endif +#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ +__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ +static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ +{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ + _lock; return _t; } + #ifdef CONFIG_SMP static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) @@ -2738,6 +2767,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } +static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + raw_spin_unlock(l1); + raw_spin_unlock(l2); +} + +DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t, + double_raw_lock(_T->lock, _T->lock2), + double_raw_unlock(_T->lock, _T->lock2)) + /* * double_rq_unlock - safely unlock two runqueues * @@ -2795,6 +2834,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif +DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, + double_rq_lock(_T->lock, _T->lock2), + double_rq_unlock(_T->lock, _T->lock2)) + extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); @@ -3229,6 +3272,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags); + #ifdef CONFIG_PREEMPT_DYNAMIC extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); @@ -3480,4 +3525,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } #endif +extern u64 avg_vruntime(struct cfs_rq *cfs_rq); +extern int 
entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 76b9b796e695..72505cd3b60a 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -18,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head); * If for some reason it would return 0, that means the previously waiting * task is already running, so it will observe condition true (or has already). */ -void swake_up_locked(struct swait_queue_head *q) +void swake_up_locked(struct swait_queue_head *q, int wake_flags) { struct swait_queue *curr; @@ -26,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q) return; curr = list_first_entry(&q->task_list, typeof(*curr), task_list); - wake_up_process(curr->task); + try_to_wake_up(curr->task, TASK_NORMAL, wake_flags); list_del_init(&curr->task_list); } EXPORT_SYMBOL(swake_up_locked); @@ -41,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked); void swake_up_all_locked(struct swait_queue_head *q) { while (!list_empty(&q->task_list)) - swake_up_locked(q); + swake_up_locked(q, 0); } void swake_up_one(struct swait_queue_head *q) @@ -49,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q) unsigned long flags; raw_spin_lock_irqsave(&q->lock, flags); - swake_up_locked(q); + swake_up_locked(q, 0); raw_spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(swake_up_one); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d3a3b2646ec4..05a5bc678c08 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -722,8 +722,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (parent->parent) { parent->parent->child = tmp; - if (tmp->flags & SD_SHARE_CPUCAPACITY) - parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY; + parent->parent->groups->flags = tmp->flags; } /* @@ -1275,14 +1274,24 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { 
struct sched_group *sg = sd->groups; + struct cpumask *mask = sched_domains_tmpmask2; WARN_ON(!sg); do { - int cpu, max_cpu = -1; + int cpu, cores = 0, max_cpu = -1; sg->group_weight = cpumask_weight(sched_group_span(sg)); + cpumask_copy(mask, sched_group_span(sg)); + for_each_cpu(cpu, mask) { + cores++; +#ifdef CONFIG_SCHED_SMT + cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); +#endif + } + sg->cores = cores; + if (!(sd->flags & SD_ASYM_PACKING)) goto next; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 48c53e4739ea..802d98cf2de3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -161,6 +161,11 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, } EXPORT_SYMBOL(__wake_up); +void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key) +{ + __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key); +} + /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d3e584065c7f..255999ba9190 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -110,11 +110,13 @@ struct seccomp_knotif { * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. * @ioctl_flags: The flags used for the seccomp_addfd ioctl. + * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd * installation, or gone away (either due to successful * reply, or signal) + * @list: list_head for chaining seccomp_kaddfd together. * */ struct seccomp_kaddfd { @@ -138,14 +140,17 @@ struct seccomp_kaddfd { * structure is fairly large, we store the notification-specific stuff in a * separate structure. * - * @request: A semaphore that users of this notification can wait on for - * changes. 
Actual reads and writes are still controlled with - * filter->notify_lock. + * @requests: A semaphore that users of this notification can wait on for + * changes. Actual reads and writes are still controlled with + * filter->notify_lock. + * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. */ + struct notification { - struct semaphore request; + atomic_t requests; + u32 flags; u64 next_id; struct list_head notifications; }; @@ -555,6 +560,8 @@ static void __seccomp_filter_release(struct seccomp_filter *orig) * drop its reference count, and notify * about unused filters * + * @tsk: task the filter should be released from. + * * This function should only be called when the task is exiting as * it detaches it from its filter tree. As such, READ_ONCE() and * barriers are not needed here, as would normally be needed. @@ -574,6 +581,8 @@ void seccomp_filter_release(struct task_struct *tsk) /** * seccomp_sync_threads: sets all threads to use current's filter * + * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync. + * * Expects sighand and cred_guard_mutex locks to be held, and for * seccomp_can_sync_threads() to have returned success already * without dropping the locks. @@ -1116,8 +1125,11 @@ static int seccomp_do_user_notification(int this_syscall, list_add_tail(&n.list, &match->notif->notifications); INIT_LIST_HEAD(&n.addfd); - up(&match->notif->request); - wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); + atomic_inc(&match->notif->requests); + if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM); + else + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); /* * This is where we wait for a reply from userspace. 
@@ -1450,6 +1462,37 @@ find_notification(struct seccomp_filter *filter, u64 id) return NULL; } +static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, + void *key) +{ + /* Avoid a wakeup if event not interesting for us. */ + if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static int recv_wait_event(struct seccomp_filter *filter) +{ + DEFINE_WAIT_FUNC(wait, recv_wake_function); + int ret; + + if (atomic_dec_if_positive(&filter->notif->requests) >= 0) + return 0; + + for (;;) { + ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE); + + if (atomic_dec_if_positive(&filter->notif->requests) >= 0) + break; + + if (ret) + return ret; + + schedule(); + } + finish_wait(&filter->wqh, &wait); + return 0; +} static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) @@ -1467,7 +1510,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, memset(&unotif, 0, sizeof(unotif)); - ret = down_interruptible(&filter->notif->request); + ret = recv_wait_event(filter); if (ret < 0) return ret; @@ -1515,7 +1558,8 @@ out: if (should_sleep_killable(filter, knotif)) complete(&knotif->ready); knotif->state = SECCOMP_NOTIFY_INIT; - up(&filter->notif->request); + atomic_inc(&filter->notif->requests); + wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM); } mutex_unlock(&filter->notify_lock); } @@ -1561,7 +1605,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter, knotif->error = resp.error; knotif->val = resp.val; knotif->flags = resp.flags; - complete(&knotif->ready); + if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + complete_on_current_cpu(&knotif->ready); + else + complete(&knotif->ready); out: mutex_unlock(&filter->notify_lock); return ret; @@ -1591,6 +1638,22 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, return ret; } +static long seccomp_notify_set_flags(struct seccomp_filter 
*filter, + unsigned long flags) +{ + long ret; + + if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + return -EINVAL; + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + return ret; + filter->notif->flags = flags; + mutex_unlock(&filter->notify_lock); + return 0; +} + static long seccomp_notify_addfd(struct seccomp_filter *filter, struct seccomp_notif_addfd __user *uaddfd, unsigned int size) @@ -1720,6 +1783,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + case SECCOMP_IOCTL_NOTIF_SET_FLAGS: + return seccomp_notify_set_flags(filter, arg); } /* Extensible Argument ioctls */ @@ -1777,7 +1842,6 @@ static struct file *init_listener(struct seccomp_filter *filter) if (!filter->notif) goto out; - sema_init(&filter->notif->request, 0); filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); diff --git a/kernel/signal.c b/kernel/signal.c index b5370fe5c198..09019017d669 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,6 +22,7 @@ #include <linux/sched/cputime.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> @@ -562,6 +563,10 @@ bool unhandled_signal(struct task_struct *tsk, int sig) if (handler != SIG_IGN && handler != SIG_DFL) return false; + /* If dying, we handle all new signals by ignoring them */ + if (fatal_signal_pending(tsk)) + return false; + /* if ptraced, let the tracer determine */ return !tsk->ptrace; } @@ -1256,7 +1261,17 @@ int send_signal_locked(int sig, struct kernel_siginfo *info, static void print_fatal_signal(int signr) { struct pt_regs *regs = task_pt_regs(current); - pr_info("potentially unexpected fatal signal %d.\n", signr); + struct file *exe_file; + + exe_file = get_task_exe_file(current); + if (exe_file) { + pr_info("%pD: %s: 
potentially unexpected fatal signal %d.\n", + exe_file, current->comm, signr); + fput(exe_file); + } else { + pr_info("%s: potentially unexpected fatal signal %d.\n", + current->comm, signr); + } #if defined(__i386__) && !defined(__arch_um__) pr_info("code at %08lx: ", regs->ip); diff --git a/kernel/smp.c b/kernel/smp.c index 385179dae360..8455a53465af 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -46,6 +46,8 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); +static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1); + static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) @@ -253,13 +255,15 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { - dump_cpu_task(cpu); + if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0)) + dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } - dump_stack(); + if (firsttime) + dump_stack(); *ts1 = ts2; return false; @@ -433,9 +437,14 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) struct llist_node *entry, *prev; struct llist_head *head; static bool warned; + atomic_t *tbt; lockdep_assert_irqs_disabled(); + /* Allow waiters to send backtrace NMI from here onwards */ + tbt = this_cpu_ptr(&trigger_backtrace); + atomic_set_release(tbt, 1); + head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); diff --git a/kernel/softirq.c b/kernel/softirq.c index 807b34ccd797..210cf5f8d92c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -612,7 +612,7 @@ static inline void tick_irq_exit(void) int cpu = smp_processor_id(); /* Make sure that 
timer wheel updates are propagated */ - if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { + if ((sched_core_idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { if (!in_hardirq()) tick_nohz_irq_exit(); } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 781de7cc6a4e..e137c1385c56 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -274,6 +274,7 @@ COND_SYSCALL(vm86old); COND_SYSCALL(modify_ldt); COND_SYSCALL(vm86); COND_SYSCALL(kexec_file_load); +COND_SYSCALL(map_shadow_stack); /* s390 */ COND_SYSCALL(s390_pci_mmio_read); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 88cbc1181b23..c108ed8a9804 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -473,8 +473,8 @@ static void clocksource_watchdog(struct timer_list *unused) /* Check the deviation from the watchdog clocksource. */ md = cs->uncertainty_margin + watchdog->uncertainty_margin; if (abs(cs_nsec - wd_nsec) > md) { - u64 cs_wd_msec; - u64 wd_msec; + s64 cs_wd_msec; + s64 wd_msec; u32 wd_rem; pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", @@ -483,8 +483,8 @@ static void clocksource_watchdog(struct timer_list *unused) watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, cs_nsec, csnow, cslast, cs->mask); - cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem); - wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem); + cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); + wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); if (curr_clocksource == cs) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 4df14db4da49..87015e9deacc 100644 --- 
a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1045,7 +1045,7 @@ static bool report_idle_softirq(void) return false; /* On RT, softirqs handling may be waiting on some lock */ - if (!local_bh_blocked()) + if (local_bh_blocked()) return false; pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", diff --git a/kernel/time/time.c b/kernel/time/time.c index f4198af60fee..642647f5046b 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -365,11 +365,14 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) } #endif -/* - * Convert jiffies to milliseconds and back. +/** + * jiffies_to_msecs - Convert jiffies to milliseconds + * @j: jiffies value * * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: + * two most common HZ cases. + * + * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { @@ -388,6 +391,12 @@ unsigned int jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); +/** + * jiffies_to_usecs - Convert jiffies to microseconds + * @j: jiffies value + * + * Return: microseconds value + */ unsigned int jiffies_to_usecs(const unsigned long j) { /* @@ -408,8 +417,15 @@ unsigned int jiffies_to_usecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_usecs); -/* +/** * mktime64 - Converts date to seconds. + * @year0: year to convert + * @mon0: month to convert + * @day: day to convert + * @hour: hour to convert + * @min: minute to convert + * @sec: second to convert + * * Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. @@ -427,6 +443,8 @@ EXPORT_SYMBOL(jiffies_to_usecs); * * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight * tomorrow - (allowable under ISO 8601) is supported. 
+ * + * Return: seconds since the epoch time for the given input date */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, @@ -471,8 +489,7 @@ EXPORT_SYMBOL(ns_to_kernel_old_timeval); * Set seconds and nanoseconds field of a timespec variable and * normalize to the timespec storage format * - * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC + * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC. * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) @@ -501,7 +518,7 @@ EXPORT_SYMBOL(set_normalized_timespec64); * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * - * Returns the timespec64 representation of the nsec parameter. + * Return: the timespec64 representation of the nsec parameter. */ struct timespec64 ns_to_timespec64(s64 nsec) { @@ -548,6 +565,8 @@ EXPORT_SYMBOL(ns_to_timespec64); * runtime. * The _msecs_to_jiffies helpers are the HZ dependent conversion * routines found in include/linux/jiffies.h + * + * Return: jiffies value */ unsigned long __msecs_to_jiffies(const unsigned int m) { @@ -560,6 +579,12 @@ unsigned long __msecs_to_jiffies(const unsigned int m) } EXPORT_SYMBOL(__msecs_to_jiffies); +/** + * __usecs_to_jiffies: - convert microseconds to jiffies + * @u: time in milliseconds + * + * Return: jiffies value + */ unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) @@ -568,7 +593,10 @@ unsigned long __usecs_to_jiffies(const unsigned int u) } EXPORT_SYMBOL(__usecs_to_jiffies); -/* +/** + * timespec64_to_jiffies - convert a timespec64 value to jiffies + * @value: pointer to &struct timespec64 + * * The TICK_NSEC - 1 rounds up the value to the next resolution. 
Note * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundaries. I.e. the line: @@ -582,8 +610,9 @@ EXPORT_SYMBOL(__usecs_to_jiffies); * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. + * + * Return: jiffies value */ - unsigned long timespec64_to_jiffies(const struct timespec64 *value) { @@ -601,6 +630,11 @@ timespec64_to_jiffies(const struct timespec64 *value) } EXPORT_SYMBOL(timespec64_to_jiffies); +/** + * jiffies_to_timespec64 - convert jiffies value to &struct timespec64 + * @jiffies: jiffies value + * @value: pointer to &struct timespec64 + */ void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value) { @@ -618,6 +652,13 @@ EXPORT_SYMBOL(jiffies_to_timespec64); /* * Convert jiffies/jiffies_64 to clock_t and back. */ + +/** + * jiffies_to_clock_t - Convert jiffies to clock_t + * @x: jiffies value + * + * Return: jiffies converted to clock_t (CLOCKS_PER_SEC) + */ clock_t jiffies_to_clock_t(unsigned long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 @@ -632,6 +673,12 @@ clock_t jiffies_to_clock_t(unsigned long x) } EXPORT_SYMBOL(jiffies_to_clock_t); +/** + * clock_t_to_jiffies - Convert clock_t to jiffies + * @x: clock_t value + * + * Return: clock_t value converted to jiffies + */ unsigned long clock_t_to_jiffies(unsigned long x) { #if (HZ % USER_HZ)==0 @@ -649,6 +696,12 @@ unsigned long clock_t_to_jiffies(unsigned long x) } EXPORT_SYMBOL(clock_t_to_jiffies); +/** + * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t + * @x: jiffies_64 value + * + * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) + */ u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 @@ -671,6 +724,12 @@ u64 jiffies_64_to_clock_t(u64 x) } EXPORT_SYMBOL(jiffies_64_to_clock_t); +/** + * nsec_to_clock_t - Convert nsec value to clock_t + * @x: nsec value + * + * Return: nsec value converted to 
64-bit "clock_t" (CLOCKS_PER_SEC) + */ u64 nsec_to_clock_t(u64 x) { #if (NSEC_PER_SEC % USER_HZ) == 0 @@ -687,6 +746,12 @@ u64 nsec_to_clock_t(u64 x) #endif } +/** + * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds + * @j: jiffies64 value + * + * Return: nanoseconds value + */ u64 jiffies64_to_nsecs(u64 j) { #if !(NSEC_PER_SEC % HZ) @@ -697,6 +762,12 @@ u64 jiffies64_to_nsecs(u64 j) } EXPORT_SYMBOL(jiffies64_to_nsecs); +/** + * jiffies64_to_msecs - Convert jiffies64 to milliseconds + * @j: jiffies64 value + * + * Return: milliseconds value + */ u64 jiffies64_to_msecs(const u64 j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) @@ -719,6 +790,8 @@ EXPORT_SYMBOL(jiffies64_to_msecs); * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + * + * Return: nsecs converted to jiffies64 value */ u64 nsecs_to_jiffies64(u64 n) { @@ -750,6 +823,8 @@ EXPORT_SYMBOL(nsecs_to_jiffies64); * note: * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + * + * Return: nsecs converted to jiffies value */ unsigned long nsecs_to_jiffies(u64 n) { @@ -757,10 +832,16 @@ unsigned long nsecs_to_jiffies(u64 n) } EXPORT_SYMBOL_GPL(nsecs_to_jiffies); -/* - * Add two timespec64 values and do a safety check for overflow. +/** + * timespec64_add_safe - Add two timespec64 values and do a safety check + * for overflow. + * @lhs: first (left) timespec64 to add + * @rhs: second (right) timespec64 to add + * * It's assumed that both values are valid (>= 0). * And, each timespec64 is in normalized form. 
+ * + * Return: sum of @lhs + @rhs */ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs) @@ -778,6 +859,15 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } +/** + * get_timespec64 - get user's time value into kernel space + * @ts: destination &struct timespec64 + * @uts: user's time value as &struct __kernel_timespec + * + * Handles compat or 32-bit modes. + * + * Return: %0 on success or negative errno on error + */ int get_timespec64(struct timespec64 *ts, const struct __kernel_timespec __user *uts) { @@ -801,6 +891,14 @@ int get_timespec64(struct timespec64 *ts, } EXPORT_SYMBOL_GPL(get_timespec64); +/** + * put_timespec64 - convert timespec64 value to __kernel_timespec format and + * copy the latter to userspace + * @ts: input &struct timespec64 + * @uts: user's &struct __kernel_timespec + * + * Return: %0 on success or negative errno on error + */ int put_timespec64(const struct timespec64 *ts, struct __kernel_timespec __user *uts) { @@ -839,6 +937,15 @@ static int __put_old_timespec32(const struct timespec64 *ts64, return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } +/** + * get_old_timespec32 - get user's old-format time value into kernel space + * @ts: destination &struct timespec64 + * @uts: user's old-format time value (&struct old_timespec32) + * + * Handles X86_X32_ABI compatibility conversion. + * + * Return: %0 on success or negative errno on error + */ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) @@ -848,6 +955,16 @@ int get_old_timespec32(struct timespec64 *ts, const void __user *uts) } EXPORT_SYMBOL_GPL(get_old_timespec32); +/** + * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and + * copy the latter to userspace + * @ts: input &struct timespec64 + * @uts: user's &struct old_timespec32 + * + * Handles X86_X32_ABI compatibility conversion. 
+ * + * Return: %0 on success or negative errno on error + */ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) @@ -857,6 +974,13 @@ int put_old_timespec32(const struct timespec64 *ts, void __user *uts) } EXPORT_SYMBOL_GPL(put_old_timespec32); +/** + * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space + * @it: destination &struct itimerspec64 + * @uit: user's &struct __kernel_itimerspec + * + * Return: %0 on success or negative errno on error + */ int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) { @@ -872,6 +996,14 @@ int get_itimerspec64(struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(get_itimerspec64); +/** + * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format + * and copy the latter to userspace + * @it: input &struct itimerspec64 + * @uit: user's &struct __kernel_itimerspec + * + * Return: %0 on success or negative errno on error + */ int put_itimerspec64(const struct itimerspec64 *it, struct __kernel_itimerspec __user *uit) { @@ -887,6 +1019,13 @@ int put_itimerspec64(const struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(put_itimerspec64); +/** + * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space + * @its: destination &struct itimerspec64 + * @uits: user's &struct old_itimerspec32 + * + * Return: %0 on success or negative errno on error + */ int get_old_itimerspec32(struct itimerspec64 *its, const struct old_itimerspec32 __user *uits) { @@ -898,6 +1037,14 @@ int get_old_itimerspec32(struct itimerspec64 *its, } EXPORT_SYMBOL_GPL(get_old_itimerspec32); +/** + * put_old_itimerspec32 - convert &struct itimerspec64 to &struct + * old_itimerspec32 and copy the latter to userspace + * @its: input &struct itimerspec64 + * @uits: user's &struct old_itimerspec32 + * + * Return: %0 on success or negative errno on error + */ int put_old_itimerspec32(const struct itimerspec64 *its, struct old_itimerspec32 
__user *uits) { diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 831e8e779ace..ca058c8af6ba 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -86,7 +86,7 @@ static void time64_to_tm_test_date_range(struct kunit *test) } static struct kunit_case time_test_cases[] = { - KUNIT_CASE(time64_to_tm_test_date_range), + KUNIT_CASE_SLOW(time64_to_tm_test_date_range), {} }; diff --git a/kernel/torture.c b/kernel/torture.c index 1a0519b836ac..b28b05bbef02 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -37,6 +37,7 @@ #include <linux/ktime.h> #include <asm/byteorder.h> #include <linux/torture.h> +#include <linux/sched/rt.h> #include "rcu/rcu.h" MODULE_LICENSE("GPL"); @@ -54,6 +55,9 @@ module_param(verbose_sleep_frequency, int, 0444); static int verbose_sleep_duration = 1; module_param(verbose_sleep_duration, int, 0444); +static int random_shuffle; +module_param(random_shuffle, int, 0444); + static char *torture_type; static int verbose; @@ -88,8 +92,8 @@ int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_s ktime_t hto = baset_ns; if (trsp) - hto += (torture_random(trsp) >> 3) % fuzzt_ns; - set_current_state(TASK_UNINTERRUPTIBLE); + hto += torture_random(trsp) % fuzzt_ns; + set_current_state(TASK_IDLE); return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); } EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); @@ -350,22 +354,22 @@ torture_onoff(void *arg) if (onoff_holdoff > 0) { VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff); + torture_hrtimeout_jiffies(onoff_holdoff, &rand); VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); } while (!torture_must_stop()) { if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) { - schedule_timeout_interruptible(HZ / 10); + torture_hrtimeout_jiffies(HZ / 10, &rand); continue; } - cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + cpu = torture_random(&rand) % (maxcpu + 1); if (!torture_offline(cpu, &n_offline_attempts, 
&n_offline_successes, &sum_offline, &min_offline, &max_offline)) torture_online(cpu, &n_online_attempts, &n_online_successes, &sum_online, &min_online, &max_online); - schedule_timeout_interruptible(onoff_interval); + torture_hrtimeout_jiffies(onoff_interval, &rand); } stop: @@ -518,6 +522,7 @@ static void torture_shuffle_task_unregister_all(void) */ static void torture_shuffle_tasks(void) { + DEFINE_TORTURE_RANDOM(rand); struct shuffle_task *stp; cpumask_setall(shuffle_tmp_mask); @@ -537,8 +542,10 @@ static void torture_shuffle_tasks(void) cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); mutex_lock(&shuffle_task_mutex); - list_for_each_entry(stp, &shuffle_task_list, st_l) - set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + list_for_each_entry(stp, &shuffle_task_list, st_l) { + if (!random_shuffle || torture_random(&rand) & 0x1) + set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + } mutex_unlock(&shuffle_task_mutex); cpus_read_unlock(); @@ -550,9 +557,11 @@ static void torture_shuffle_tasks(void) */ static int torture_shuffle(void *arg) { + DEFINE_TORTURE_RANDOM(rand); + VERBOSE_TOROUT_STRING("torture_shuffle task started"); do { - schedule_timeout_interruptible(shuffle_interval); + torture_hrtimeout_jiffies(shuffle_interval, &rand); torture_shuffle_tasks(); torture_shutdown_absorb("torture_shuffle"); } while (!torture_must_stop()); @@ -728,12 +737,12 @@ bool stutter_wait(const char *title) cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { - if (!ret) { + if (!ret && !rt_task(current)) { sched_set_normal(current, MAX_NICE); ret = true; } if (spt == 1) { - schedule_timeout_interruptible(1); + torture_hrtimeout_jiffies(1, NULL); } else if (spt == 2) { while (READ_ONCE(stutter_pause_test)) { if (!(i++ & 0xffff)) @@ -741,7 +750,7 @@ bool stutter_wait(const char *title) cond_resched(); } } else { - schedule_timeout_interruptible(round_jiffies_relative(HZ)); + 
torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL); } torture_shutdown_absorb(title); } @@ -926,7 +935,7 @@ EXPORT_SYMBOL_GPL(torture_kthread_stopping); * it starts, you will need to open-code your own. */ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, - char *f, struct task_struct **tp) + char *f, struct task_struct **tp, void (*cbf)(struct task_struct *tp)) { int ret = 0; @@ -938,6 +947,10 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, *tp = NULL; return ret; } + + if (cbf) + cbf(*tp); + wake_up_process(*tp); // Process is sleeping, so ordering provided. torture_shuffle_task_register(*tp); return ret; diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 64b61f67a403..057cd975d014 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o +obj-$(CONFIG_PROBE_EVENTS_BTF_ARGS) += trace_btf.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5f2dcabad202..a7264b2c17ad 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -23,6 +23,7 @@ #include <linux/sort.h> #include <linux/key.h> #include <linux/verification.h> +#include <linux/namei.h> #include <net/bpf_sk_storage.h> @@ -86,6 +87,9 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx); +static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event @@ -223,17 +227,6 @@ const struct bpf_func_proto 
bpf_probe_read_user_str_proto = { .arg3_type = ARG_ANYTHING, }; -static __always_inline int -bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) -{ - int ret; - - ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); - if (unlikely(ret < 0)) - memset(dst, 0, size); - return ret; -} - BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, unsafe_ptr) { @@ -661,8 +654,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); - int nest_level = this_cpu_inc_return(bpf_trace_nest_level); + struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, @@ -670,7 +662,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, }, }; struct perf_sample_data *sd; - int err; + int nest_level, err; + + preempt_disable(); + sds = this_cpu_ptr(&bpf_trace_sds); + nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; @@ -688,9 +684,9 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_save_raw_data(sd, &raw); err = __bpf_perf_event_output(regs, map, flags, sd); - out: this_cpu_dec(bpf_trace_nest_level); + preempt_enable(); return err; } @@ -715,7 +711,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -732,8 +727,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_sample_data *sd; struct pt_regs *regs; + int nest_level; u64 ret; + preempt_disable(); + 
nest_level = this_cpu_inc_return(bpf_event_output_nest_level); + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; @@ -748,6 +747,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, ret = __bpf_perf_event_output(regs, map, flags, sd); out: this_cpu_dec(bpf_event_output_nest_level); + preempt_enable(); return ret; } @@ -1059,7 +1059,16 @@ static unsigned long get_entry_ip(unsigned long fentry_ip) BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { - struct kprobe *kp = kprobe_running(); + struct bpf_trace_run_ctx *run_ctx __maybe_unused; + struct kprobe *kp; + +#ifdef CONFIG_UPROBES + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + if (run_ctx->is_uprobe) + return ((struct uprobe_dispatch_data *)current->utask->vaddr)->bp_addr; +#endif + + kp = kprobe_running(); if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) return 0; @@ -1098,6 +1107,30 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs) +{ + return bpf_uprobe_multi_entry_ip(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = { + .func = bpf_get_func_ip_uprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs) +{ + return bpf_uprobe_multi_cookie(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = { + .func = bpf_get_attach_cookie_uprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; @@ -1540,13 +1573,17 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif case 
BPF_FUNC_get_func_ip: - return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? - &bpf_get_func_ip_proto_kprobe_multi : - &bpf_get_func_ip_proto_kprobe; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + return &bpf_get_func_ip_proto_kprobe_multi; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + return &bpf_get_func_ip_proto_uprobe_multi; + return &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? - &bpf_get_attach_cookie_proto_kmulti : - &bpf_get_attach_cookie_proto_trace; + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + return &bpf_get_attach_cookie_proto_kmulti; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + return &bpf_get_attach_cookie_proto_umulti; + return &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2369,9 +2406,13 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, if (is_tracepoint || is_syscall_tp) { *buf = is_tracepoint ? 
event->tp_event->tp->name : event->tp_event->name; - *fd_type = BPF_FD_TYPE_TRACEPOINT; - *probe_offset = 0x0; - *probe_addr = 0x0; + /* We allow NULL pointer for tracepoint */ + if (fd_type) + *fd_type = BPF_FD_TYPE_TRACEPOINT; + if (probe_offset) + *probe_offset = 0x0; + if (probe_addr) + *probe_addr = 0x0; } else { /* kprobe/uprobe */ err = -EOPNOTSUPP; @@ -2384,7 +2425,7 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, #ifdef CONFIG_UPROBE_EVENTS if (flags & TRACE_EVENT_FL_UPROBE) err = bpf_get_uprobe_info(event, fd_type, buf, - probe_offset, + probe_offset, probe_addr, event->attr.type == PERF_TYPE_TRACEPOINT); #endif } @@ -2469,6 +2510,7 @@ struct bpf_kprobe_multi_link { u32 cnt; u32 mods_cnt; struct module **mods; + u32 flags; }; struct bpf_kprobe_multi_run_ctx { @@ -2558,9 +2600,44 @@ static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) kfree(kmulti_link); } +static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs); + struct bpf_kprobe_multi_link *kmulti_link; + u32 ucount = info->kprobe_multi.count; + int err = 0, i; + + if (!uaddrs ^ !ucount) + return -EINVAL; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + info->kprobe_multi.count = kmulti_link->cnt; + info->kprobe_multi.flags = kmulti_link->flags; + + if (!uaddrs) + return 0; + if (ucount < kmulti_link->cnt) + err = -ENOSPC; + else + ucount = kmulti_link->cnt; + + if (kallsyms_show_value(current_cred())) { + if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64))) + return -EFAULT; + } else { + for (i = 0; i < ucount; i++) { + if (put_user(0, uaddrs + i)) + return -EFAULT; + } + } + return err; +} + static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc = bpf_kprobe_multi_link_dealloc, + .fill_link_info = bpf_kprobe_multi_link_fill_link_info, }; 
static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) @@ -2874,6 +2951,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; + link->flags = flags; if (cookies) { /* @@ -2924,3 +3002,301 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) return 0; } #endif + +#ifdef CONFIG_UPROBES +struct bpf_uprobe_multi_link; + +struct bpf_uprobe { + struct bpf_uprobe_multi_link *link; + loff_t offset; + u64 cookie; + struct uprobe_consumer consumer; +}; + +struct bpf_uprobe_multi_link { + struct path path; + struct bpf_link link; + u32 cnt; + struct bpf_uprobe *uprobes; + struct task_struct *task; +}; + +struct bpf_uprobe_multi_run_ctx { + struct bpf_run_ctx run_ctx; + unsigned long entry_ip; + struct bpf_uprobe *uprobe; +}; + +static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, + u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, + &uprobes[i].consumer); + } +} + +static void bpf_uprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_uprobe_multi_link *umulti_link; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); +} + +static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_uprobe_multi_link *umulti_link; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + if (umulti_link->task) + put_task_struct(umulti_link->task); + path_put(&umulti_link->path); + kvfree(umulti_link->uprobes); + kfree(umulti_link); +} + +static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { + .release = bpf_uprobe_multi_link_release, + .dealloc = bpf_uprobe_multi_link_dealloc, +}; + +static int uprobe_prog_run(struct bpf_uprobe *uprobe, + unsigned long entry_ip, + struct pt_regs *regs) +{ + struct 
bpf_uprobe_multi_link *link = uprobe->link; + struct bpf_uprobe_multi_run_ctx run_ctx = { + .entry_ip = entry_ip, + .uprobe = uprobe, + }; + struct bpf_prog *prog = link->link.prog; + bool sleepable = prog->aux->sleepable; + struct bpf_run_ctx *old_run_ctx; + int err = 0; + + if (link->task && current != link->task) + return 0; + + if (sleepable) + rcu_read_lock_trace(); + else + rcu_read_lock(); + + migrate_disable(); + + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); + bpf_reset_run_ctx(old_run_ctx); + + migrate_enable(); + + if (sleepable) + rcu_read_unlock_trace(); + else + rcu_read_unlock(); + return err; +} + +static bool +uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, + struct mm_struct *mm) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe->link->task->mm == mm; +} + +static int +uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe_prog_run(uprobe, instruction_pointer(regs), regs); +} + +static int +uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe_prog_run(uprobe, func, regs); +} + +static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + struct bpf_uprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + return run_ctx->entry_ip; +} + +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + struct bpf_uprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + return run_ctx->uprobe->cookie; +} + +int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + 
struct bpf_uprobe_multi_link *link = NULL; + unsigned long __user *uref_ctr_offsets; + unsigned long *ref_ctr_offsets = NULL; + struct bpf_link_primer link_primer; + struct bpf_uprobe *uprobes = NULL; + struct task_struct *task = NULL; + unsigned long __user *uoffsets; + u64 __user *ucookies; + void __user *upath; + u32 flags, cnt, i; + struct path path; + char *name; + pid_t pid; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.uprobe_multi.flags; + if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + return -EINVAL; + + /* + * path, offsets and cnt are mandatory, + * ref_ctr_offsets and cookies are optional + */ + upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); + cnt = attr->link_create.uprobe_multi.cnt; + + if (!upath || !uoffsets || !cnt) + return -EINVAL; + + uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); + ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); + + name = strndup_user(upath, PATH_MAX); + if (IS_ERR(name)) { + err = PTR_ERR(name); + return err; + } + + err = kern_path(name, LOOKUP_FOLLOW, &path); + kfree(name); + if (err) + return err; + + if (!d_is_reg(path.dentry)) { + err = -EBADF; + goto error_path_put; + } + + pid = attr->link_create.uprobe_multi.pid; + if (pid) { + rcu_read_lock(); + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + rcu_read_unlock(); + if (!task) + goto error_path_put; + } + + err = -ENOMEM; + + link = kzalloc(sizeof(*link), GFP_KERNEL); + uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL); + + if (!uprobes || !link) + goto error_free; + + if (uref_ctr_offsets) { + ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL); + if (!ref_ctr_offsets) + goto error_free; + } + + for (i = 0; i < cnt; i++) { + if (ucookies 
&& __get_user(uprobes[i].cookie, ucookies + i)) { + err = -EFAULT; + goto error_free; + } + if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) { + err = -EFAULT; + goto error_free; + } + if (__get_user(uprobes[i].offset, uoffsets + i)) { + err = -EFAULT; + goto error_free; + } + + uprobes[i].link = link; + + if (flags & BPF_F_UPROBE_MULTI_RETURN) + uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; + else + uprobes[i].consumer.handler = uprobe_multi_link_handler; + + if (pid) + uprobes[i].consumer.filter = uprobe_multi_link_filter; + } + + link->cnt = cnt; + link->uprobes = uprobes; + link->path = path; + link->task = task; + + bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, + &bpf_uprobe_multi_link_lops, prog); + + for (i = 0; i < cnt; i++) { + err = uprobe_register_refctr(d_real_inode(link->path.dentry), + uprobes[i].offset, + ref_ctr_offsets ? ref_ctr_offsets[i] : 0, + &uprobes[i].consumer); + if (err) { + bpf_uprobe_unregister(&path, uprobes, i); + goto error_free; + } + } + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error_free; + + kvfree(ref_ctr_offsets); + return bpf_link_settle(&link_primer); + +error_free: + kvfree(ref_ctr_offsets); + kvfree(uprobes); + kfree(link); + if (task) + put_task_struct(task); +error_path_put: + path_put(&path); + return err; +} +#else /* !CONFIG_UPROBES */ +int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + return 0; +} +static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + return 0; +} +#endif /* CONFIG_UPROBES */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 05c0024815bf..8de8bec5f366 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6779,8 +6779,7 @@ void ftrace_release_mod(struct module *mod) last_pg = &ftrace_pages_start; for (pg = ftrace_pages_start; pg; pg = *last_pg) { rec = 
&pg->records[0]; - if (within_module_core(rec->ip, mod) || - within_module_init(rec->ip, mod)) { + if (within_module(rec->ip, mod)) { /* * As core pages are first, the first * page should never be a module page. @@ -6852,8 +6851,7 @@ void ftrace_module_enable(struct module *mod) * not part of this module, then skip this pg, * which the "break" will do. */ - if (!within_module_core(rec->ip, mod) && - !within_module_init(rec->ip, mod)) + if (!within_module(rec->ip, mod)) break; /* Weak functions should still be ignored */ @@ -7142,9 +7140,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) struct dyn_ftrace key; struct ftrace_mod_map *mod_map = NULL; struct ftrace_init_func *func, *func_next; - struct list_head clear_hash; - - INIT_LIST_HEAD(&clear_hash); + LIST_HEAD(clear_hash); key.ip = start; key.flags = end; /* overload flags, as it is unsigned long */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index de061dd47313..a1651edc48d5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -523,6 +523,8 @@ struct ring_buffer_per_cpu { rb_time_t before_stamp; u64 event_stamp[MAX_NEST]; u64 read_stamp; + /* pages removed since last reset */ + unsigned long pages_removed; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -559,6 +561,7 @@ struct ring_buffer_iter { struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; + unsigned long cache_pages_removed; u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; @@ -689,10 +692,7 @@ static void rb_time_set(rb_time_t *t, u64 val) static inline bool rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) { - unsigned long ret; - - ret = local_cmpxchg(l, expect, set); - return ret == expect; + return local_try_cmpxchg(l, &expect, set); } static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) @@ 
-749,9 +749,7 @@ static void rb_time_set(rb_time_t *t, u64 val) static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) { - u64 val; - val = local64_cmpxchg(&t->time, expect, set); - return val == expect; + return local64_try_cmpxchg(&t->time, &expect, set); } #endif @@ -947,6 +945,7 @@ static void rb_wake_up_waiters(struct irq_work *work) /** * ring_buffer_wake_waiters - wake up any waiters on this ring buffer * @buffer: The ring buffer to wake waiters on + * @cpu: The CPU buffer to wake waiters on * * In the case of a file that represents a ring buffer is closing, * it is prudent to wake up any waiters that are on this. @@ -1490,14 +1489,11 @@ static bool rb_head_page_replace(struct buffer_page *old, { unsigned long *ptr = (unsigned long *)&old->list.prev->next; unsigned long val; - unsigned long ret; val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; - ret = cmpxchg(ptr, val, (unsigned long)&new->list); - - return ret == val; + return try_cmpxchg(ptr, &val, (unsigned long)&new->list); } /* @@ -1957,6 +1953,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) to_remove = rb_list_head(to_remove)->next; head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } + /* Read iterators need to reset themselves when some pages removed */ + cpu_buffer->pages_removed += nr_removed; next_page = rb_list_head(to_remove)->next; @@ -1978,12 +1976,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) cpu_buffer->head_page = list_entry(next_page, struct buffer_page, list); - /* - * change read pointer to make sure any read iterators reset - * themselves - */ - cpu_buffer->read = 0; - /* pages are removed, resume tracing and then free the pages */ atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); @@ -2206,6 +2198,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, err = -ENOMEM; goto out_err; } + + cond_resched(); } cpus_read_lock(); @@ -2396,6 +2390,11 
@@ rb_iter_head_event(struct ring_buffer_iter *iter) */ commit = rb_page_commit(iter_head_page); smp_rmb(); + + /* An event needs to be at least 8 bytes in size */ + if (iter->head > commit - 8) + goto reset; + event = __rb_page_index(iter_head_page, iter->head); length = rb_event_length(event); @@ -3003,7 +3002,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long new_index, old_index; struct buffer_page *bpage; - unsigned long index; unsigned long addr; u64 write_stamp; u64 delta; @@ -3060,8 +3058,9 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, */ old_index += write_mask; new_index += write_mask; - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) { + + /* caution: old_index gets updated on cmpxchg failure */ + if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) { /* update counters */ local_sub(event_length, &cpu_buffer->entries_bytes); return true; @@ -3376,7 +3375,6 @@ void ring_buffer_nest_end(struct trace_buffer *buffer) /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to - * @event: The event pointer to commit. * * This commits the data to the ring buffer, and releases any locks held. * @@ -4395,6 +4393,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; + iter->cache_pages_removed = cpu_buffer->pages_removed; if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; @@ -4849,12 +4848,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) buffer = cpu_buffer->buffer; /* - * Check if someone performed a consuming read to - * the buffer. A consuming read invalidates the iterator - * and we need to reset the iterator in this case. + * Check if someone performed a consuming read to the buffer + * or removed some pages from the buffer. In these cases, + * iterator was invalidated and we need to reset it. 
*/ if (unlikely(iter->cache_read != cpu_buffer->read || - iter->cache_reader_page != cpu_buffer->reader_page)) + iter->cache_reader_page != cpu_buffer->reader_page || + iter->cache_pages_removed != cpu_buffer->pages_removed)) rb_iter_reset(iter); again: @@ -5298,6 +5298,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->last_overrun = 0; rb_head_page_activate(cpu_buffer); + cpu_buffer->pages_removed = 0; } /* Must have disabled the cpu buffer then done a synchronize_rcu */ @@ -5356,7 +5357,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of - * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b8870078ef58..abaaf516fcae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1772,7 +1772,7 @@ static void trace_create_maxlat_file(struct trace_array *tr, init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); tr->d_max_latency = trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, - d_tracer, &tr->max_latency, + d_tracer, tr, &tracing_max_lat_fops); } @@ -1805,7 +1805,7 @@ void latency_fsnotify(struct trace_array *tr) #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ - d_tracer, &tr->max_latency, &tracing_max_lat_fops) + d_tracer, tr, &tracing_max_lat_fops) #endif @@ -3119,7 +3119,6 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, struct ftrace_stack *fstack; struct stack_entry *entry; int stackidx; - void *ptr; /* * Add one, for this function and the call to save_stack_trace() @@ -3157,32 +3156,16 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, nr_entries = stack_trace_save(fstack->calls, size, skip); } - size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, 
TRACE_STACK, - (sizeof(*entry) - sizeof(entry->caller)) + size, + struct_size(entry, caller, nr_entries), trace_ctx); if (!event) goto out; - ptr = ring_buffer_event_data(event); - entry = ptr; - - /* - * For backward compatibility reasons, the entry->caller is an - * array of 8 slots to store the stack. This is also exported - * to user space. The amount allocated on the ring buffer actually - * holds enough for the stack specified by nr_entries. This will - * go into the location of entry->caller. Due to string fortifiers - * checking the size of the destination of memcpy() it triggers - * when it detects that size is greater than 8. To hide this from - * the fortifiers, we use "ptr" and pointer arithmetic to assign caller. - * - * The below is really just: - * memcpy(&entry->caller, fstack->calls, size); - */ - ptr += offsetof(typeof(*entry), caller); - memcpy(ptr, fstack->calls, size); + entry = ring_buffer_event_data(event); entry->size = nr_entries; + memcpy(&entry->caller, fstack->calls, + flex_array_size(entry, caller, nr_entries)); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -4206,15 +4189,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) loff_t l = 0; int cpu; - /* - * copy the tracer to avoid using a global lock all around. - * iter->trace is a copy of current_trace, the pointer to the - * name may be used instead of a strcmp(), as iter->trace->name - * will point to the same string as current_trace->name. 
- */ mutex_lock(&trace_types_lock); - if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) - *iter->trace = *tr->current_trace; + if (unlikely(tr->current_trace != iter->trace)) { + /* Close iter->trace before switching to the new current tracer */ + if (iter->trace->close) + iter->trace->close(iter); + iter->trace = tr->current_trace; + /* Reopen the new current tracer */ + if (iter->trace->open) + iter->trace->open(iter); + } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE @@ -4822,6 +4806,25 @@ static const struct seq_operations tracer_seq_ops = { .show = s_show, }; +/* + * Note, as iter itself can be allocated and freed in different + * ways, this function is only used to free its content, and not + * the iterator itself. The only requirement to all the allocations + * is that it must zero all fields (kzalloc), as freeing works with + * either allocated content or NULL. + */ +static void free_trace_iter_content(struct trace_iterator *iter) +{ + /* The fmt is either NULL, allocated or points to static_fmt_buf */ + if (iter->fmt != static_fmt_buf) + kfree(iter->fmt); + + kfree(iter->temp); + kfree(iter->buffer_iter); + mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); +} + static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { @@ -4863,16 +4866,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->fmt = NULL; iter->fmt_size = 0; - /* - * We make a copy of the current tracer to avoid concurrent - * changes on it while we are reading. 
- */ mutex_lock(&trace_types_lock); - iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) - goto fail; - - *iter->trace = *tr->current_trace; + iter->trace = tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; @@ -4937,9 +4932,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) fail: mutex_unlock(&trace_types_lock); - kfree(iter->trace); - kfree(iter->temp); - kfree(iter->buffer_iter); + free_trace_iter_content(iter); release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); @@ -4980,6 +4973,33 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } +/* + * The private pointer of the inode is the trace_event_file. + * Update the tr ref count associated to it. + */ +int tracing_open_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(file->tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + + return 0; +} + +int tracing_release_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + + trace_array_put(file->tr); + + return 0; +} + static int tracing_mark_open(struct inode *inode, struct file *filp) { stream_open(inode, filp); @@ -5018,12 +5038,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); - mutex_destroy(&iter->mutex); - free_cpumask_var(iter->started); - kfree(iter->fmt); - kfree(iter->temp); - kfree(iter->trace); - kfree(iter->buffer_iter); + free_trace_iter_content(iter); seq_release_private(inode, file); return 0; @@ -5277,11 +5292,17 @@ int tracing_set_cpumask(struct trace_array *tr, !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + 
ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); +#endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); +#endif } } arch_spin_unlock(&tr->max_lock); @@ -5717,7 +5738,8 @@ static const char readme_msg[] = "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS - "\t $stack<index>, $stack, $retval, $comm, $arg<N>, <argname>\n" + "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" + "\t <argname>[->field[->field|.field...]],\n" #else "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" #endif @@ -6305,6 +6327,15 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val) per_cpu_ptr(buf->data, cpu)->entries = val; } +static void update_buffer_entries(struct array_buffer *buf, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); + } else { + per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); + } +} + #ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, @@ -6383,18 +6414,12 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, return ret; } - if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->max_buffer, size); - else - per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; + update_buffer_entries(&tr->max_buffer, cpu); out: #endif /* CONFIG_TRACER_MAX_TRACE */ - if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&tr->array_buffer, size); - else - per_cpu_ptr(tr->array_buffer.data, cpu)->entries = size; + update_buffer_entries(&tr->array_buffer, cpu); return 
ret; } @@ -6693,22 +6718,52 @@ static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); } static ssize_t tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); } #endif +static int open_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + if (cpumask_empty(tr->pipe_cpumask)) { + cpumask_setall(tr->pipe_cpumask); + return 0; + } + } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { + cpumask_set_cpu(cpu, tr->pipe_cpumask); + return 0; + } + return -EBUSY; +} + +static void close_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + WARN_ON(!cpumask_full(tr->pipe_cpumask)); + cpumask_clear(tr->pipe_cpumask); + } else { + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); + cpumask_clear_cpu(cpu, tr->pipe_cpumask); + } +} + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; + int cpu; int ret; ret = tracing_check_open_get_tr(tr); @@ -6716,13 +6771,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return ret; mutex_lock(&trace_types_lock); + cpu = tracing_get_cpu(inode); + ret = open_pipe_on_cpu(tr, cpu); + if (ret) + goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; - __trace_array_put(tr); - goto out; + goto fail_alloc_iter; } trace_seq_init(&iter->seq); @@ -6745,7 +6803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file 
*filp) iter->tr = tr; iter->array_buffer = &tr->array_buffer; - iter->cpu_file = tracing_get_cpu(inode); + iter->cpu_file = cpu; mutex_init(&iter->mutex); filp->private_data = iter; @@ -6755,12 +6813,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); tr->trace_ref++; -out: + mutex_unlock(&trace_types_lock); return ret; fail: kfree(iter); +fail_alloc_iter: + close_pipe_on_cpu(tr, cpu); +fail_pipe_on_cpu: __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; @@ -6777,13 +6838,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) if (iter->trace->pipe_close) iter->trace->pipe_close(iter); - + close_pipe_on_cpu(tr, iter->cpu_file); mutex_unlock(&trace_types_lock); - free_cpumask_var(iter->started); - kfree(iter->fmt); - kfree(iter->temp); - mutex_destroy(&iter->mutex); + free_trace_iter_content(iter); kfree(iter); trace_array_put(tr); @@ -7573,6 +7631,11 @@ out: return ret; } +static void tracing_swap_cpu_buffer(void *tr) +{ + update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); +} + static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -7631,13 +7694,15 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) break; - local_irq_disable(); /* Now, we're going to swap */ - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + local_irq_disable(); update_max_tr(tr, current, smp_processor_id(), NULL); - else - update_max_tr_single(tr, current, iter->cpu_file); - local_irq_enable(); + local_irq_enable(); + } else { + smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, + (void *)tr, 1); + } break; default: if (tr->allocated_snapshot) { @@ -7718,18 +7783,20 @@ static const struct file_operations tracing_thresh_fops = { #ifdef CONFIG_TRACER_MAX_TRACE static const struct 
file_operations tracing_max_lat_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; #endif static const struct file_operations set_tracer_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_set_trace_read, .write = tracing_set_trace_write, .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, }; static const struct file_operations tracing_pipe_fops = { @@ -8922,12 +8989,33 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static int tracing_open_options(struct inode *inode, struct file *filp) +{ + struct trace_option_dentry *topt = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(topt->tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + return 0; +} + +static int tracing_release_options(struct inode *inode, struct file *file) +{ + struct trace_option_dentry *topt = file->private_data; + + trace_array_put(topt->tr); + return 0; +} static const struct file_operations trace_options_fops = { - .open = tracing_open_generic, + .open = tracing_open_options, .read = trace_options_read, .write = trace_options_write, .llseek = generic_file_llseek, + .release = tracing_release_options, }; /* @@ -9441,6 +9529,9 @@ static struct trace_array *trace_array_create(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; + if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) + goto out_free_tr; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9482,6 +9573,7 @@ static struct trace_array *trace_array_create(const char *name) out_free_tr: ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); + free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); 
kfree(tr->name); kfree(tr); @@ -9584,6 +9676,7 @@ static int __remove_instance(struct trace_array *tr) } kfree(tr->topts); + free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -9700,8 +9793,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) tr, &tracing_mark_fops); file = __find_event_file(tr, "ftrace", "print"); - if (file && file->dir) - trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + if (file && file->ef) + eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, file, &event_trigger_fops); tr->trace_marker_file = file; @@ -10381,12 +10474,14 @@ __init static int tracer_alloc_buffers(void) if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; + if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) + goto out_free_savedcmd; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); - goto out_free_savedcmd; + goto out_free_pipe_cpumask; } - if (global_trace.buffer_disabled) tracing_off(); @@ -10439,6 +10534,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_pipe_cpumask: + free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e1edc2197fc8..77debe53f07c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -77,6 +77,16 @@ enum trace_type { #undef __array #define __array(type, item, size) type item[size]; +/* + * For backward compatibility, older user space expects to see the + * kernel_stack event with a fixed size caller field. But today the fixed + * size is ignored by the kernel, and the real structure is dynamic. 
+ * Expose to user space: "unsigned long caller[8];" but the real structure + * will be "unsigned long caller[] __counted_by(size)" + */ +#undef __stack_array +#define __stack_array(type, item, size, field) type item[] __counted_by(field); + #undef __array_desc #define __array_desc(type, container, item, size) @@ -377,6 +387,8 @@ struct trace_array { struct list_head events; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ + /* one per_cpu trace_pipe can be opened by only one user */ + cpumask_var_t pipe_cpumask; int ref; int trace_ref; #ifdef CONFIG_FUNCTION_TRACER @@ -594,11 +606,12 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); -void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_open_file_tr(struct inode *inode, struct file *filp); +int tracing_release_file_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); @@ -695,7 +708,6 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); int trace_pid_show(struct seq_file *m, void *v); -void trace_free_pid_list(struct trace_pid_list *pid_list); int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt); @@ -1295,6 +1307,14 @@ static inline void trace_branch_disable(void) /* set ring buffers to default size if not already done so */ int 
tracing_update_buffers(void); +union trace_synth_field { + u8 as_u8; + u16 as_u16; + u32 as_u32; + u64 as_u64; + struct trace_dynamic_info as_dynamic; +}; + struct ftrace_event_field { struct list_head link; const char *name; @@ -1324,7 +1344,7 @@ struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; - struct dentry *entry; + struct eventfs_file *ef; int ref_count; int nr_events; }; diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c new file mode 100644 index 000000000000..ca224d53bfdc --- /dev/null +++ b/kernel/trace/trace_btf.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/btf.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include "trace_btf.h" + +/* + * Find a function proto type by name, and return the btf_type with its btf + * in *@btf_p. Return NULL if not found. + * Note that caller has to call btf_put(*@btf_p) after using the btf_type. + */ +const struct btf_type *btf_find_func_proto(const char *func_name, struct btf **btf_p) +{ + const struct btf_type *t; + s32 id; + + id = bpf_find_btf_id(func_name, BTF_KIND_FUNC, btf_p); + if (id < 0) + return NULL; + + /* Get BTF_KIND_FUNC type */ + t = btf_type_by_id(*btf_p, id); + if (!t || !btf_type_is_func(t)) + goto err; + + /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ + t = btf_type_by_id(*btf_p, t->type); + if (!t || !btf_type_is_func_proto(t)) + goto err; + + return t; +err: + btf_put(*btf_p); + return NULL; +} + +/* + * Get function parameter with the number of parameters. + * This can return NULL if the function has no parameters. + * It can return -EINVAL if the @func_proto is not a function proto type. 
+ */ +const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s32 *nr) +{ + if (!btf_type_is_func_proto(func_proto)) + return ERR_PTR(-EINVAL); + + *nr = btf_type_vlen(func_proto); + if (*nr > 0) + return (const struct btf_param *)(func_proto + 1); + else + return NULL; +} + +#define BTF_ANON_STACK_MAX 16 + +struct btf_anon_stack { + u32 tid; + u32 offset; +}; + +/* + * Find a member of data structure/union by name and return it. + * Return NULL if not found, or -EINVAL if parameter is invalid. + * If the member is a member of an anonymous union/structure, the offset + * of that anonymous union/structure is stored into @anon_offset. Caller + * can calculate the correct offset from the root data structure by + * adding anon_offset to the member's offset. + */ +const struct btf_member *btf_find_struct_member(struct btf *btf, + const struct btf_type *type, + const char *member_name, + u32 *anon_offset) +{ + struct btf_anon_stack *anon_stack; + const struct btf_member *member; + u32 tid, cur_offset = 0; + const char *name; + int i, top = 0; + + anon_stack = kcalloc(BTF_ANON_STACK_MAX, sizeof(*anon_stack), GFP_KERNEL); + if (!anon_stack) + return ERR_PTR(-ENOMEM); + +retry: + if (!btf_type_is_struct(type)) { + member = ERR_PTR(-EINVAL); + goto out; + } + + for_each_member(i, type, member) { + if (!member->name_off) { + /* Anonymous union/struct: push it for later use */ + type = btf_type_skip_modifiers(btf, member->type, &tid); + if (type && top < BTF_ANON_STACK_MAX) { + anon_stack[top].tid = tid; + anon_stack[top++].offset = + cur_offset + member->offset; + } + } else { + name = btf_name_by_offset(btf, member->name_off); + if (name && !strcmp(member_name, name)) { + if (anon_offset) + *anon_offset = cur_offset; + goto out; + } + } + } + if (top > 0) { + /* Pop from the anonymous stack and retry */ + tid = anon_stack[--top].tid; + cur_offset = anon_stack[top].offset; + type = btf_type_by_id(btf, tid); + goto retry; + } + member = NULL; + +out: + 
kfree(anon_stack); + return member; +} + diff --git a/kernel/trace/trace_btf.h b/kernel/trace/trace_btf.h new file mode 100644 index 000000000000..4bc44bc261e6 --- /dev/null +++ b/kernel/trace/trace_btf.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/btf.h> + +const struct btf_type *btf_find_func_proto(const char *func_name, + struct btf **btf_p); +const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, + s32 *nr); +const struct btf_member *btf_find_struct_member(struct btf *btf, + const struct btf_type *type, + const char *member_name, + u32 *anon_offset); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 340b2fa98218..c47422b20908 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -190,7 +190,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry, F_STRUCT( __field( int, size ) - __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + __stack_array( unsigned long, caller, FTRACE_STACK_ENTRIES, size) ), F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index a0a704ba27db..72714cbf475c 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -41,6 +41,10 @@ struct eprobe_data { struct trace_eprobe *ep; }; + +#define for_each_trace_eprobe_tp(ep, _tp) \ + list_for_each_entry(ep, trace_probe_probe_list(_tp), tp.list) + static int __trace_eprobe_create(int argc, const char *argv[]); static void trace_event_probe_cleanup(struct trace_eprobe *ep) @@ -640,7 +644,7 @@ static int disable_eprobe(struct trace_eprobe *ep, static int enable_trace_eprobe(struct trace_event_call *call, struct trace_event_file *file) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_eprobe *ep; bool enabled; int ret = 0; @@ -662,8 +666,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, if (enabled) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - ep = 
container_of(pos, struct trace_eprobe, tp); + for_each_trace_eprobe_tp(ep, tp) { ret = enable_eprobe(ep, file); if (ret) break; @@ -680,8 +683,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, */ WARN_ON_ONCE(ret != -ENOMEM); - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - ep = container_of(pos, struct trace_eprobe, tp); + for_each_trace_eprobe_tp(ep, tp) { disable_eprobe(ep, file->tr); if (!--cnt) break; @@ -699,7 +701,7 @@ static int enable_trace_eprobe(struct trace_event_call *call, static int disable_trace_eprobe(struct trace_event_call *call, struct trace_event_file *file) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_eprobe *ep; tp = trace_probe_primary_from_call(call); @@ -716,10 +718,8 @@ static int disable_trace_eprobe(struct trace_event_call *call, trace_probe_clear_flag(tp, TP_FLAG_PROFILE); if (!trace_probe_is_enabled(tp)) { - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - ep = container_of(pos, struct trace_eprobe, tp); + for_each_trace_eprobe_tp(ep, tp) disable_eprobe(ep, file->tr); - } } out: @@ -807,13 +807,11 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ int ret; ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); - if (ret) - return ret; - /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); + traceprobe_finish_parse(&ctx); return ret; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d6ae4eae510..91951d038ba4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -611,7 +611,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; - unsigned long file_flags = file->flags; int ret = 0; int disable; @@ -635,6 +634,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; disable = file->flags & 
EVENT_FILE_FL_SOFT_DISABLED; clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Disable use of trace_buffered_event */ + trace_buffered_event_disable(); } else disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); @@ -673,6 +674,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (atomic_inc_return(&file->sm_ref) > 1) break; set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Enable use of trace_buffered_event */ + trace_buffered_event_enable(); } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { @@ -712,15 +715,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; } - /* Enable or disable use of trace_buffered_event */ - if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != - (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { - if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) - trace_buffered_event_enable(); - else - trace_buffered_event_disable(); - } - return ret; } @@ -990,7 +984,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) return; if (!--dir->nr_events) { - tracefs_remove(dir->entry); + eventfs_remove(dir->ef); list_del(&dir->list); __put_system_dir(dir); } @@ -998,20 +992,7 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) static void remove_event_file_dir(struct trace_event_file *file) { - struct dentry *dir = file->dir; - struct dentry *child; - - if (dir) { - spin_lock(&dir->d_lock); /* probably unneeded */ - list_for_each_entry(child, &dir->d_subdirs, d_child) { - if (d_really_is_positive(child)) /* probably unneeded */ - d_inode(child)->i_private = NULL; - } - spin_unlock(&dir->d_lock); - - tracefs_remove(dir); - } - + eventfs_remove(file->ef); list_del(&file->list); remove_subsystem(file->system); free_event_filter(file->filter); @@ -2109,9 +2090,10 @@ static const struct file_operations ftrace_set_event_notrace_pid_fops = { }; static const struct file_operations ftrace_enable_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = 
event_enable_read, .write = event_enable_write, + .release = tracing_release_file_tr, .llseek = default_llseek, }; @@ -2128,9 +2110,10 @@ static const struct file_operations ftrace_event_id_fops = { }; static const struct file_operations ftrace_event_filter_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_filter_read, .write = event_filter_write, + .release = tracing_release_file_tr, .llseek = default_llseek, }; @@ -2297,13 +2280,14 @@ create_new_subsystem(const char *name) return NULL; } -static struct dentry * +static struct eventfs_file * event_subsystem_dir(struct trace_array *tr, const char *name, struct trace_event_file *file, struct dentry *parent) { struct event_subsystem *system, *iter; struct trace_subsystem_dir *dir; - struct dentry *entry; + struct eventfs_file *ef; + int res; /* First see if we did not already create this dir */ list_for_each_entry(dir, &tr->systems, list) { @@ -2311,7 +2295,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, if (strcmp(system->name, name) == 0) { dir->nr_events++; file->system = dir; - return dir->entry; + return dir->ef; } } @@ -2335,13 +2319,14 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - dir->entry = tracefs_create_dir(name, parent); - if (!dir->entry) { + ef = eventfs_add_subsystem_dir(name, parent); + if (IS_ERR(ef)) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); goto out_free; } + dir->ef = ef; dir->tr = tr; dir->ref_count = 1; dir->nr_events = 1; @@ -2351,22 +2336,22 @@ event_subsystem_dir(struct trace_array *tr, const char *name, /* the ftrace system is special, do not create enable or filter files */ if (strcmp(name, "ftrace") != 0) { - entry = tracefs_create_file("filter", TRACE_MODE_WRITE, - dir->entry, dir, + res = eventfs_add_file("filter", TRACE_MODE_WRITE, + dir->ef, dir, &ftrace_subsystem_filter_fops); - if (!entry) { + if (res) { kfree(system->filter); 
system->filter = NULL; pr_warn("Could not create tracefs '%s/filter' entry\n", name); } - trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir, + eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir, &ftrace_system_enable_fops); } list_add(&dir->list, &tr->systems); - return dir->entry; + return dir->ef; out_free: kfree(dir); @@ -2419,36 +2404,40 @@ static int event_create_dir(struct dentry *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; + struct eventfs_file *ef_subsystem = NULL; struct trace_array *tr = file->tr; - struct dentry *d_events; + struct eventfs_file *ef; const char *name; int ret; /* * If the trace point header did not define TRACE_SYSTEM - * then the system would be called "TRACE_SYSTEM". + * then the system would be called "TRACE_SYSTEM". This should + * never happen. */ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { - d_events = event_subsystem_dir(tr, call->class->system, file, parent); - if (!d_events) - return -ENOMEM; - } else - d_events = parent; + if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) + return -ENODEV; + + ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent); + if (!ef_subsystem) + return -ENOMEM; name = trace_event_name(call); - file->dir = tracefs_create_dir(name, d_events); - if (!file->dir) { + ef = eventfs_add_dir(name, ef_subsystem); + if (IS_ERR(ef)) { pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } + file->ef = ef; + if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file, + eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file, &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", TRACE_MODE_READ, file->dir, + eventfs_add_file("id", TRACE_MODE_READ, file->ef, (void *)(long)call->event.type, &ftrace_event_id_fops); #endif @@ -2464,27 +2453,27 @@ 
event_create_dir(struct dentry *parent, struct trace_event_file *file) * triggers or filters. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - trace_create_file("filter", TRACE_MODE_WRITE, file->dir, + eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef, file, &ftrace_event_filter_fops); - trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, file, &event_trigger_fops); } #ifdef CONFIG_HIST_TRIGGERS - trace_create_file("hist", TRACE_MODE_READ, file->dir, file, + eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file, &event_hist_fops); #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG - trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file, + eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file, &event_hist_debug_fops); #endif - trace_create_file("format", TRACE_MODE_READ, file->dir, call, + eventfs_add_file("format", TRACE_MODE_READ, file->ef, call, &ftrace_event_format_fops); #ifdef CONFIG_TRACE_EVENT_INJECT if (call->event.type && call->class->reg) - trace_create_file("inject", 0200, file->dir, file, + eventfs_add_file("inject", 0200, file->ef, file, &event_inject_fops); #endif @@ -3637,21 +3626,22 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { struct dentry *d_events; struct dentry *entry; + int error = 0; entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) return -ENOMEM; - d_events = tracefs_create_dir("events", parent); - if (!d_events) { + d_events = eventfs_create_events_dir("events", parent); + if (IS_ERR(d_events)) { pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } - entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, + error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); - if (!entry) + if (error) return -ENOMEM; /* There are not as crucial, just warn if they are not created */ @@ -3664,11 +3654,11 
@@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) &ftrace_set_event_notrace_pid_fops); /* ring buffer internal formats */ - trace_create_file("header_page", TRACE_MODE_READ, d_events, + eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); - trace_create_file("header_event", TRACE_MODE_READ, d_events, + eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); @@ -3756,7 +3746,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - tracefs_remove(tr->event_dir); + eventfs_remove_events_dir(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1dad64267878..33264e510d16 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -46,15 +46,19 @@ static const char * ops[] = { OPS }; enum filter_pred_fn { FILTER_PRED_FN_NOP, FILTER_PRED_FN_64, + FILTER_PRED_FN_64_CPUMASK, FILTER_PRED_FN_S64, FILTER_PRED_FN_U64, FILTER_PRED_FN_32, + FILTER_PRED_FN_32_CPUMASK, FILTER_PRED_FN_S32, FILTER_PRED_FN_U32, FILTER_PRED_FN_16, + FILTER_PRED_FN_16_CPUMASK, FILTER_PRED_FN_S16, FILTER_PRED_FN_U16, FILTER_PRED_FN_8, + FILTER_PRED_FN_8_CPUMASK, FILTER_PRED_FN_S8, FILTER_PRED_FN_U8, FILTER_PRED_FN_COMM, @@ -64,21 +68,25 @@ enum filter_pred_fn { FILTER_PRED_FN_PCHAR_USER, FILTER_PRED_FN_PCHAR, FILTER_PRED_FN_CPU, + FILTER_PRED_FN_CPU_CPUMASK, + FILTER_PRED_FN_CPUMASK, + FILTER_PRED_FN_CPUMASK_CPU, FILTER_PRED_FN_FUNCTION, FILTER_PRED_FN_, FILTER_PRED_TEST_VISITED, }; struct filter_pred { - enum filter_pred_fn fn_num; - u64 val; - u64 val2; - struct regex regex; + struct regex *regex; + struct cpumask *mask; unsigned short *ops; struct ftrace_event_field *field; - int offset; + u64 val; + u64 val2; + enum filter_pred_fn fn_num; + 
int offset; int not; - int op; + int op; }; /* @@ -94,6 +102,8 @@ struct filter_pred { C(TOO_MANY_OPEN, "Too many '('"), \ C(TOO_MANY_CLOSE, "Too few '('"), \ C(MISSING_QUOTE, "Missing matching quote"), \ + C(MISSING_BRACE_OPEN, "Missing '{'"), \ + C(MISSING_BRACE_CLOSE, "Missing '}'"), \ C(OPERAND_TOO_LONG, "Operand too long"), \ C(EXPECT_STRING, "Expecting string field"), \ C(EXPECT_DIGIT, "Expecting numeric field"), \ @@ -103,6 +113,7 @@ struct filter_pred { C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ C(INVALID_FILTER, "Meaningless filter expression"), \ + C(INVALID_CPULIST, "Invalid cpulist"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ C(NO_FUNCTION, "Function not found"), \ @@ -186,6 +197,15 @@ enum { PROCESS_OR = 4, }; +static void free_predicate(struct filter_pred *pred) +{ + if (pred) { + kfree(pred->regex); + kfree(pred->mask); + kfree(pred); + } +} + /* * Without going into a formal proof, this explains the method that is used in * parsing the logical expressions. 
@@ -623,12 +643,64 @@ out_free: kfree(inverts); if (prog_stack) { for (i = 0; prog_stack[i].pred; i++) - kfree(prog_stack[i].pred); + free_predicate(prog_stack[i].pred); kfree(prog_stack); } return ERR_PTR(ret); } +static inline int +do_filter_cpumask(int op, const struct cpumask *mask, const struct cpumask *cmp) +{ + switch (op) { + case OP_EQ: + return cpumask_equal(mask, cmp); + case OP_NE: + return !cpumask_equal(mask, cmp); + case OP_BAND: + return cpumask_intersects(mask, cmp); + default: + return 0; + } +} + +/* Optimisation of do_filter_cpumask() for scalar fields */ +static inline int +do_filter_scalar_cpumask(int op, unsigned int cpu, const struct cpumask *mask) +{ + /* + * Per the weight-of-one cpumask optimisations, the mask passed in this + * function has a weight >= 2, so it is never equal to a single scalar. + */ + switch (op) { + case OP_EQ: + return false; + case OP_NE: + return true; + case OP_BAND: + return cpumask_test_cpu(cpu, mask); + default: + return 0; + } +} + +static inline int +do_filter_cpumask_scalar(int op, const struct cpumask *mask, unsigned int cpu) +{ + switch (op) { + case OP_EQ: + return cpumask_test_cpu(cpu, mask) && + cpumask_nth(1, mask) >= nr_cpu_ids; + case OP_NE: + return !cpumask_test_cpu(cpu, mask) || + cpumask_nth(1, mask) < nr_cpu_ids; + case OP_BAND: + return cpumask_test_cpu(cpu, mask); + default: + return 0; + } +} + enum pred_cmp_types { PRED_CMP_TYPE_NOP, PRED_CMP_TYPE_LT, @@ -672,6 +744,18 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ } \ } +#define DEFINE_CPUMASK_COMPARISON_PRED(size) \ +static int filter_pred_##size##_cpumask(struct filter_pred *pred, void *event) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + unsigned int cpu = *addr; \ + \ + if (cpu >= nr_cpu_ids) \ + return 0; \ + \ + return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); \ +} + #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ @@ 
-693,6 +777,11 @@ DEFINE_COMPARISON_PRED(u16); DEFINE_COMPARISON_PRED(s8); DEFINE_COMPARISON_PRED(u8); +DEFINE_CPUMASK_COMPARISON_PRED(64); +DEFINE_CPUMASK_COMPARISON_PRED(32); +DEFINE_CPUMASK_COMPARISON_PRED(16); +DEFINE_CPUMASK_COMPARISON_PRED(8); + DEFINE_EQUALITY_PRED(64); DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); @@ -750,7 +839,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event) char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); + cmp = pred->regex->match(addr, pred->regex, pred->regex->field_len); match = cmp ^ pred->not; @@ -763,7 +852,7 @@ static __always_inline int filter_pchar(struct filter_pred *pred, char *str) int len; len = strlen(str) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(str, &pred->regex, len); + cmp = pred->regex->match(str, pred->regex, len); match = cmp ^ pred->not; @@ -813,7 +902,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) char *addr = (char *)(event + str_loc); int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, str_len); + cmp = pred->regex->match(addr, pred->regex, str_len); match = cmp ^ pred->not; @@ -836,7 +925,7 @@ static int filter_pred_strrelloc(struct filter_pred *pred, void *event) char *addr = (char *)(&item[1]) + str_loc; int cmp, match; - cmp = pred->regex.match(addr, &pred->regex, str_len); + cmp = pred->regex->match(addr, pred->regex, str_len); match = cmp ^ pred->not; @@ -869,12 +958,42 @@ static int filter_pred_cpu(struct filter_pred *pred, void *event) } } +/* Filter predicate for current CPU vs user-provided cpumask */ +static int filter_pred_cpu_cpumask(struct filter_pred *pred, void *event) +{ + int cpu = raw_smp_processor_id(); + + return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); +} + +/* Filter predicate for cpumask field vs user-provided cpumask */ +static int filter_pred_cpumask(struct filter_pred *pred, void *event) +{ + u32 item = 
*(u32 *)(event + pred->offset); + int loc = item & 0xffff; + const struct cpumask *mask = (event + loc); + const struct cpumask *cmp = pred->mask; + + return do_filter_cpumask(pred->op, mask, cmp); +} + +/* Filter predicate for cpumask field vs user-provided scalar */ +static int filter_pred_cpumask_cpu(struct filter_pred *pred, void *event) +{ + u32 item = *(u32 *)(event + pred->offset); + int loc = item & 0xffff; + const struct cpumask *mask = (event + loc); + unsigned int cpu = pred->val; + + return do_filter_cpumask_scalar(pred->op, mask, cpu); +} + /* Filter predicate for COMM. */ static int filter_pred_comm(struct filter_pred *pred, void *event) { int cmp; - cmp = pred->regex.match(current->comm, &pred->regex, + cmp = pred->regex->match(current->comm, pred->regex, TASK_COMM_LEN); return cmp ^ pred->not; } @@ -1004,7 +1123,7 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) static void filter_build_regex(struct filter_pred *pred) { - struct regex *r = &pred->regex; + struct regex *r = pred->regex; char *search; enum regex_type type = MATCH_FULL; @@ -1169,7 +1288,7 @@ static void free_prog(struct event_filter *filter) return; for (i = 0; prog[i].pred; i++) - kfree(prog[i].pred); + free_predicate(prog[i].pred); kfree(prog); } @@ -1236,8 +1355,12 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, int filter_assign_type(const char *type) { - if (strstr(type, "__data_loc") && strstr(type, "char")) - return FILTER_DYN_STRING; + if (strstr(type, "__data_loc")) { + if (strstr(type, "char")) + return FILTER_DYN_STRING; + if (strstr(type, "cpumask_t")) + return FILTER_CPUMASK; + } if (strstr(type, "__rel_loc") && strstr(type, "char")) return FILTER_RDYN_STRING; @@ -1313,24 +1436,32 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) switch (pred->fn_num) { case FILTER_PRED_FN_64: return filter_pred_64(pred, event); + case FILTER_PRED_FN_64_CPUMASK: + return filter_pred_64_cpumask(pred, 
event); case FILTER_PRED_FN_S64: return filter_pred_s64(pred, event); case FILTER_PRED_FN_U64: return filter_pred_u64(pred, event); case FILTER_PRED_FN_32: return filter_pred_32(pred, event); + case FILTER_PRED_FN_32_CPUMASK: + return filter_pred_32_cpumask(pred, event); case FILTER_PRED_FN_S32: return filter_pred_s32(pred, event); case FILTER_PRED_FN_U32: return filter_pred_u32(pred, event); case FILTER_PRED_FN_16: return filter_pred_16(pred, event); + case FILTER_PRED_FN_16_CPUMASK: + return filter_pred_16_cpumask(pred, event); case FILTER_PRED_FN_S16: return filter_pred_s16(pred, event); case FILTER_PRED_FN_U16: return filter_pred_u16(pred, event); case FILTER_PRED_FN_8: return filter_pred_8(pred, event); + case FILTER_PRED_FN_8_CPUMASK: + return filter_pred_8_cpumask(pred, event); case FILTER_PRED_FN_S8: return filter_pred_s8(pred, event); case FILTER_PRED_FN_U8: @@ -1349,6 +1480,12 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event) return filter_pred_pchar(pred, event); case FILTER_PRED_FN_CPU: return filter_pred_cpu(pred, event); + case FILTER_PRED_FN_CPU_CPUMASK: + return filter_pred_cpu_cpumask(pred, event); + case FILTER_PRED_FN_CPUMASK: + return filter_pred_cpumask(pred, event); + case FILTER_PRED_FN_CPUMASK_CPU: + return filter_pred_cpumask_cpu(pred, event); case FILTER_PRED_FN_FUNCTION: return filter_pred_function(pred, event); case FILTER_PRED_TEST_VISITED: @@ -1553,9 +1690,130 @@ static int parse_pred(const char *str, void *data, goto err_free; } - pred->regex.len = len; - strncpy(pred->regex.pattern, str + s, len); - pred->regex.pattern[len] = 0; + pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + if (!pred->regex) + goto err_mem; + pred->regex->len = len; + strncpy(pred->regex->pattern, str + s, len); + pred->regex->pattern[len] = 0; + + } else if (!strncmp(str + i, "CPUS", 4)) { + unsigned int maskstart; + bool single; + char *tmp; + + switch (field->filter_type) { + case FILTER_CPUMASK: + case FILTER_CPU: + case 
FILTER_OTHER: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + switch (op) { + case OP_EQ: + case OP_NE: + case OP_BAND: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Skip CPUS */ + i += 4; + if (str[i++] != '{') { + parse_error(pe, FILT_ERR_MISSING_BRACE_OPEN, pos + i); + goto err_free; + } + maskstart = i; + + /* Walk the cpulist until closing } */ + for (; str[i] && str[i] != '}'; i++) + ; + + if (str[i] != '}') { + parse_error(pe, FILT_ERR_MISSING_BRACE_CLOSE, pos + i); + goto err_free; + } + + if (maskstart == i) { + parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); + goto err_free; + } + + /* Copy the cpulist between { and } */ + tmp = kmalloc((i - maskstart) + 1, GFP_KERNEL); + if (!tmp) + goto err_mem; + + strscpy(tmp, str + maskstart, (i - maskstart) + 1); + pred->mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!pred->mask) { + kfree(tmp); + goto err_mem; + } + + /* Now parse it */ + if (cpulist_parse(tmp, pred->mask)) { + kfree(tmp); + parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); + goto err_free; + } + kfree(tmp); + + /* Move along */ + i++; + + /* + * Optimisation: if the user-provided mask has a weight of one + * then we can treat it as a scalar input. + */ + single = cpumask_weight(pred->mask) == 1; + if (single) { + pred->val = cpumask_first(pred->mask); + kfree(pred->mask); + pred->mask = NULL; + } + + if (field->filter_type == FILTER_CPUMASK) { + pred->fn_num = single ? 
+ FILTER_PRED_FN_CPUMASK_CPU : + FILTER_PRED_FN_CPUMASK; + } else if (field->filter_type == FILTER_CPU) { + if (single) { + if (pred->op == OP_BAND) + pred->op = OP_EQ; + + pred->fn_num = FILTER_PRED_FN_CPU; + } else { + pred->fn_num = FILTER_PRED_FN_CPU_CPUMASK; + } + } else if (single) { + if (pred->op == OP_BAND) + pred->op = OP_EQ; + + pred->fn_num = select_comparison_fn(pred->op, field->size, false); + if (pred->op == OP_NE) + pred->not = 1; + } else { + switch (field->size) { + case 8: + pred->fn_num = FILTER_PRED_FN_64_CPUMASK; + break; + case 4: + pred->fn_num = FILTER_PRED_FN_32_CPUMASK; + break; + case 2: + pred->fn_num = FILTER_PRED_FN_16_CPUMASK; + break; + case 1: + pred->fn_num = FILTER_PRED_FN_8_CPUMASK; + break; + } + } /* This is either a string, or an integer */ } else if (str[i] == '\'' || str[i] == '"') { @@ -1597,9 +1855,12 @@ static int parse_pred(const char *str, void *data, goto err_free; } - pred->regex.len = len; - strncpy(pred->regex.pattern, str + s, len); - pred->regex.pattern[len] = 0; + pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + if (!pred->regex) + goto err_mem; + pred->regex->len = len; + strncpy(pred->regex->pattern, str + s, len); + pred->regex->pattern[len] = 0; filter_build_regex(pred); @@ -1608,7 +1869,7 @@ static int parse_pred(const char *str, void *data, } else if (field->filter_type == FILTER_STATIC_STRING) { pred->fn_num = FILTER_PRED_FN_STRING; - pred->regex.field_len = field->size; + pred->regex->field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { pred->fn_num = FILTER_PRED_FN_STRLOC; @@ -1691,10 +1952,10 @@ static int parse_pred(const char *str, void *data, return i; err_free: - kfree(pred); + free_predicate(pred); return -EINVAL; err_mem: - kfree(pred); + free_predicate(pred); return -ENOMEM; } @@ -2287,8 +2548,8 @@ static int ftrace_function_set_filter_pred(struct filter_pred *pred, return ret; return __ftrace_function_set_filter(pred->op == OP_EQ, - pred->regex.pattern, - 
pred->regex.len, + pred->regex->pattern, + pred->regex->len, data); } diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index abe805d471eb..8650562bdaa9 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -328,7 +328,8 @@ event_inject_read(struct file *file, char __user *buf, size_t size, } const struct file_operations event_inject_fops = { - .open = tracing_open_generic, + .open = tracing_open_file_tr, .read = event_inject_read, .write = event_inject_write, + .release = tracing_release_file_tr, }; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index d6a70aff2410..14cb275a0bab 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -127,7 +127,7 @@ static bool synth_event_match(const char *system, const char *event, struct synth_trace_event { struct trace_entry ent; - u64 fields[]; + union trace_synth_field fields[]; }; static int synth_event_define_fields(struct trace_event_call *call) @@ -321,23 +321,23 @@ static const char *synth_field_fmt(char *type) static void print_synth_event_num_val(struct trace_seq *s, char *print_fmt, char *name, - int size, u64 val, char *space) + int size, union trace_synth_field *val, char *space) { switch (size) { case 1: - trace_seq_printf(s, print_fmt, name, (u8)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u8, space); break; case 2: - trace_seq_printf(s, print_fmt, name, (u16)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u16, space); break; case 4: - trace_seq_printf(s, print_fmt, name, (u32)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u32, space); break; default: - trace_seq_printf(s, print_fmt, name, val, space); + trace_seq_printf(s, print_fmt, name, val->as_u64, space); break; } } @@ -350,7 +350,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; struct synth_trace_event *entry; 
struct synth_event *se; - unsigned int i, n_u64; + unsigned int i, j, n_u64; char print_fmt[32]; const char *fmt; @@ -374,43 +374,28 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, /* parameter values */ if (se->fields[i]->is_string) { if (se->fields[i]->is_dynamic) { - u32 offset, data_offset; - char *str_field; - - offset = (u32)entry->fields[n_u64]; - data_offset = offset & 0xffff; - - str_field = (char *)entry + data_offset; + union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, STR_VAR_LEN_MAX, - str_field, + (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? "" : " "); n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, STR_VAR_LEN_MAX, - (char *)&entry->fields[n_u64], + (char *)&entry->fields[n_u64].as_u64, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } } else if (se->fields[i]->is_stack) { - u32 offset, data_offset, len; - unsigned long *p, *end; - - offset = (u32)entry->fields[n_u64]; - data_offset = offset & 0xffff; - len = offset >> 16; - - p = (void *)entry + data_offset; - end = (void *)p + len - (sizeof(long) - 1); + union trace_synth_field *data = &entry->fields[n_u64]; + unsigned long *p = (void *)entry + data->as_dynamic.offset; trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); - - for (; *p && p < end; p++) - trace_seq_printf(s, "=> %pS\n", (void *)*p); + for (j = 1; j < data->as_dynamic.len / sizeof(long); j++) + trace_seq_printf(s, "=> %pS\n", (void *)p[j]); n_u64++; - } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -419,13 +404,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, print_synth_event_num_val(s, print_fmt, se->fields[i]->name, se->fields[i]->size, - entry->fields[n_u64], + &entry->fields[n_u64], space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", - 
entry->fields[n_u64], + entry->fields[n_u64].as_u64, __flags); trace_seq_putc(s, ')'); } @@ -454,21 +439,16 @@ static unsigned int trace_string(struct synth_trace_event *entry, int ret; if (is_dynamic) { - u32 data_offset; + union trace_synth_field *data = &entry->fields[*n_u64]; - data_offset = struct_size(entry, fields, event->n_u64); - data_offset += data_size; - - len = fetch_store_strlen((unsigned long)str_val); - - data_offset |= len << 16; - *(u32 *)&entry->fields[*n_u64] = data_offset; + data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size; + data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val); ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); (*n_u64)++; } else { - str_field = (char *)&entry->fields[*n_u64]; + str_field = (char *)&entry->fields[*n_u64].as_u64; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)str_val < TASK_SIZE) @@ -492,6 +472,7 @@ static unsigned int trace_stack(struct synth_trace_event *entry, unsigned int data_size, unsigned int *n_u64) { + union trace_synth_field *data = &entry->fields[*n_u64]; unsigned int len; u32 data_offset; void *data_loc; @@ -504,10 +485,6 @@ static unsigned int trace_stack(struct synth_trace_event *entry, break; } - /* Include the zero'd element if it fits */ - if (len < HIST_STACKTRACE_DEPTH) - len++; - len *= sizeof(long); /* Find the dynamic section to copy the stack into. 
*/ @@ -515,8 +492,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry, memcpy(data_loc, stack, len); /* Fill in the field that holds the offset/len combo */ - data_offset |= len << 16; - *(u32 *)&entry->fields[*n_u64] = data_offset; + + data->as_dynamic.offset = data_offset; + data->as_dynamic.len = len; (*n_u64)++; @@ -550,7 +528,8 @@ static notrace void trace_event_raw_event_synth(void *__data, str_val = (char *)(long)var_ref_vals[val_idx]; if (event->dynamic_fields[i]->is_stack) { - len = *((unsigned long *)str_val); + /* reserve one extra element for size */ + len = *((unsigned long *)str_val) + 1; len *= sizeof(unsigned long); } else { len = fetch_store_strlen((unsigned long)str_val); @@ -592,19 +571,19 @@ static notrace void trace_event_raw_event_synth(void *__data, switch (field->size) { case 1: - *(u8 *)&entry->fields[n_u64] = (u8)val; + entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&entry->fields[n_u64] = (u16)val; + entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&entry->fields[n_u64] = (u32)val; + entry->fields[n_u64].as_u32 = (u32)val; break; default: - entry->fields[n_u64] = val; + entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -1230,6 +1209,7 @@ EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); * synth_event_gen_cmd_array_start - Start synthetic event command from an array * @cmd: A pointer to the dynevent_cmd struct representing the new event * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module * @fields: An array of type/name field descriptions * @n_fields: The number of field descriptions contained in the fields array * @@ -1790,19 +1770,19 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) 
switch (field->size) { case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; + state.entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; + state.entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; + state.entry->fields[n_u64].as_u32 = (u32)val; break; default: - state.entry->fields[n_u64] = val; + state.entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -1883,19 +1863,19 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, switch (field->size) { case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; + state.entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; + state.entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; + state.entry->fields[n_u64].as_u32 = (u32)val; break; default: - state.entry->fields[n_u64] = val; + state.entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -2030,19 +2010,19 @@ static int __synth_event_add_val(const char *field_name, u64 val, } else { switch (field->size) { case 1: - *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + trace_state->entry->fields[field->offset].as_u8 = (u8)val; break; case 2: - *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + trace_state->entry->fields[field->offset].as_u16 = (u16)val; break; case 4: - *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + trace_state->entry->fields[field->offset].as_u32 = (u32)val; break; default: - trace_state->entry->fields[field->offset] = val; + trace_state->entry->fields[field->offset].as_u64 = val; break; } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e535959939d3..46439e3bcec4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -31,7 +31,9 @@ void trigger_data_free(struct event_trigger_data *data) /** * 
event_triggers_call - Call triggers associated with a trace event * @file: The trace_event_file associated with the event + * @buffer: The ring buffer that the event is being written to * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command. If rec is diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 33cb6af31f39..6f046650e527 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1328,14 +1328,14 @@ static int user_field_set_string(struct ftrace_event_field *field, static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head = &user->fields; int pos = 0, depth = 0; const char *str_func; pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - list_for_each_entry_safe_reverse(field, next, head, link) { + list_for_each_entry_reverse(field, head, link) { if (depth != 0) pos += snprintf(buf + pos, LEN_OR_ZERO, " "); @@ -1347,7 +1347,7 @@ static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - list_for_each_entry_safe_reverse(field, next, head, link) { + list_for_each_entry_reverse(field, head, link) { if (user_field_is_dyn_string(field->type, &str_func)) pos += snprintf(buf + pos, LEN_OR_ZERO, ", %s(%s)", str_func, field->name); @@ -1732,7 +1732,7 @@ static int user_event_create(const char *raw_command) static int user_event_show(struct seq_file *m, struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head; int depth = 0; @@ -1740,7 +1740,7 @@ static int user_event_show(struct seq_file 
*m, struct dyn_event *ev) head = trace_get_fields(&user->call); - list_for_each_entry_safe_reverse(field, next, head, link) { + list_for_each_entry_reverse(field, head, link) { if (depth == 0) seq_puts(m, " "); else @@ -1816,13 +1816,14 @@ out: static bool user_fields_match(struct user_event *user, int argc, const char **argv) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; struct list_head *head = &user->fields; int i = 0; - list_for_each_entry_safe_reverse(field, next, head, link) + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, &i)) return false; + } if (i != argc) return false; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 58f3946081e2..1698fc22afa0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -51,6 +51,9 @@ static int ftrace_event_register(struct trace_event_call *call, #undef __array #define __array(type, item, size) type item[size]; +#undef __stack_array +#define __stack_array(type, item, size, field) __array(type, item, size) + #undef __array_desc #define __array_desc(type, container, item, size) type item[size]; @@ -114,6 +117,9 @@ static void __always_unused ____ftrace_check_##name(void) \ is_signed_type(_type), .filter_type = FILTER_OTHER, \ .len = _len }, +#undef __stack_array +#define __stack_array(_type, _item, _len, _field) __array(_type, _item, _len) + #undef __array_desc #define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) @@ -149,6 +155,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \ #undef __array #define __array(type, item, len) +#undef __stack_array +#define __stack_array(type, item, len, field) + #undef __array_desc #define __array_desc(type, container, item, len) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index dfe2e546acdc..8bfe23af9c73 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -898,6 
+898,46 @@ static struct tracepoint *find_tracepoint(const char *tp_name) return data.tpoint; } +static int parse_symbol_and_return(int argc, const char *argv[], + char **symbol, bool *is_return, + bool is_tracepoint) +{ + char *tmp = strchr(argv[1], '%'); + int i; + + if (tmp) { + int len = tmp - argv[1]; + + if (!is_tracepoint && !strcmp(tmp, "%return")) { + *is_return = true; + } else { + trace_probe_log_err(len, BAD_ADDR_SUFFIX); + return -EINVAL; + } + *symbol = kmemdup_nul(argv[1], len, GFP_KERNEL); + } else + *symbol = kstrdup(argv[1], GFP_KERNEL); + if (!*symbol) + return -ENOMEM; + + if (*is_return) + return 0; + + /* If there is $retval, this should be a return fprobe. */ + for (i = 2; i < argc; i++) { + tmp = strstr(argv[i], "$retval"); + if (tmp && !isalnum(tmp[7]) && tmp[7] != '_') { + *is_return = true; + /* + * NOTE: Don't check is_tracepoint here, because it will + * be checked when the argument is parsed. + */ + break; + } + } + return 0; +} + static int __trace_fprobe_create(int argc, const char *argv[]) { /* @@ -927,7 +967,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) struct trace_fprobe *tf = NULL; int i, len, new_argc = 0, ret = 0; bool is_return = false; - char *symbol = NULL, *tmp = NULL; + char *symbol = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; const char **new_argv = NULL; int maxactive = 0; @@ -983,20 +1023,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); /* a symbol(or tracepoint) must be specified */ - symbol = kstrdup(argv[1], GFP_KERNEL); - if (!symbol) - return -ENOMEM; + ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint); + if (ret < 0) + goto parse_error; - tmp = strchr(symbol, '%'); - if (tmp) { - if (!is_tracepoint && !strcmp(tmp, "%return")) { - *tmp = '\0'; - is_return = true; - } else { - trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); - goto parse_error; - } - } if (!is_return && maxactive) { 
trace_probe_log_set_index(0); trace_probe_log_err(1, BAD_MAXACT_TYPE); @@ -1096,6 +1126,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } out: + traceprobe_finish_parse(&ctx); trace_probe_log_clear(); kfree(new_argv); kfree(symbol); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 2f37a6e68aa9..b791524a6536 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -635,7 +635,7 @@ static int s_mode_show(struct seq_file *s, void *v) else seq_printf(s, "%s", thread_mode_str[mode]); - if (mode != MODE_MAX) + if (mode < MODE_MAX - 1) /* if mode is any but last */ seq_puts(s, " "); return 0; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 590b3d51afae..ba37f768e2f2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -231,7 +231,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - + else + iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 23dba01831f7..3d7a180a8427 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -907,6 +907,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } out: + traceprobe_finish_parse(&ctx); trace_probe_log_clear(); kfree(new_argv); kfree(symbol); @@ -1561,15 +1562,10 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE : BPF_FD_TYPE_KPROBE; - if (tk->symbol) { - *symbol = tk->symbol; - *probe_offset = tk->rp.kp.offset; - *probe_addr = 0; - } else { - *symbol = NULL; - *probe_offset = 0; - *probe_addr = (unsigned long)tk->rp.kp.addr; - } + *probe_offset = tk->rp.kp.offset; + *probe_addr = kallsyms_show_value(current_cred()) ? 
+ (unsigned long)tk->rp.kp.addr : 0; + *symbol = tk->symbol; return 0; } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b2b726bea1f9..4dc74d73fc1d 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "trace_probe: " fmt #include <linux/bpf.h> +#include "trace_btf.h" #include "trace_probe.h" @@ -304,31 +305,90 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code, #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS -static struct btf *traceprobe_get_btf(void) +static u32 btf_type_int(const struct btf_type *t) { - struct btf *btf = bpf_get_btf_vmlinux(); + return *(u32 *)(t + 1); +} - if (IS_ERR_OR_NULL(btf)) - return NULL; +static bool btf_type_is_char_ptr(struct btf *btf, const struct btf_type *type) +{ + const struct btf_type *real_type; + u32 intdata; + s32 tid; + + real_type = btf_type_skip_modifiers(btf, type->type, &tid); + if (!real_type) + return false; + + if (BTF_INFO_KIND(real_type->info) != BTF_KIND_INT) + return false; - return btf; + intdata = btf_type_int(real_type); + return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) + && BTF_INT_BITS(intdata) == 8; } -static u32 btf_type_int(const struct btf_type *t) +static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type) { - return *(u32 *)(t + 1); + const struct btf_type *real_type; + const struct btf_array *array; + u32 intdata; + s32 tid; + + if (BTF_INFO_KIND(type->info) != BTF_KIND_ARRAY) + return false; + + array = (const struct btf_array *)(type + 1); + + real_type = btf_type_skip_modifiers(btf, array->type, &tid); + + intdata = btf_type_int(real_type); + return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) + && BTF_INT_BITS(intdata) == 8; } -static const char *type_from_btf_id(struct btf *btf, s32 id) +static int check_prepare_btf_string_fetch(char *typename, + struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = ctx->btf; + 
+ if (!btf || !ctx->last_type) + return 0; + + /* char [] does not need any change. */ + if (btf_type_is_char_array(btf, ctx->last_type)) + return 0; + + /* char * requires dereference the pointer. */ + if (btf_type_is_char_ptr(btf, ctx->last_type)) { + struct fetch_insn *code = *pcode + 1; + + if (code->op == FETCH_OP_END) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -E2BIG; + } + if (typename[0] == 'u') + code->op = FETCH_OP_UDEREF; + else + code->op = FETCH_OP_DEREF; + code->offset = 0; + *pcode = code; + return 0; + } + /* Other types are not available for string */ + trace_probe_log_err(ctx->offset, BAD_TYPE4STR); + return -EINVAL; +} + +static const char *fetch_type_from_btf_type(struct btf *btf, + const struct btf_type *type, + struct traceprobe_parse_context *ctx) { - const struct btf_type *t; u32 intdata; - s32 tid; /* TODO: const char * could be converted as a string */ - t = btf_type_skip_modifiers(btf, id, &tid); - - switch (BTF_INFO_KIND(t->info)) { + switch (BTF_INFO_KIND(type->info)) { case BTF_KIND_ENUM: /* enum is "int", so convert to "s32" */ return "s32"; @@ -341,7 +401,7 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) else return "x32"; case BTF_KIND_INT: - intdata = btf_type_int(t); + intdata = btf_type_int(type); if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) { switch (BTF_INT_BITS(intdata)) { case 8: @@ -364,6 +424,10 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) case 64: return "u64"; } + /* bitfield, size is encoded in the type */ + ctx->last_bitsize = BTF_INT_BITS(intdata); + ctx->last_bitoffs += BTF_INT_OFFSET(intdata); + return "u64"; } } /* TODO: support other types */ @@ -371,88 +435,223 @@ static const char *type_from_btf_id(struct btf *btf, s32 id) return NULL; } -static const struct btf_type *find_btf_func_proto(const char *funcname) +static int query_btf_context(struct traceprobe_parse_context *ctx) { - struct btf *btf = traceprobe_get_btf(); - const struct btf_type *t; - s32 id; + 
const struct btf_param *param; + const struct btf_type *type; + struct btf *btf; + s32 nr; - if (!btf || !funcname) - return ERR_PTR(-EINVAL); + if (ctx->btf) + return 0; + + if (!ctx->funcname) + return -EINVAL; + + type = btf_find_func_proto(ctx->funcname, &btf); + if (!type) + return -ENOENT; - id = btf_find_by_name_kind(btf, funcname, BTF_KIND_FUNC); - if (id <= 0) - return ERR_PTR(-ENOENT); + ctx->btf = btf; + ctx->proto = type; + + /* ctx->params is optional, since func(void) will not have params. */ + nr = 0; + param = btf_get_func_param(type, &nr); + if (!IS_ERR_OR_NULL(param)) { + /* Hide the first 'data' argument of tracepoint */ + if (ctx->flags & TPARG_FL_TPOINT) { + nr--; + param++; + } + } - /* Get BTF_KIND_FUNC type */ - t = btf_type_by_id(btf, id); - if (!btf_type_is_func(t)) - return ERR_PTR(-ENOENT); + if (nr > 0) { + ctx->nr_params = nr; + ctx->params = param; + } else { + ctx->nr_params = 0; + ctx->params = NULL; + } - /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ - t = btf_type_by_id(btf, t->type); - if (!btf_type_is_func_proto(t)) - return ERR_PTR(-ENOENT); + return 0; +} - return t; +static void clear_btf_context(struct traceprobe_parse_context *ctx) +{ + if (ctx->btf) { + btf_put(ctx->btf); + ctx->btf = NULL; + ctx->proto = NULL; + ctx->params = NULL; + ctx->nr_params = 0; + } } -static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, - bool tracepoint) +/* Return 1 if the field separater is arrow operator ('->') */ +static int split_next_field(char *varname, char **next_field, + struct traceprobe_parse_context *ctx) { - const struct btf_param *param; - const struct btf_type *t; + char *field; + int ret = 0; + + field = strpbrk(varname, ".-"); + if (field) { + if (field[0] == '-' && field[1] == '>') { + field[0] = '\0'; + field += 2; + ret = 1; + } else if (field[0] == '.') { + field[0] = '\0'; + field += 1; + } else { + trace_probe_log_err(ctx->offset + field - varname, BAD_HYPHEN); + return -EINVAL; + } + 
*next_field = field; + } - if (!funcname || !nr) - return ERR_PTR(-EINVAL); + return ret; +} - t = find_btf_func_proto(funcname); - if (IS_ERR(t)) - return (const struct btf_param *)t; +/* + * Parse the field of data structure. The @type must be a pointer type + * pointing the target data structure type. + */ +static int parse_btf_field(char *fieldname, const struct btf_type *type, + struct fetch_insn **pcode, struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code = *pcode; + const struct btf_member *field; + u32 bitoffs, anon_offs; + char *next; + int is_ptr; + s32 tid; - *nr = btf_type_vlen(t); - param = (const struct btf_param *)(t + 1); + do { + /* Outer loop for solving arrow operator ('->') */ + if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) { + trace_probe_log_err(ctx->offset, NO_PTR_STRCT); + return -EINVAL; + } + /* Convert a struct pointer type to a struct type */ + type = btf_type_skip_modifiers(ctx->btf, type->type, &tid); + if (!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } - /* Hide the first 'data' argument of tracepoint */ - if (tracepoint) { - (*nr)--; - param++; - } + bitoffs = 0; + do { + /* Inner loop for solving dot operator ('.') */ + next = NULL; + is_ptr = split_next_field(fieldname, &next, ctx); + if (is_ptr < 0) + return is_ptr; + + anon_offs = 0; + field = btf_find_struct_member(ctx->btf, type, fieldname, + &anon_offs); + if (!field) { + trace_probe_log_err(ctx->offset, NO_BTF_FIELD); + return -ENOENT; + } + /* Add anonymous structure/union offset */ + bitoffs += anon_offs; + + /* Accumulate the bit-offsets of the dot-connected fields */ + if (btf_type_kflag(type)) { + bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset); + ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset); + } else { + bitoffs += field->offset; + ctx->last_bitsize = 0; + } - if (*nr > 0) - return param; - else - return NULL; + type = btf_type_skip_modifiers(ctx->btf, field->type, &tid); + if 
(!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } + + ctx->offset += next - fieldname; + fieldname = next; + } while (!is_ptr && fieldname); + + if (++code == end) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + code->op = FETCH_OP_DEREF; /* TODO: user deref support */ + code->offset = bitoffs / 8; + *pcode = code; + + ctx->last_bitoffs = bitoffs % 8; + ctx->last_type = type; + } while (fieldname); + + return 0; } -static int parse_btf_arg(const char *varname, struct fetch_insn *code, +static int parse_btf_arg(char *varname, + struct fetch_insn **pcode, struct fetch_insn *end, struct traceprobe_parse_context *ctx) { - struct btf *btf = traceprobe_get_btf(); + struct fetch_insn *code = *pcode; const struct btf_param *params; - int i; + const struct btf_type *type; + char *field = NULL; + int i, is_ptr, ret; + u32 tid; + + if (WARN_ON_ONCE(!ctx->funcname)) + return -EINVAL; - if (!btf) { - trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + is_ptr = split_next_field(varname, &field, ctx); + if (is_ptr < 0) + return is_ptr; + if (!is_ptr && field) { + /* dot-connected field on an argument is not supported. 
*/ + trace_probe_log_err(ctx->offset + field - varname, + NOSUP_DAT_ARG); return -EOPNOTSUPP; } - if (WARN_ON_ONCE(!ctx->funcname)) - return -EINVAL; + if (ctx->flags & TPARG_FL_RETURN) { + if (strcmp(varname, "$retval") != 0) { + trace_probe_log_err(ctx->offset, NO_BTFARG); + return -ENOENT; + } + code->op = FETCH_OP_RETVAL; + /* Check whether the function return type is not void */ + if (query_btf_context(ctx) == 0) { + if (ctx->proto->type == 0) { + trace_probe_log_err(ctx->offset, NO_RETVAL); + return -ENOENT; + } + tid = ctx->proto->type; + goto found; + } + if (field) { + trace_probe_log_err(ctx->offset + field - varname, + NO_BTF_ENTRY); + return -ENOENT; + } + return 0; + } - if (!ctx->params) { - params = find_btf_func_param(ctx->funcname, &ctx->nr_params, - ctx->flags & TPARG_FL_TPOINT); - if (IS_ERR(params)) { + if (!ctx->btf) { + ret = query_btf_context(ctx); + if (ret < 0 || ctx->nr_params == 0) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); return PTR_ERR(params); } - ctx->params = params; - } else - params = ctx->params; + } + params = ctx->params; for (i = 0; i < ctx->nr_params; i++) { - const char *name = btf_name_by_offset(btf, params[i].name_off); + const char *name = btf_name_by_offset(ctx->btf, params[i].name_off); if (name && !strcmp(name, varname)) { code->op = FETCH_OP_ARG; @@ -460,91 +659,114 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, code->param = i + 1; else code->param = i; - return 0; + tid = params[i].type; + goto found; } } trace_probe_log_err(ctx->offset, NO_BTFARG); return -ENOENT; -} - -static const struct fetch_type *parse_btf_arg_type(int arg_idx, - struct traceprobe_parse_context *ctx) -{ - struct btf *btf = traceprobe_get_btf(); - const char *typestr = NULL; - if (btf && ctx->params) { - if (ctx->flags & TPARG_FL_TPOINT) - arg_idx--; - typestr = type_from_btf_id(btf, ctx->params[arg_idx].type); +found: + type = btf_type_skip_modifiers(ctx->btf, tid, &tid); + if (!type) { + 
trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; } - - return find_fetch_type(typestr, ctx->flags); + /* Initialize the last type information */ + ctx->last_type = type; + ctx->last_bitoffs = 0; + ctx->last_bitsize = 0; + if (field) { + ctx->offset += field - varname; + return parse_btf_field(field, type, pcode, end, ctx); + } + return 0; } -static const struct fetch_type *parse_btf_retval_type( +static const struct fetch_type *find_fetch_type_from_btf_type( struct traceprobe_parse_context *ctx) { - struct btf *btf = traceprobe_get_btf(); + struct btf *btf = ctx->btf; const char *typestr = NULL; - const struct btf_type *t; - if (btf && ctx->funcname) { - t = find_btf_func_proto(ctx->funcname); - if (!IS_ERR(t)) - typestr = type_from_btf_id(btf, t->type); - } + if (btf && ctx->last_type) + typestr = fetch_type_from_btf_type(btf, ctx->last_type, ctx); return find_fetch_type(typestr, ctx->flags); } -static bool is_btf_retval_void(const char *funcname) +static int parse_btf_bitfield(struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) { - const struct btf_type *t; + struct fetch_insn *code = *pcode; - t = find_btf_func_proto(funcname); - if (IS_ERR(t)) - return false; + if ((ctx->last_bitsize % 8 == 0) && ctx->last_bitoffs == 0) + return 0; + + code++; + if (code->op != FETCH_OP_NOP) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + *pcode = code; - return t->type == 0; + code->op = FETCH_OP_MOD_BF; + code->lshift = 64 - (ctx->last_bitsize + ctx->last_bitoffs); + code->rshift = 64 - ctx->last_bitsize; + code->basesize = 64 / 8; + return 0; } + #else -static struct btf *traceprobe_get_btf(void) +static void clear_btf_context(struct traceprobe_parse_context *ctx) { - return NULL; + ctx->btf = NULL; } -static const struct btf_param *find_btf_func_param(const char *funcname, s32 *nr, - bool tracepoint) +static int query_btf_context(struct traceprobe_parse_context *ctx) { - return ERR_PTR(-EOPNOTSUPP); + return 
-EOPNOTSUPP; } -static int parse_btf_arg(const char *varname, struct fetch_insn *code, +static int parse_btf_arg(char *varname, + struct fetch_insn **pcode, struct fetch_insn *end, struct traceprobe_parse_context *ctx) { trace_probe_log_err(ctx->offset, NOSUP_BTFARG); return -EOPNOTSUPP; } -#define parse_btf_arg_type(idx, ctx) \ - find_fetch_type(NULL, ctx->flags) +static int parse_btf_bitfield(struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} -#define parse_btf_retval_type(ctx) \ +#define find_fetch_type_from_btf_type(ctx) \ find_fetch_type(NULL, ctx->flags) -#define is_btf_retval_void(funcname) (false) +static int check_prepare_btf_string_fetch(char *typename, + struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + return 0; +} #endif #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, +/* Parse $vars. 
@orig_arg points '$', which syncs to @ctx->offset */ +static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, + struct fetch_insn **pcode, + struct fetch_insn *end, struct traceprobe_parse_context *ctx) { - unsigned long param; + struct fetch_insn *code = *pcode; int err = TP_ERR_BAD_VAR; + char *arg = orig_arg + 1; + unsigned long param; int ret = 0; int len; @@ -563,18 +785,17 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, goto inval; } - if (strcmp(arg, "retval") == 0) { - if (ctx->flags & TPARG_FL_RETURN) { - if ((ctx->flags & TPARG_FL_KERNEL) && - is_btf_retval_void(ctx->funcname)) { - err = TP_ERR_NO_RETVAL; - goto inval; - } + if (str_has_prefix(arg, "retval")) { + if (!(ctx->flags & TPARG_FL_RETURN)) { + err = TP_ERR_RETVAL_ON_PROBE; + goto inval; + } + if (!(ctx->flags & TPARG_FL_KERNEL) || + !IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) { code->op = FETCH_OP_RETVAL; return 0; } - err = TP_ERR_RETVAL_ON_PROBE; - goto inval; + return parse_btf_arg(orig_arg, pcode, end, ctx); } len = str_has_prefix(arg, "stack"); @@ -676,7 +897,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, type, code, ctx); + ret = parse_probe_vars(arg, type, pcode, end, ctx); break; case '%': /* named register */ @@ -795,6 +1016,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = deref; code->offset = offset; + /* Reset the last type if used */ + ctx->last_type = NULL; } break; case '\\': /* Immediate value */ @@ -818,7 +1041,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, trace_probe_log_err(ctx->offset, NOSUP_BTFARG); return -EINVAL; } - ret = parse_btf_arg(arg, code, ctx); + ret = parse_btf_arg(arg, pcode, end, ctx); break; } } @@ -964,17 +1187,22 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, goto out; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + ctx->last_type = NULL; ret = parse_probe_arg(arg, 
parg->type, &code, &code[FETCH_INSN_MAX - 1], ctx); if (ret) goto fail; /* Update storing type if BTF is available */ - if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && !t) { - if (code->op == FETCH_OP_ARG) - parg->type = parse_btf_arg_type(code->param, ctx); - else if (code->op == FETCH_OP_RETVAL) - parg->type = parse_btf_retval_type(ctx); + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + ctx->last_type) { + if (!t) { + parg->type = find_fetch_type_from_btf_type(ctx); + } else if (strstr(t, "string")) { + ret = check_prepare_btf_string_fetch(t, &code, ctx); + if (ret) + goto fail; + } } ret = -EINVAL; @@ -1048,6 +1276,11 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); goto fail; } + } else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + ctx->last_type) { + ret = parse_btf_bitfield(&code, ctx); + if (ret) + goto fail; } ret = -EINVAL; /* Loop(Array) operation */ @@ -1231,7 +1464,6 @@ static int sprint_nth_btf_arg(int idx, const char *type, char *buf, int bufsize, struct traceprobe_parse_context *ctx) { - struct btf *btf = traceprobe_get_btf(); const char *name; int ret; @@ -1239,7 +1471,7 @@ static int sprint_nth_btf_arg(int idx, const char *type, trace_probe_log_err(0, NO_BTFARG); return -ENOENT; } - name = btf_name_by_offset(btf, ctx->params[idx].name_off); + name = btf_name_by_offset(ctx->btf, ctx->params[idx].name_off); if (!name) { trace_probe_log_err(0, NO_BTF_ENTRY); return -ENOENT; @@ -1260,7 +1492,6 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], const struct btf_param *params = NULL; int i, j, n, used, ret, args_idx = -1; const char **new_argv = NULL; - int nr_params; ret = argv_has_var_arg(argc, argv, &args_idx, ctx); if (ret < 0) @@ -1271,9 +1502,8 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], return NULL; } - params = find_btf_func_param(ctx->funcname, &nr_params, - ctx->flags & TPARG_FL_TPOINT); - if 
(IS_ERR(params)) { + ret = query_btf_context(ctx); + if (ret < 0 || ctx->nr_params == 0) { if (args_idx != -1) { /* $arg* requires BTF info */ trace_probe_log_err(0, NOSUP_BTFARG); @@ -1282,8 +1512,6 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], *new_argc = argc; return NULL; } - ctx->params = params; - ctx->nr_params = nr_params; if (args_idx >= 0) *new_argc = argc + ctx->nr_params - 1; @@ -1298,7 +1526,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], for (i = 0, j = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); if (i == args_idx) { - for (n = 0; n < nr_params; n++) { + for (n = 0; n < ctx->nr_params; n++) { ret = sprint_nth_btf_arg(n, "", buf + used, bufsize - used, ctx); if (ret < 0) @@ -1337,6 +1565,11 @@ error: return ERR_PTR(ret); } +void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) +{ + clear_btf_context(ctx); +} + int traceprobe_update_arg(struct probe_arg *arg) { struct fetch_insn *code = arg->code; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 01ea148723de..02b432ae7513 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -383,9 +383,15 @@ static inline bool tparg_is_function_entry(unsigned int flags) struct traceprobe_parse_context { struct trace_event_call *event; - const struct btf_param *params; - s32 nr_params; - const char *funcname; + /* BTF related parameters */ + const char *funcname; /* Function name in BTF */ + const struct btf_type *proto; /* Prototype of the function */ + const struct btf_param *params; /* Parameter of the function */ + s32 nr_params; /* The number of the parameters */ + struct btf *btf; /* The BTF to be used */ + const struct btf_type *last_type; /* Saved type */ + u32 last_bitoffs; /* Saved bitoffs */ + u32 last_bitsize; /* Saved bitsize */ unsigned int flags; int offset; }; @@ -400,6 +406,12 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], extern int 
traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); +/* + * If either traceprobe_parse_probe_arg() or traceprobe_expand_meta_args() is called, + * this MUST be called for clean up the context and return a resource. + */ +void traceprobe_finish_parse(struct traceprobe_parse_context *ctx); + extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset); @@ -495,7 +507,14 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ - C(ARGS_2LONG, "$arg* failed because the argument list is too long"), + C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \ + C(ARGIDX_2BIG, "$argN index is too big"), \ + C(NO_PTR_STRCT, "This is not a pointer to union/structure."), \ + C(NOSUP_DAT_ARG, "Non pointer structure/union argument is not supported."),\ + C(BAD_HYPHEN, "Failed to parse single hyphen. 
Forgot '>'?"), \ + C(NO_BTF_FIELD, "This field is not found."), \ + C(BAD_BTF_TID, "Failed to get BTF type info."),\ + C(BAD_TYPE4STR, "This type does not fit for string."), #undef C #define C(a, b) TP_ERR_##a @@ -519,3 +538,8 @@ void __trace_probe_log_err(int offset, int err); #define trace_probe_log_err(offs, err) \ __trace_probe_log_err(offs, TP_ERR_##err) + +struct uprobe_dispatch_data { + struct trace_uprobe *tu; + unsigned long bp_addr; +}; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 330aee1c1a49..0469a04a355f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -168,6 +168,8 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); + else + iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index e5e299260d0c..bac06ee3b98b 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string + * @args: Arguments for the format string * * The tracer may use either sequence operations or its own * copy to user routines. 
To simplify formatting of a trace diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 942ddbdace4a..de753403cdaf 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -555,12 +555,15 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re struct syscall_trace_enter *rec) { struct syscall_tp_t { - unsigned long long regs; + struct trace_entry ent; unsigned long syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; - } param; + } __aligned(8) param; int i; + BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); + + /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */ *(struct pt_regs **)&param = regs; param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) @@ -657,11 +660,12 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg struct syscall_trace_exit *rec) { struct syscall_tp_t { - unsigned long long regs; + struct trace_entry ent; unsigned long syscall_nr; unsigned long ret; - } param; + } __aligned(8) param; + /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a.
&param) */ *(struct pt_regs **)&param = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 688bf579f2f1..99c051de412a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -88,11 +88,6 @@ static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) static int register_uprobe_event(struct trace_uprobe *tu); static int unregister_uprobe_event(struct trace_uprobe *tu); -struct uprobe_dispatch_data { - struct trace_uprobe *tu; - unsigned long bp_addr; -}; - static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs); @@ -693,6 +688,7 @@ static int __trace_uprobe_create(int argc, const char **argv) trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); + traceprobe_finish_parse(&ctx); if (ret) goto error; } @@ -1352,7 +1348,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (bpf_prog_array_valid(call)) { u32 ret; - ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run); + ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run); if (!ret) return; } @@ -1418,7 +1414,7 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, const char **filename, u64 *probe_offset, - bool perf_type_tracepoint) + u64 *probe_addr, bool perf_type_tracepoint) { const char *pevent = trace_event_name(event->tp_event); const char *group = event->tp_event->class->system; @@ -1435,6 +1431,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, : BPF_FD_TYPE_UPROBE; *filename = tu->filename; *probe_offset = tu->offset; + *probe_addr = 0; return 0; } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/ucount.c b/kernel/ucount.c index ee8e57fd6f90..4aa6166cb856 100644 ---
a/kernel/ucount.c +++ b/kernel/ucount.c @@ -104,7 +104,8 @@ bool setup_userns_sysctls(struct user_namespace *ns) for (i = 0; i < UCOUNT_COUNTS; i++) { tbl[i].data = &ns->ucount_max[i]; } - ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); + ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl, + ARRAY_SIZE(user_table)); } if (!ns->sysctls) { kfree(tbl); @@ -364,7 +365,7 @@ static __init int user_namespace_sysctl_init(void) * default set so that registrations in the child sets work * properly. */ - user_header = register_sysctl("user", empty); + user_header = register_sysctl_sz("user", empty, 0); kmemleak_ignore(user_header); BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index be38276a365f..d145305d95fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -151,9 +151,6 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) */ if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); - struct cpumask backtrace_mask; - - cpumask_copy(&backtrace_mask, cpu_online_mask); /* Only print hardlockups once. 
*/ if (per_cpu(watchdog_hardlockup_warned, cpu)) @@ -167,10 +164,8 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) show_regs(regs); else dump_stack(); - cpumask_clear_cpu(cpu, &backtrace_mask); } else { - if (trigger_single_cpu_backtrace(cpu)) - cpumask_clear_cpu(cpu, &backtrace_mask); + trigger_single_cpu_backtrace(cpu); } /* @@ -179,7 +174,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) */ if (sysctl_hardlockup_all_cpu_backtrace && !test_and_set_bit(0, &watchdog_hardlockup_all_cpu_dumped)) - trigger_cpumask_backtrace(&backtrace_mask); + trigger_allbutcpu_cpu_backtrace(cpu); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); @@ -523,7 +518,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) dump_stack(); if (softlockup_all_cpu_backtrace) { - trigger_allbutself_cpu_backtrace(); + trigger_allbutcpu_cpu_backtrace(smp_processor_id()); clear_bit_unlock(0, &soft_lockup_nmi_warn); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02a8f402eeb5..c85825e17df8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -52,6 +52,7 @@ #include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/kvm_para.h> +#include <linux/delay.h> #include "workqueue_internal.h" @@ -121,11 +122,6 @@ enum { * * L: pool->lock protected. Access with pool->lock held. * - * X: During normal operation, modification requires pool->lock and should - * be done only from local cpu. Either disabling preemption on local - * cpu or grabbing pool->lock is enough for read access. If - * POOL_DISASSOCIATED is set, it's identical to L. - * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. 
@@ -159,7 +155,7 @@ struct worker_pool { int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ @@ -215,6 +211,7 @@ enum pool_workqueue_stats { PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ + PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ @@ -261,12 +258,12 @@ struct pool_workqueue { u64 stats[PWQ_NR_STATS]; /* - * Release of unbound pwq is punted to system_wq. See put_pwq() - * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also RCU protected so that the first pwq can be - * determined without grabbing wq->mutex. + * Release of unbound pwq is punted to a kthread_worker. See put_pwq() + * and pwq_release_workfn() for details. pool_workqueue itself is also + * RCU protected so that the first pwq can be determined without + * grabbing wq->mutex. 
*/ - struct work_struct unbound_release_work; + struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -325,36 +322,52 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ - struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ + struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; -static cpumask_var_t *wq_numa_possible_cpumask; - /* possible CPUs of each node */ +/* + * Each pod type describes how CPUs should be grouped for unbound workqueues. + * See the comment above workqueue_attrs->affn_scope. + */ +struct wq_pod_type { + int nr_pods; /* number of pods */ + cpumask_var_t *pod_cpus; /* pod -> cpus */ + int *pod_node; /* pod -> node */ + int *cpu_pod; /* cpu -> pod */ +}; + +static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; + +static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { + [WQ_AFFN_DFL] = "default", + [WQ_AFFN_CPU] = "cpu", + [WQ_AFFN_SMT] = "smt", + [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_NUMA] = "numa", + [WQ_AFFN_SYSTEM] = "system", +}; /* * Per-cpu work items which run for longer than the following threshold are * automatically considered CPU intensive and excluded from concurrency * management to prevent them from noticeably delaying other per-cpu work items. + * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. + * The actual value is initialized in wq_cpu_intensive_thresh_init(). 
*/ -static unsigned long wq_cpu_intensive_thresh_us = 10000; +static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); -static bool wq_disable_numa; -module_param_named(disable_numa, wq_disable_numa, bool, 0444); - /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ -static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ - -/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ -static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; +/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_pod_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -368,6 +381,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* for further constrain wq_unbound_cpumask by cmdline parameter*/ +static struct cpumask wq_cmdline_cpumask __initdata; + /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); @@ -397,6 +413,13 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; +/* + * I: kthread_worker to release pwq's. pwq release needs to be bounced to a + * process context while holding a pool lock. Bounce to a dedicated kthread + * worker to avoid A-A deadlocks. 
+ */ +static struct kthread_worker *pwq_release_worker; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -603,35 +626,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * unbound_pwq_by_node - return the unbound pool_workqueue for the given node - * @wq: the target workqueue - * @node: the node ID - * - * This must be called with any of wq_pool_mutex, wq->mutex or RCU - * read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. - * - * Return: The unbound pool_workqueue for @node. - */ -static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, - int node) -{ - assert_rcu_or_wq_mutex_or_pool_mutex(wq); - - /* - * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a - * delayed item is pending. The plan is to keep CPU -> NODE - * mapping valid and stable across CPU on/offlines. Once that - * happens, this workaround can be removed. - */ - if (unlikely(node == NUMA_NO_NODE)) - return wq->dfl_pwq; - - return rcu_dereference_raw(wq->numa_pwq_tbl[node]); -} - static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -822,11 +816,6 @@ static bool work_is_canceling(struct work_struct *work) * they're being called with pool->lock held. */ -static bool __need_more_worker(struct worker_pool *pool) -{ - return !pool->nr_running; -} - /* * Need to wake up a worker? Called from anything but currently * running workers. @@ -837,7 +826,7 @@ static bool __need_more_worker(struct worker_pool *pool) */ static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && __need_more_worker(pool); + return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. 
*/ @@ -868,51 +857,18 @@ static bool too_many_workers(struct worker_pool *pool) return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } -/* - * Wake up functions. - */ - -/* Return the first idle worker. Called with pool->lock held. */ -static struct worker *first_idle_worker(struct worker_pool *pool) -{ - if (unlikely(list_empty(&pool->idle_list))) - return NULL; - - return list_first_entry(&pool->idle_list, struct worker, entry); -} - -/** - * wake_up_worker - wake up an idle worker - * @pool: worker pool to wake worker from - * - * Wake up the first idle worker of @pool. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - */ -static void wake_up_worker(struct worker_pool *pool) -{ - struct worker *worker = first_idle_worker(pool); - - if (likely(worker)) - wake_up_process(worker->task); -} - /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && @@ -929,16 +885,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. 
- * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); worker->flags &= ~flags; @@ -952,6 +905,244 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) pool->nr_running++; } +/* Return the first idle worker. Called with pool->lock held. */ +static struct worker *first_idle_worker(struct worker_pool *pool) +{ + if (unlikely(list_empty(&pool->idle_list))) + return NULL; + + return list_first_entry(&pool->idle_list, struct worker, entry); +} + +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_enter_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; + + /* can't use worker_set_flags(), also called from create_worker() */ + worker->flags |= WORKER_IDLE; + pool->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &pool->idle_list); + + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); + + /* Sanity check nr_running. */ + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). 
+ */ +static void worker_leave_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; + worker_clr_flags(worker, WORKER_IDLE); + pool->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @pool: pool of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @pool by searching + * @pool->busy_hash which is keyed by the address of @work. For a worker + * to match, its current execution should match the address of @work and + * its work function. This is to avoid unwanted dependency between + * unrelated work executions through a work item being recycled while still + * being executed. + * + * This is a bit tricky. A work item may be freed once its execution + * starts and nothing prevents the freed area from being recycled for + * another work item. If the same work item address ends up being reused + * before the original execution finishes, workqueue will identify the + * recycled work item as currently executing and make it wait until the + * current execution finishes, introducing an unwanted dependency. + * + * This function checks the work item address and work function to avoid + * false positives. Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item. Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + * + * Return: + * Pointer to worker which is executing @work if found, %NULL + * otherwise. 
+ */ +static struct worker *find_worker_executing_work(struct worker_pool *pool, + struct work_struct *work) +{ + struct worker *worker; + + hash_for_each_possible(pool->busy_hash, worker, hentry, + (unsigned long)work) + if (worker->current_work == work && + worker->current_func == work->func) + return worker; + + return NULL; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out parameter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to be + * scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on + * @nextp. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +/** + * assign_work - assign a work item and its linked work items to a worker + * @work: work to assign + * @worker: worker to assign to + * @nextp: out parameter for nested worklist walking + * + * Assign @work and its linked work items to @worker. If @work is already being + * executed by another worker in the same pool, it'll be punted there. + * + * If @nextp is not NULL, it's updated to point to the next work of the last + * scheduled work. This allows assign_work() to be nested inside + * list_for_each_entry_safe(). 
+ * + * Returns %true if @work was successfully assigned to @worker. %false if @work + * was punted to another worker already executing it. + */ +static bool assign_work(struct work_struct *work, struct worker *worker, + struct work_struct **nextp) +{ + struct worker_pool *pool = worker->pool; + struct worker *collision; + + lockdep_assert_held(&pool->lock); + + /* + * A single work shouldn't be executed concurrently by multiple workers. + * __queue_work() ensures that @work doesn't jump to a different pool + * while still running in the previous pool. Here, we should ensure that + * @work is not executed concurrently by multiple workers from the same + * pool. Check whether anyone is already processing the work. If so, + * defer the work to the currently executing one. + */ + collision = find_worker_executing_work(pool, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, nextp); + return false; + } + + move_linked_works(work, &worker->scheduled, nextp); + return true; +} + +/** + * kick_pool - wake up an idle worker if necessary + * @pool: pool to kick + * + * @pool may have pending work items. Wake up worker if necessary. Returns + * whether a worker was woken up. + */ +static bool kick_pool(struct worker_pool *pool) +{ + struct worker *worker = first_idle_worker(pool); + struct task_struct *p; + + lockdep_assert_held(&pool->lock); + + if (!need_more_worker(pool) || !worker) + return false; + + p = worker->task; + +#ifdef CONFIG_SMP + /* + * Idle @worker is about to execute @work and waking up provides an + * opportunity to migrate @worker at a lower cost by setting the task's + * wake_cpu field. Let's see if we want to move @worker to improve + * execution locality. + * + * We're waking the worker that went idle the latest and there's some + * chance that @worker is marked idle but hasn't gone off CPU yet. If + * so, setting the wake_cpu won't do anything. 
As this is a best-effort + * optimization and the race window is narrow, let's leave as-is for + * now. If this becomes pronounced, we can skip over workers which are + * still on cpu when picking an idle worker. + * + * If @pool has non-strict affinity, @worker might have ended up outside + * its affinity scope. Repatriate. + */ + if (!pool->attrs->affn_strict && + !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { + struct work_struct *work = list_first_entry(&pool->worklist, + struct work_struct, entry); + p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask); + get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; + } +#endif + wake_up_process(p); + return true; +} + #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT /* @@ -1117,10 +1308,9 @@ void wq_worker_sleeping(struct task_struct *task) } pool->nr_running--; - if (need_more_worker(pool)) { + if (kick_pool(pool)) worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; - wake_up_worker(pool); - } + raw_spin_unlock_irq(&pool->lock); } @@ -1168,10 +1358,8 @@ void wq_worker_tick(struct task_struct *task) wq_cpu_intensive_report(worker->current_func); pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; - if (need_more_worker(pool)) { + if (kick_pool(pool)) pwq->stats[PWQ_STAT_CM_WAKEUP]++; - wake_up_worker(pool); - } raw_spin_unlock(&pool->lock); } @@ -1208,94 +1396,6 @@ work_func_t wq_worker_last_func(struct task_struct *task) } /** - * find_worker_executing_work - find worker which is executing a work - * @pool: pool of interest - * @work: work to find worker for - * - * Find a worker which is executing @work on @pool by searching - * @pool->busy_hash which is keyed by the address of @work. For a worker - * to match, its current execution should match the address of @work and - * its work function. This is to avoid unwanted dependency between - * unrelated work executions through a work item being recycled while still - * being executed. - * - * This is a bit tricky. 
A work item may be freed once its execution - * starts and nothing prevents the freed area from being recycled for - * another work item. If the same work item address ends up being reused - * before the original execution finishes, workqueue will identify the - * recycled work item as currently executing and make it wait until the - * current execution finishes, introducing an unwanted dependency. - * - * This function checks the work item address and work function to avoid - * false positives. Note that this isn't complete as one may construct a - * work function which can introduce dependency onto itself through a - * recycled work item. Well, if somebody wants to shoot oneself in the - * foot that badly, there's only so much we can do, and if such deadlock - * actually occurs, it should be easy to locate the culprit work function. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - * - * Return: - * Pointer to worker which is executing @work if found, %NULL - * otherwise. - */ -static struct worker *find_worker_executing_work(struct worker_pool *pool, - struct work_struct *work) -{ - struct worker *worker; - - hash_for_each_possible(pool->busy_hash, worker, hentry, - (unsigned long)work) - if (worker->current_work == work && - worker->current_func == work->func) - return worker; - - return NULL; -} - -/** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out parameter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). 
- */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. - */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - -/** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * @@ -1321,17 +1421,11 @@ static void put_pwq(struct pool_workqueue *pwq) lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; - if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) - return; /* - * @pwq can't be released under pool->lock, bounce to - * pwq_unbound_release_workfn(). This never recurses on the same - * pool->lock as this path is taken only for unbound workqueues and - * the release work item is scheduled on a per-cpu workqueue. To - * avoid lockdep warning, unbound pool->locks are given lockdep - * subclass of 1 in get_unbound_pool(). + * @pwq can't be released under pool->lock, bounce to a dedicated + * kthread_worker to avoid A-A deadlocks. 
*/ - schedule_work(&pwq->unbound_release_work); + kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** @@ -1547,7 +1641,7 @@ fail: static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct worker_pool *pool = pwq->pool; + debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack_noalloc(work); @@ -1556,9 +1650,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); - - if (__need_more_worker(pool)) - wake_up_worker(pool); } /* @@ -1612,8 +1703,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; - struct worker_pool *last_pool; - struct list_head *worklist; + struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1637,23 +1727,23 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ - if (wq->flags & WQ_UNBOUND) { - if (req_cpu == WORK_CPU_UNBOUND) + if (req_cpu == WORK_CPU_UNBOUND) { + if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - } else { - if (req_cpu == WORK_CPU_UNBOUND) + else cpu = raw_smp_processor_id(); - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); } + pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); + pool = pwq->pool; + /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. 
*/ last_pool = get_work_pool(work); - if (last_pool && last_pool != pwq->pool) { + if (last_pool && last_pool != pool) { struct worker *worker; raw_spin_lock(&last_pool->lock); @@ -1662,26 +1752,27 @@ retry: if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; + pool = pwq->pool; + WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } } else { - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } /* - * pwq is determined and locked. For unbound pools, we could have - * raced with pwq release and it could already be dead. If its - * refcnt is zero, repeat pwq selection. Note that pwqs never die - * without another pwq replacing it in the numa_pwq_tbl or while - * work items are executing on it, so the retrying is guaranteed to - * make forward-progress. + * pwq is determined and locked. For unbound pools, we could have raced + * with pwq release and it could already be dead. If its refcnt is zero, + * repeat pwq selection. Note that unbound pwqs never die without + * another pwq replacing it in cpu_pwq or while work items are executing + * on it, so the retrying is guaranteed to make forward-progress. 
*/ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } @@ -1700,21 +1791,20 @@ retry: work_flags = work_color_to_flags(pwq->work_color); if (likely(pwq->nr_active < pwq->max_active)) { + if (list_empty(&pool->worklist)) + pool->watchdog_ts = jiffies; + trace_workqueue_activate_work(work); pwq->nr_active++; - worklist = &pwq->pool->worklist; - if (list_empty(worklist)) - pwq->pool->watchdog_ts = jiffies; + insert_work(pwq, work, &pool->worklist, work_flags); + kick_pool(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; - worklist = &pwq->inactive_works; + insert_work(pwq, work, &pwq->inactive_works, work_flags); } - debug_work_activate(work); - insert_work(pwq, work, worklist, work_flags); - out: - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); } @@ -1751,7 +1841,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, EXPORT_SYMBOL(queue_work_on); /** - * workqueue_select_cpu_near - Select a CPU based on NUMA node + * select_numa_node_cpu - Select a CPU based on NUMA node * @node: NUMA node ID that we want to select a CPU from * * This function will attempt to find a "random" cpu available on a given @@ -1759,14 +1849,10 @@ EXPORT_SYMBOL(queue_work_on); * WORK_CPU_UNBOUND indicating that we should just schedule to any * available CPU if we need to schedule this work. 
*/ -static int workqueue_select_cpu_near(int node) +static int select_numa_node_cpu(int node) { int cpu; - /* No point in doing this if NUMA isn't enabled for workqueues */ - if (!wq_numa_enabled) - return WORK_CPU_UNBOUND; - /* Delay binding to CPU if node is not valid or online */ if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) return WORK_CPU_UNBOUND; @@ -1823,7 +1909,7 @@ bool queue_work_node(int node, struct workqueue_struct *wq, local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - int cpu = workqueue_select_cpu_near(node); + int cpu = select_numa_node_cpu(node); __queue_work(cpu, wq, work); ret = true; @@ -1978,60 +2064,6 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) } EXPORT_SYMBOL(queue_rcu_work); -/** - * worker_enter_idle - enter idle state - * @worker: worker which is entering idle state - * - * @worker is entering idle state. Update stats and idle timer if - * necessary. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). - */ -static void worker_enter_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || - WARN_ON_ONCE(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev))) - return; - - /* can't use worker_set_flags(), also called from create_worker() */ - worker->flags |= WORKER_IDLE; - pool->nr_idle++; - worker->last_active = jiffies; - - /* idle_list is LIFO */ - list_add(&worker->entry, &pool->idle_list); - - if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) - mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); - - /* Sanity check nr_running. */ - WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); -} - -/** - * worker_leave_idle - leave idle state - * @worker: worker which is leaving idle state - * - * @worker is leaving idle state. Update stats. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). 
- */ -static void worker_leave_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) - return; - worker_clr_flags(worker, WORKER_IDLE); - pool->nr_idle--; - list_del_init(&worker->entry); -} - static struct worker *alloc_worker(int node) { struct worker *worker; @@ -2047,6 +2079,14 @@ static struct worker *alloc_worker(int node) return worker; } +static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) +{ + if (pool->cpu < 0 && pool->attrs->affn_strict) + return pool->attrs->__pod_cpumask; + else + return pool->attrs->cpumask; +} + /** * worker_attach_to_pool() - attach a worker to a pool * @worker: worker to be attached @@ -2072,7 +2112,7 @@ static void worker_attach_to_pool(struct worker *worker, kthread_set_per_cpu(worker->task, pool->cpu); if (worker->rescue_wq) - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool)); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; @@ -2164,16 +2204,25 @@ static struct worker *create_worker(struct worker_pool *pool) } set_user_nice(worker->task, pool->attrs->nice); - kthread_bind_mask(worker->task, pool->attrs->cpumask); + kthread_bind_mask(worker->task, pool_allowed_cpus(pool)); /* successful, attach the worker to the pool */ worker_attach_to_pool(worker, pool); /* start the newly created worker */ raw_spin_lock_irq(&pool->lock); + worker->pool->nr_workers++; worker_enter_idle(worker); + kick_pool(pool); + + /* + * @worker is waiting on a completion in kthread() and will trigger hung + * check if not woken up soon. As kick_pool() might not have waken it + * up, wake it up explicitly once more. 
+ */ wake_up_process(worker->task); + raw_spin_unlock_irq(&pool->lock); return worker; @@ -2301,9 +2350,8 @@ static void idle_worker_timeout(struct timer_list *t) static void idle_cull_fn(struct work_struct *work) { struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); - struct list_head cull_list; + LIST_HEAD(cull_list); - INIT_LIST_HEAD(&cull_list); /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker * cannot proceed beyong worker_detach_from_pool() in its self-destruct @@ -2492,7 +2540,6 @@ __acquires(&pool->lock) struct pool_workqueue *pwq = get_work_pwq(work); struct worker_pool *pool = worker->pool; unsigned long work_data; - struct worker *collision; #ifdef CONFIG_LOCKDEP /* * It is permissible to free the struct work_struct from @@ -2509,18 +2556,6 @@ __acquires(&pool->lock) WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && raw_smp_processor_id() != pool->cpu); - /* - * A single work shouldn't be executed concurrently by - * multiple workers on a single cpu. Check whether anyone is - * already processing the work. If so, defer the work to the - * currently executing one. - */ - collision = find_worker_executing_work(pool, work); - if (unlikely(collision)) { - move_linked_works(work, &collision->scheduled, NULL); - return; - } - /* claim and dequeue */ debug_work_deactivate(work); hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work); @@ -2549,14 +2584,12 @@ __acquires(&pool->lock) worker_set_flags(worker, WORKER_CPU_INTENSIVE); /* - * Wake up another worker if necessary. The condition is always - * false for normal per-cpu workers since nr_running would always - * be >= 1 at this point. This is used to chain execution of the - * pending work items for WORKER_NOT_RUNNING workers such as the - * UNBOUND and CPU_INTENSIVE ones. + * Kick @pool if necessary. It's always noop for per-cpu worker pools + * since nr_running would always be >= 1 at this point. 
This is used to + * chain execution of the pending work items for WORKER_NOT_RUNNING + * workers such as the UNBOUND and CPU_INTENSIVE ones. */ - if (need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); /* * Record the last pool and clear PENDING which should be the last @@ -2566,6 +2599,7 @@ __acquires(&pool->lock) */ set_work_pool_and_clear_pending(work, pool->id); + pwq->stats[PWQ_STAT_STARTED]++; raw_spin_unlock_irq(&pool->lock); lock_map_acquire(&pwq->wq->lockdep_map); @@ -2592,7 +2626,6 @@ __acquires(&pool->lock) * workqueues), so hiding them isn't a problem. */ lockdep_invariant_state(true); - pwq->stats[PWQ_STAT_STARTED]++; trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2658,9 +2691,15 @@ __acquires(&pool->lock) */ static void process_scheduled_works(struct worker *worker) { - while (!list_empty(&worker->scheduled)) { - struct work_struct *work = list_first_entry(&worker->scheduled, - struct work_struct, entry); + struct work_struct *work; + bool first = true; + + while ((work = list_first_entry_or_null(&worker->scheduled, + struct work_struct, entry))) { + if (first) { + worker->pool->watchdog_ts = jiffies; + first = false; + } process_one_work(worker, work); } } @@ -2741,17 +2780,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); - pool->watchdog_ts = jiffies; - - if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { - /* optimization path, not strictly necessary */ - process_one_work(worker, work); - if (unlikely(!list_empty(&worker->scheduled))) - process_scheduled_works(worker); - } else { - move_linked_works(work, &worker->scheduled, NULL); + if (assign_work(work, worker, NULL)) process_scheduled_works(worker); - } } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); @@ -2795,7 +2825,6 @@ static int rescuer_thread(void *__rescuer) { struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; - struct list_head *scheduled = 
&rescuer->scheduled; bool should_stop; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2826,7 +2855,6 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; - bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2841,18 +2869,14 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - WARN_ON_ONCE(!list_empty(scheduled)); + WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { - if (get_work_pwq(work) == pwq) { - if (first) - pool->watchdog_ts = jiffies; - move_linked_works(work, scheduled, &n); + if (get_work_pwq(work) == pwq && + assign_work(work, rescuer, &n)) pwq->stats[PWQ_STAT_RESCUED]++; - } - first = false; } - if (!list_empty(scheduled)) { + if (!list_empty(&rescuer->scheduled)) { process_scheduled_works(rescuer); /* @@ -2885,12 +2909,10 @@ repeat: put_pwq(pwq); /* - * Leave this pool. If need_more_worker() is %true, notify a - * regular worker; otherwise, we end up with 0 concurrency - * and stalling the execution. + * Leave this pool. Notify regular workers; otherwise, we end up + * with 0 concurrency and stalling the execution. 
*/ - if (need_more_worker(pool)) - wake_up_worker(pool); + kick_pool(pool); raw_spin_unlock_irq(&pool->lock); @@ -3025,7 +3047,6 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, pwq->nr_in_flight[work_color]++; work_flags |= work_color_to_flags(work_color); - debug_work_activate(&barr->work); insert_work(pwq, &barr->work, head, work_flags); } @@ -3688,6 +3709,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); + free_cpumask_var(attrs->__pod_cpumask); kfree(attrs); } } @@ -3709,8 +3731,11 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) goto fail; if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; + if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) + goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); + attrs->affn_scope = WQ_AFFN_DFL; return attrs; fail: free_workqueue_attrs(attrs); @@ -3722,12 +3747,26 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, { to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); + cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); + to->affn_strict = from->affn_strict; + /* - * Unlike hash and equality test, this function doesn't ignore - * ->no_numa as it is used for both pool and wq attrs. Instead, - * get_unbound_pool() explicitly clears ->no_numa after copying. + * Unlike hash and equality test, copying shouldn't ignore wq-only + * fields as copying is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears the fields. */ - to->no_numa = from->no_numa; + to->affn_scope = from->affn_scope; + to->ordered = from->ordered; +} + +/* + * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the + * comments in 'struct workqueue_attrs' definition. 
+ */ +static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs) +{ + attrs->affn_scope = WQ_AFFN_NR_TYPES; + attrs->ordered = false; } /* hash value of the content of @attr */ @@ -3738,6 +3777,9 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) hash = jhash_1word(attrs->nice, hash); hash = jhash(cpumask_bits(attrs->cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash(cpumask_bits(attrs->__pod_cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); return hash; } @@ -3749,9 +3791,57 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, return false; if (!cpumask_equal(a->cpumask, b->cpumask)) return false; + if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) + return false; + if (a->affn_strict != b->affn_strict) + return false; return true; } +/* Update @attrs with actually available CPUs */ +static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs, + const cpumask_t *unbound_cpumask) +{ + /* + * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If + * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to + * @unbound_cpumask. 
+ */ + cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask); + if (unlikely(cpumask_empty(attrs->cpumask))) + cpumask_copy(attrs->cpumask, unbound_cpumask); +} + +/* find wq_pod_type to use for @attrs */ +static const struct wq_pod_type * +wqattrs_pod_type(const struct workqueue_attrs *attrs) +{ + enum wq_affn_scope scope; + struct wq_pod_type *pt; + + /* to synchronize access to wq_affn_dfl */ + lockdep_assert_held(&wq_pool_mutex); + + if (attrs->affn_scope == WQ_AFFN_DFL) + scope = wq_affn_dfl; + else + scope = attrs->affn_scope; + + pt = &wq_pod_types[scope]; + + if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) && + likely(pt->nr_pods)) + return pt; + + /* + * Before workqueue_init_topology(), only SYSTEM is available which is + * initialized in workqueue_init_early(). + */ + pt = &wq_pod_types[WQ_AFFN_SYSTEM]; + BUG_ON(!pt->nr_pods); + return pt; +} + /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize @@ -3790,6 +3880,9 @@ static int init_worker_pool(struct worker_pool *pool) pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; + + wqattrs_clear_for_pool(pool->attrs); + return 0; } @@ -3837,12 +3930,8 @@ static void rcu_free_wq(struct rcu_head *rcu) container_of(rcu, struct workqueue_struct, rcu); wq_free_lockdep(wq); - - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); - else - free_workqueue_attrs(wq->unbound_attrs); - + free_percpu(wq->cpu_pwq); + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } @@ -3869,10 +3958,8 @@ static void rcu_free_pool(struct rcu_head *rcu) static void put_unbound_pool(struct worker_pool *pool) { DECLARE_COMPLETION_ONSTACK(detach_completion); - struct list_head cull_list; struct worker *worker; - - INIT_LIST_HEAD(&cull_list); + LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); @@ -3956,10 +4043,10 @@ static void put_unbound_pool(struct worker_pool *pool) */ static struct worker_pool *get_unbound_pool(const struct 
workqueue_attrs *attrs) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA]; u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - int node; - int target_node = NUMA_NO_NODE; + int pod, node = NUMA_NO_NODE; lockdep_assert_held(&wq_pool_mutex); @@ -3971,31 +4058,22 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) } } - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(attrs->cpumask, - wq_numa_possible_cpumask[node])) { - target_node = node; - break; - } + /* If __pod_cpumask is contained inside a NUMA pod, that's our node */ + for (pod = 0; pod < pt->nr_pods; pod++) { + if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) { + node = pt->pod_node[pod]; + break; } } /* nope, create a new one */ - pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node); if (!pool || init_worker_pool(pool) < 0) goto fail; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ + pool->node = node; copy_workqueue_attrs(pool->attrs, attrs); - pool->node = target_node; - - /* - * no_numa isn't a worker_pool attribute, always clear it. See - * 'struct workqueue_attrs' comments for detail. - */ - pool->attrs->no_numa = false; + wqattrs_clear_for_pool(pool->attrs); if (worker_pool_assign_id(pool) < 0) goto fail; @@ -4021,34 +4099,33 @@ static void rcu_free_pwq(struct rcu_head *rcu) } /* - * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt - * and needs to be destroyed. + * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero + * refcnt and needs to be destroyed. 
*/ -static void pwq_unbound_release_workfn(struct work_struct *work) +static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, - unbound_release_work); + release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last = false; /* - * when @pwq is not linked, it doesn't hold any reference to the + * When @pwq is not linked, it doesn't hold any reference to the * @wq, and @wq is invalid to access. */ if (!list_empty(&pwq->pwqs_node)) { - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; - mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); } - mutex_lock(&wq_pool_mutex); - put_unbound_pool(pool); - mutex_unlock(&wq_pool_mutex); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + } call_rcu(&pwq->rcu, rcu_free_pwq); @@ -4092,24 +4169,13 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) * is updated and visible. */ if (!freezable || !workqueue_freezing) { - bool kick = false; - pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->inactive_works) && - pwq->nr_active < pwq->max_active) { + pwq->nr_active < pwq->max_active) pwq_activate_first_inactive(pwq); - kick = true; - } - /* - * Need to kick a worker after thawed or an unbound wq's - * max_active is bumped. In realtime scenarios, always kicking a - * worker will cause interference on the isolated cpu cores, so - * let's kick iff work items were activated. 
- */ - if (kick) - wake_up_worker(pwq->pool); + kick_pool(pwq->pool); } else { pwq->max_active = 0; } @@ -4132,7 +4198,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); - INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); + kthread_init_work(&pwq->release_work, pwq_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ @@ -4180,61 +4246,49 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, } /** - * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node + * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue - * @node: the target NUMA node + * @cpu: the target CPU * @cpu_going_down: if >= 0, the CPU to consider as offline - * @cpumask: outarg, the resulting cpumask - * - * Calculate the cpumask a workqueue with @attrs should use on @node. If - * @cpu_going_down is >= 0, that cpu is considered offline during - * calculation. The result is stored in @cpumask. * - * If NUMA affinity is not enabled, @attrs->cpumask is always used. If - * enabled and @node has online CPUs requested by @attrs, the returned - * cpumask is the intersection of the possible CPUs of @node and - * @attrs->cpumask. + * Calculate the cpumask a workqueue with @attrs should use on @pod. If + * @cpu_going_down is >= 0, that cpu is considered offline during calculation. + * The result is stored in @attrs->__pod_cpumask. * - * The caller is responsible for ensuring that the cpumask of @node stays - * stable. + * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled + * and @pod has online CPUs requested by @attrs, the returned cpumask is the + * intersection of the possible CPUs of @pod and @attrs->cpumask. 
* - * Return: %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. + * The caller is responsible for ensuring that the cpumask of @pod stays stable. */ -static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, - int cpu_going_down, cpumask_t *cpumask) +static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu, + int cpu_going_down) { - if (!wq_numa_enabled || attrs->no_numa) - goto use_dfl; + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int pod = pt->cpu_pod[cpu]; - /* does @node have any online CPUs @attrs wants? */ - cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + /* does @pod have any online CPUs @attrs wants? */ + cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask); + cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask); if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, cpumask); + cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask); - if (cpumask_empty(cpumask)) - goto use_dfl; + if (cpumask_empty(attrs->__pod_cpumask)) { + cpumask_copy(attrs->__pod_cpumask, attrs->cpumask); + return; + } - /* yeap, return possible CPUs in @node that @attrs wants */ - cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + /* yeap, return possible CPUs in @pod that @attrs wants */ + cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]); - if (cpumask_empty(cpumask)) { + if (cpumask_empty(attrs->__pod_cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); - return false; - } - - return !cpumask_equal(cpumask, attrs->cpumask); - -use_dfl: - cpumask_copy(cpumask, attrs->cpumask); - return false; } -/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ -static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, - int node, - struct pool_workqueue *pwq) +/* install @pwq into @wq's cpu_pwq and return the old pwq */ 
+static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, + int cpu, struct pool_workqueue *pwq) { struct pool_workqueue *old_pwq; @@ -4244,8 +4298,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, /* link_pwq() can handle duplicate calls */ link_pwq(pwq); - old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); return old_pwq; } @@ -4262,10 +4316,10 @@ struct apply_wqattrs_ctx { static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { - int node; + int cpu; - for_each_node(node) - put_pwq_unlocked(ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); @@ -4281,76 +4335,64 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, const cpumask_var_t unbound_cpumask) { struct apply_wqattrs_ctx *ctx; - struct workqueue_attrs *new_attrs, *tmp_attrs; - int node; + struct workqueue_attrs *new_attrs; + int cpu; lockdep_assert_held(&wq_pool_mutex); - ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); + if (WARN_ON(attrs->affn_scope < 0 || + attrs->affn_scope >= WQ_AFFN_NR_TYPES)) + return ERR_PTR(-EINVAL); + + ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); - tmp_attrs = alloc_workqueue_attrs(); - if (!ctx || !new_attrs || !tmp_attrs) + if (!ctx || !new_attrs) goto out_free; /* - * Calculate the attrs of the default pwq with unbound_cpumask - * which is wq_unbound_cpumask or to set to wq_unbound_cpumask. - * If the user configured cpumask doesn't overlap with the - * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask. 
- */ - copy_workqueue_attrs(new_attrs, attrs); - cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask); - if (unlikely(cpumask_empty(new_attrs->cpumask))) - cpumask_copy(new_attrs->cpumask, unbound_cpumask); - - /* - * We may create multiple pwqs with differing cpumasks. Make a - * copy of @new_attrs which will be modified and used to obtain - * pools. - */ - copy_workqueue_attrs(tmp_attrs, new_attrs); - - /* * If something goes wrong during CPU up/down, we'll fall back to * the default pwq covering whole @attrs->cpumask. Always create * it even if we don't use it immediately. */ + copy_workqueue_attrs(new_attrs, attrs); + wqattrs_actualize_cpumask(new_attrs, unbound_cpumask); + cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs); if (!ctx->dfl_pwq) goto out_free; - for_each_node(node) { - if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { - ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); - if (!ctx->pwq_tbl[node]) - goto out_free; - } else { + for_each_possible_cpu(cpu) { + if (new_attrs->ordered) { ctx->dfl_pwq->refcnt++; - ctx->pwq_tbl[node] = ctx->dfl_pwq; + ctx->pwq_tbl[cpu] = ctx->dfl_pwq; + } else { + wq_calc_pod_cpumask(new_attrs, cpu, -1); + ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs); + if (!ctx->pwq_tbl[cpu]) + goto out_free; } } /* save the user configured attrs and sanitize it. 
*/ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask); ctx->attrs = new_attrs; ctx->wq = wq; - free_workqueue_attrs(tmp_attrs); return ctx; out_free: - free_workqueue_attrs(tmp_attrs); free_workqueue_attrs(new_attrs); apply_wqattrs_cleanup(ctx); - return NULL; + return ERR_PTR(-ENOMEM); } /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { - int node; + int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); @@ -4358,9 +4400,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ - for_each_node(node) - ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, - ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, + ctx->pwq_tbl[cpu]); /* @dfl_pwq might not have been used, ensure it's linked */ link_pwq(ctx->dfl_pwq); @@ -4400,8 +4442,8 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, } ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask); - if (!ctx) - return -ENOMEM; + if (IS_ERR(ctx)) + return PTR_ERR(ctx); /* the ctx has been prepared successfully, let's commit it */ apply_wqattrs_commit(ctx); @@ -4415,12 +4457,11 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * - * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA - * machines, this function maps a separate pwq to each NUMA node with - * possibles CPUs in @attrs->cpumask so that work items are affine to the - * NUMA node it was issued on. Older pwqs are released as in-flight work - * items finish. 
Note that a work item which repeatedly requeues itself - * back-to-back will stay on its current pwq. + * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps + * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that + * work items are affine to the pod it was issued on. Older pwqs are released as + * in-flight work items finish. Note that a work item which repeatedly requeues + * itself back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. * @@ -4443,40 +4484,37 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } /** - * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug * @wq: the target workqueue - * @cpu: the CPU coming up or going down + * @cpu: the CPU to update pool association for + * @hotplug_cpu: the CPU coming up or going down * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and - * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of + * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of * @wq accordingly. * - * If NUMA affinity can't be adjusted due to memory allocation failure, it - * falls back to @wq->dfl_pwq which may not be optimal but is always - * correct. - * - * Note that when the last allowed CPU of a NUMA node goes offline for a - * workqueue with a cpumask spanning multiple nodes, the workers which were - * already executing the work items for the workqueue will lose their CPU - * affinity and may execute on any CPU. This is similar to how per-cpu - * workqueues behave on CPU_DOWN. If a workqueue user wants strict - * affinity, it's the user's responsibility to flush the work item from - * CPU_DOWN_PREPARE. 
+ * + * If pod affinity can't be adjusted due to memory allocation failure, it falls + * back to @wq->dfl_pwq which may not be optimal but is always correct. + * + * Note that when the last allowed CPU of a pod goes offline for a workqueue + * with a cpumask spanning multiple pods, the workers which were already + * executing the work items for the workqueue will lose their CPU affinity and + * may execute on any CPU. This is similar to how per-cpu workqueues behave on + * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's + * responsibility to flush the work item from CPU_DOWN_PREPARE. */ -static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, - bool online) +static void wq_update_pod(struct workqueue_struct *wq, int cpu, + int hotplug_cpu, bool online) { - int node = cpu_to_node(cpu); - int cpu_off = online ? -1 : cpu; + int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; - cpumask_t *cpumask; lockdep_assert_held(&wq_pool_mutex); - if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) || - wq->unbound_attrs->no_numa) + if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered) return; /* @@ -4484,36 +4522,29 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * Let's use a preallocated one. The following buf is protected by * CPU hotplug exclusion. */ - target_attrs = wq_update_unbound_numa_attrs_buf; - cpumask = target_attrs->cpumask; + target_attrs = wq_update_pod_attrs_buf; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); - pwq = unbound_pwq_by_node(wq, node); + wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask); - /* - * Let's determine what needs to be done. If the target cpumask is - * different from the default pwq's, we need to compare it to @pwq's - * and create a new one if they don't match. If the target cpumask - * equals the default pwq's, the default pwq should be used. 
- */ - if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { - if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) - return; - } else { - goto use_dfl_pwq; - } + /* nothing to do if the target cpumask matches the current pwq */ + wq_calc_pod_cpumask(target_attrs, cpu, off_cpu); + pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), + lockdep_is_held(&wq_pool_mutex)); + if (wqattrs_equal(target_attrs, pwq->pool->attrs)) + return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); if (!pwq) { - pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n", wq->name); goto use_dfl_pwq; } /* Install the new pwq. */ mutex_lock(&wq->mutex); - old_pwq = numa_pwq_tbl_install(wq, node, pwq); + old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: @@ -4521,7 +4552,7 @@ use_dfl_pwq: raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); - old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); + old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); @@ -4532,21 +4563,26 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; - if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); - if (!wq->cpu_pwqs) - return -ENOMEM; + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); + if (!wq->cpu_pwq) + goto enomem; + if (!(wq->flags & WQ_UNBOUND)) { for_each_possible_cpu(cpu) { - struct pool_workqueue *pwq = - per_cpu_ptr(wq->cpu_pwqs, cpu); - struct worker_pool *cpu_pools = - per_cpu(cpu_worker_pools, cpu); + struct pool_workqueue **pwq_p = + per_cpu_ptr(wq->cpu_pwq, cpu); + struct worker_pool *pool = + &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); + + *pwq_p = kmem_cache_alloc_node(pwq_cache, 
GFP_KERNEL, + pool->node); + if (!*pwq_p) + goto enomem; - init_pwq(pwq, wq, &cpu_pools[highpri]); + init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); - link_pwq(pwq); + link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; @@ -4565,18 +4601,25 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) cpus_read_unlock(); return ret; + +enomem: + if (wq->cpu_pwq) { + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(wq->cpu_pwq, cpu)); + free_percpu(wq->cpu_pwq); + wq->cpu_pwq = NULL; + } + return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { - int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - - if (max_active < 1 || max_active > lim) + if (max_active < 1 || max_active > WQ_MAX_ACTIVE) pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); + max_active, name, 1, WQ_MAX_ACTIVE); - return clamp_val(max_active, 1, lim); + return clamp_val(max_active, 1, WQ_MAX_ACTIVE); } /* @@ -4599,7 +4642,7 @@ static int init_rescuer(struct workqueue_struct *wq) } rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); + rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name); if (IS_ERR(rescuer->task)) { ret = PTR_ERR(rescuer->task); pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe", @@ -4620,17 +4663,15 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...) { - size_t tbl_size = 0; va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; /* - * Unbound && max_active == 1 used to imply ordered, which is no - * longer the case on NUMA machines due to per-node pools. While + * Unbound && max_active == 1 used to imply ordered, which is no longer + * the case on many machines due to per-pod pools. 
While * alloc_ordered_workqueue() is the right way to create an ordered - * workqueue, keep the previous behavior to avoid subtle breakages - * on NUMA. + * workqueue, keep the previous behavior to avoid subtle breakages. */ if ((flags & WQ_UNBOUND) && max_active == 1) flags |= __WQ_ORDERED; @@ -4640,10 +4681,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, flags |= WQ_UNBOUND; /* allocate wq and format name */ - if (flags & WQ_UNBOUND) - tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); - - wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; @@ -4738,7 +4776,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; - int node; + int cpu; /* * Remove it from sysfs first so that sanity check failure doesn't @@ -4797,33 +4835,23 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - if (!(wq->flags & WQ_UNBOUND)) { - wq_unregister_lockdep(wq); - /* - * The base ref is never dropped on per-cpu pwqs. Directly - * schedule RCU free. - */ - call_rcu(&wq->rcu, rcu_free_wq); - } else { - /* - * We're the sole accessor of @wq at this point. Directly - * access numa_pwq_tbl[] and dfl_pwq to put the base refs. - * @wq will be freed when the last pwq is released. - */ - for_each_node(node) { - pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); - put_pwq_unlocked(pwq); - } + /* + * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq + * to put the base refs. @wq will be auto-destroyed from the last + * pwq_put. RCU read lock prevents @wq from going away from under us. + */ + rcu_read_lock(); - /* - * Put dfl_pwq. @wq may be freed any time after dfl_pwq is - * put. Don't access it afterwards. 
- */ - pwq = wq->dfl_pwq; - wq->dfl_pwq = NULL; + for_each_possible_cpu(cpu) { + pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + RCU_INIT_POINTER(*per_cpu_ptr(wq->cpu_pwq, cpu), NULL); put_pwq_unlocked(pwq); } + + put_pwq_unlocked(wq->dfl_pwq); + wq->dfl_pwq = NULL; + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(destroy_workqueue); @@ -4900,10 +4928,11 @@ bool current_is_workqueue_rescuer(void) * unreliable and only useful as advisory hints or for debugging. * * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU. - * Note that both per-cpu and unbound workqueues may be associated with - * multiple pool_workqueues which have separate congested states. A - * workqueue being congested on one CPU doesn't mean the workqueue is also - * contested on other CPUs / NUMA nodes. + * + * With the exception of ordered workqueues, all workqueues have per-cpu + * pool_workqueues, each with its own congested state. A workqueue being + * congested on one CPU doesn't mean that the workqueue is contested on any + * other CPUs. * * Return: * %true if congested, %false otherwise. @@ -4919,12 +4948,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); - if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - else - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - + pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); ret = !list_empty(&pwq->inactive_works); + preempt_enable(); rcu_read_unlock(); @@ -5399,7 +5425,7 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. 
*/ - wake_up_worker(pool); + kick_pool(pool); raw_spin_unlock_irq(&pool->lock); @@ -5432,7 +5458,7 @@ static void rebind_workers(struct worker_pool *pool) for_each_pool_worker(worker, pool) { kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + pool_allowed_cpus(pool)) < 0); } raw_spin_lock_irq(&pool->lock); @@ -5526,9 +5552,18 @@ int workqueue_online_cpu(unsigned int cpu) mutex_unlock(&wq_pool_attach_mutex); } - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + /* update pod affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) { + struct workqueue_attrs *attrs = wq->unbound_attrs; + + if (attrs) { + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int tcpu; + + for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) + wq_update_pod(wq, tcpu, cpu, true); + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -5544,10 +5579,19 @@ int workqueue_offline_cpu(unsigned int cpu) unbind_workers(cpu); - /* update NUMA affinity of unbound workqueues */ + /* update pod affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); + list_for_each_entry(wq, &workqueues, list) { + struct workqueue_attrs *attrs = wq->unbound_attrs; + + if (attrs) { + const struct wq_pod_type *pt = wqattrs_pod_type(attrs); + int tcpu; + + for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]]) + wq_update_pod(wq, tcpu, cpu, false); + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -5743,8 +5787,8 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) continue; ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); - if (!ctx) { - ret = -ENOMEM; + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); break; } @@ -5802,21 +5846,72 @@ out_unlock: return ret; } +static int parse_affn_scope(const char *val) +{ 
+ int i; + + for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { + if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) + return i; + } + return -EINVAL; +} + +static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) +{ + struct workqueue_struct *wq; + int affn, cpu; + + affn = parse_affn_scope(val); + if (affn < 0) + return affn; + if (affn == WQ_AFFN_DFL) + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); + + wq_affn_dfl = affn; + + list_for_each_entry(wq, &workqueues, list) { + for_each_online_cpu(cpu) { + wq_update_pod(wq, cpu, cpu, true); + } + } + + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); + + return 0; +} + +static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) +{ + return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); +} + +static const struct kernel_param_ops wq_affn_dfl_ops = { + .set = wq_affn_dfl_set, + .get = wq_affn_dfl_get, +}; + +module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); + #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. 
* - * pool_ids RO int : the associated pool IDs for each node - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers - * numa RW bool : whether enable NUMA affinity + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) + * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; @@ -5869,28 +5964,6 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - cpus_read_lock(); - rcu_read_lock(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock(); - cpus_read_unlock(); - - return written; -} - static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -5981,50 +6054,84 @@ out_unlock: return ret ?: count; } -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t wq_affn_scope_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); int written; mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); + if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) + written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n", + wq_affn_names[WQ_AFFN_DFL], + wq_affn_names[wq_affn_dfl]); + else + written = scnprintf(buf, PAGE_SIZE, "%s\n", + wq_affn_names[wq->unbound_attrs->affn_scope]); mutex_unlock(&wq->mutex); return written; } -static ssize_t 
wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t wq_affn_scope_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); struct workqueue_attrs *attrs; - int v, ret = -ENOMEM; + int affn, ret = -ENOMEM; - apply_wqattrs_lock(); + affn = parse_affn_scope(buf); + if (affn < 0) + return affn; + apply_wqattrs_lock(); attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - goto out_unlock; - - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; + if (attrs) { + attrs->affn_scope = affn; ret = apply_workqueue_attrs_locked(wq, attrs); } + apply_wqattrs_unlock(); + free_workqueue_attrs(attrs); + return ret ?: count; +} -out_unlock: +static ssize_t wq_affinity_strict_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + wq->unbound_attrs->affn_strict); +} + +static ssize_t wq_affinity_strict_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret = -ENOMEM; + + if (sscanf(buf, "%d", &v) != 1) + return -EINVAL; + + apply_wqattrs_lock(); + attrs = wq_sysfs_prep_attrs(wq); + if (attrs) { + attrs->affn_strict = (bool)v; + ret = apply_workqueue_attrs_locked(wq, attrs); + } apply_wqattrs_unlock(); free_workqueue_attrs(attrs); return ret ?: count; } static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), + __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; 
@@ -6390,62 +6497,19 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ -static void __init wq_numa_init(void) -{ - cpumask_var_t *tbl; - int node, cpu; - - if (num_possible_nodes() <= 1) - return; - - if (wq_disable_numa) { - pr_info("workqueue: NUMA affinity support disabled\n"); - return; - } - - for_each_possible_cpu(cpu) { - if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { - pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); - return; - } - } - - wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_unbound_numa_attrs_buf); - - /* - * We want masks of possible CPUs of each node which isn't readily - * available. Build one from cpu_to_node() which should have been - * fully initialized by now. - */ - tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); - BUG_ON(!tbl); - - for_each_node(node) - BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, - node_online(node) ? node : NUMA_NO_NODE)); - - for_each_possible_cpu(cpu) { - node = cpu_to_node(cpu); - cpumask_set_cpu(cpu, tbl[node]); - } - - wq_numa_possible_cpumask = tbl; - wq_numa_enabled = true; -} - /** * workqueue_init_early - early init for workqueue subsystem * - * This is the first half of two-staged workqueue subsystem initialization - * and invoked as soon as the bare basics - memory allocation, cpumasks and - * idr are up. It sets up all the data structures and system workqueues - * and allows early boot code to create workqueues and queue/cancel work - * items. Actual work item execution starts only after kthreads can be - * created and scheduled right before early initcalls. + * This is the first step of three-staged workqueue subsystem initialization and + * invoked as soon as the bare basics - memory allocation, cpumasks and idr are + * up. It sets up all the data structures and system workqueues and allows early + * boot code to create workqueues and queue/cancel work items. 
Actual work item + * execution starts only after kthreads can be created and scheduled right + * before early initcalls. */ void __init workqueue_init_early(void) { + struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; @@ -6455,8 +6519,30 @@ void __init workqueue_init_early(void) cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (!cpumask_empty(&wq_cmdline_cpumask)) + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); + + /* initialize WQ_AFFN_SYSTEM pods */ + pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL); + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod); + + BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); + + wq_update_pod_attrs_buf = alloc_workqueue_attrs(); + BUG_ON(!wq_update_pod_attrs_buf); + + pt->nr_pods = 1; + cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); + pt->pod_node[0] = NUMA_NO_NODE; + pt->cpu_pod[0] = 0; + /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -6466,7 +6552,9 @@ void __init workqueue_init_early(void) BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; + pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ @@ -6487,11 +6575,10 @@ void __init workqueue_init_early(void) /* * An ordered wq should have only one pwq as ordering is * guaranteed by max_active which is enforced by pwqs. 
- Turn off NUMA so that dfl_pwq is used for all nodes. */ BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; - attrs->no_numa = true; + attrs->ordered = true; ordered_wq_attrs[i] = attrs; } @@ -6499,7 +6586,7 @@ void __init workqueue_init_early(void) system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, - WQ_UNBOUND_MAX_ACTIVE); + WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", @@ -6513,14 +6600,53 @@ void __init workqueue_init_early(void) !system_freezable_power_efficient_wq); } +static void __init wq_cpu_intensive_thresh_init(void) +{ + unsigned long thresh; + unsigned long bogo; + + /* if the user set it to a specific value, keep it */ + if (wq_cpu_intensive_thresh_us != ULONG_MAX) + return; + + pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + BUG_ON(IS_ERR(pwq_release_worker)); + + /* + * The default of 10ms is derived from the fact that most modern (as of + * 2023) processors can do a lot in 10ms and that it's just below what + * most consider human-perceivable. However, the kernel also runs on a + * lot slower CPUs including microcontrollers where the threshold is way + * too low. + * + * Let's scale up the threshold up to 1 second if BogoMips is below 4000. + * This is by no means accurate but it doesn't have to be. The mechanism + * is still useful even when the threshold is fully scaled up. Also, as + * the reports would usually be applicable to everyone, some machines + * operating on longer thresholds won't significantly diminish their + * usefulness. 
+ */ + thresh = 10 * USEC_PER_MSEC; + + /* see init/calibrate.c for lpj -> BogoMIPS calculation */ + bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); + if (bogo < 4000) + thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); + + pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", + loops_per_jiffy, bogo, thresh); + + wq_cpu_intensive_thresh_us = thresh; +} + /** * workqueue_init - bring workqueue subsystem fully online * - * This is the latter half of two-staged workqueue subsystem initialization - * and invoked as soon as kthreads can be created and scheduled. - * Workqueues have been created and work items queued on them, but there - * are no kworkers executing the work items yet. Populate the worker pools - * with the initial workers and enable future kworker creations. + * This is the second step of three-staged workqueue subsystem initialization + * and invoked as soon as kthreads can be created and scheduled. Workqueues have + * been created and work items queued on them, but there are no kworkers + * executing the work items yet. Populate the worker pools with the initial + * workers and enable future kworker creations. */ void __init workqueue_init(void) { @@ -6528,19 +6654,14 @@ void __init workqueue_init(void) struct worker_pool *pool; int cpu, bkt; - /* - * It'd be simpler to initialize NUMA in workqueue_init_early() but - * CPU to node mapping may not be available that early on some - * archs such as power and arm64. As per-cpu pools created - * previously could be missing node hint and unbound pools NUMA - * affinity, fix them up. - * - * Also, while iterating workqueues, create rescuers if requested. - */ - wq_numa_init(); + wq_cpu_intensive_thresh_init(); mutex_lock(&wq_pool_mutex); + /* + * Per-cpu pools created earlier could be missing node hint. Fix them + * up. Also, create a rescuer for workqueues that requested it. 
+ */ for_each_possible_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { pool->node = cpu_to_node(cpu); @@ -6548,7 +6669,6 @@ void __init workqueue_init(void) } list_for_each_entry(wq, &workqueues, list) { - wq_update_unbound_numa(wq, smp_processor_id(), true); WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); @@ -6572,9 +6692,114 @@ void __init workqueue_init(void) } /* - * Despite the naming, this is a no-op function which is here only for avoiding - * link error. Since compile-time warning may fail to catch, we will need to - * emit run-time warning from __flush_workqueue(). + * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to + * @cpus_share_pod(). Each subset of CPUs that share a pod is assigned a unique + * and consecutive pod ID. The rest of @pt is initialized accordingly. + */ +static void __init init_pod_type(struct wq_pod_type *pt, + bool (*cpus_share_pod)(int, int)) +{ + int cur, pre, cpu, pod; + + pt->nr_pods = 0; + + /* init @pt->cpu_pod[] according to @cpus_share_pod() */ + pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL); + BUG_ON(!pt->cpu_pod); + + for_each_possible_cpu(cur) { + for_each_possible_cpu(pre) { + if (pre >= cur) { + pt->cpu_pod[cur] = pt->nr_pods++; + break; + } + if (cpus_share_pod(cur, pre)) { + pt->cpu_pod[cur] = pt->cpu_pod[pre]; + break; + } + } + } + + /* init the rest to match @pt->cpu_pod[] */ + pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL); + pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL); + BUG_ON(!pt->pod_cpus || !pt->pod_node); + + for (pod = 0; pod < pt->nr_pods; pod++) + BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL)); + + for_each_possible_cpu(cpu) { + cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]); + pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu); + } +} + +static bool __init cpus_dont_share(int cpu0, int cpu1) +{ + return false; +} + +static bool __init 
cpus_share_smt(int cpu0, int cpu1) +{ +#ifdef CONFIG_SCHED_SMT + return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); +#else + return false; +#endif +} + +static bool __init cpus_share_numa(int cpu0, int cpu1) +{ + return cpu_to_node(cpu0) == cpu_to_node(cpu1); +} + +/** + * workqueue_init_topology - initialize CPU pods for unbound workqueues + * + * This is the third step of three-staged workqueue subsystem initialization and + * invoked after SMP and topology information are fully initialized. It + * initializes the unbound CPU pods accordingly. */ -void __warn_flushing_systemwide_wq(void) { } +void __init workqueue_init_topology(void) +{ + struct workqueue_struct *wq; + int cpu; + + init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); + init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); + init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); + init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); + + mutex_lock(&wq_pool_mutex); + + /* + * Workqueues allocated earlier would have all CPUs sharing the default + * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU + * combinations to apply per-pod sharing. 
+ */ + list_for_each_entry(wq, &workqueues, list) { + for_each_online_cpu(cpu) { + wq_update_pod(wq, cpu, cpu, true); + } + } + + mutex_unlock(&wq_pool_mutex); +} + +void __warn_flushing_systemwide_wq(void) +{ + pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); + dump_stack(); +} EXPORT_SYMBOL(__warn_flushing_systemwide_wq); + +static int __init workqueue_unbound_cpus_setup(char *str) +{ + if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { + cpumask_clear(&wq_cmdline_cpumask); + pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); + } + + return 1; +} +__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 6b1d66e28269..f6275944ada7 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -48,7 +48,7 @@ struct worker { /* A: runs through worker->node */ unsigned long last_active; /* K: last active timestamp */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ int id; /* I: worker id */ /* |