diff options
Diffstat (limited to 'kernel')
170 files changed, 11245 insertions, 2512 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index b74820d8b264..af601b9bda0e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,7 +12,7 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o -obj-$(CONFIG_BPFILTER) += usermode_driver.o +obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o @@ -38,9 +38,6 @@ KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector -# cond_syscall is currently not LTO compatible -CFLAGS_sys_ni.o = $(DISABLE_LTO) - obj-y += sched/ obj-y += locking/ obj-y += power/ @@ -134,6 +131,8 @@ KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n +obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o + $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz diff --git a/kernel/acct.c b/kernel/acct.c index b0c5b3a9f5af..f175df8f6aa4 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -25,7 +25,7 @@ * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. * - * Fixed a nasty interaction with with sys_umount(). If the accointing + * Fixed a nasty interaction with sys_umount(). If the accounting * was suspeneded we failed to stop it on umount(). Messy. * Another one: remount to readonly didn't stop accounting. * Question: what should we do if we have CAP_SYS_ADMIN but not @@ -263,12 +263,12 @@ static DEFINE_MUTEX(acct_on_mutex); * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting * - * Returns 0 for success or negative errno values for failure. - * * sys_acct() is the only system call needed to implement process * accounting. It takes the name of the file where accounting records * should be written. If the filename is NULL, accounting will be * shutdown. + * + * Returns: 0 for success or negative errno values for failure. */ SYSCALL_DEFINE1(acct, const char __user *, name) { @@ -586,9 +586,7 @@ static void slow_acct_process(struct pid_namespace *ns) } /** - * acct_process - * - * handles process accounting for an exiting task + * acct_process - handles process accounting for an exiting task */ void acct_process(void) { diff --git a/kernel/audit.c b/kernel/audit.c index 7efaece534a9..68cee3bc8cfe 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -123,9 +123,9 @@ static u32 audit_backlog_limit = 64; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ -kuid_t audit_sig_uid = INVALID_UID; -pid_t audit_sig_pid = -1; -u32 audit_sig_sid = 0; +static kuid_t audit_sig_uid = INVALID_UID; +static pid_t audit_sig_pid = -1; +static u32 audit_sig_sid; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -934,8 +934,7 @@ static void audit_free_reply(struct audit_reply *reply) if (!reply) return; - if (reply->skb) - kfree_skb(reply->skb); + kfree_skb(reply->skb); if (reply->net) put_net(reply->net); kfree(reply); diff --git a/kernel/audit.h b/kernel/audit.h index ddc22878433d..3b9c0945225a 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -327,10 +327,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t) extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); -extern pid_t audit_sig_pid; -extern kuid_t audit_sig_uid; -extern u32 audit_sig_sid; - extern int audit_filter(int msgtype, unsigned int listtype); extern void audit_ctl_lock(void); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e6eb9c0402da..bdc8cd1b6767 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -12,6 +13,7 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif @@ -29,3 +31,4 @@ ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif +obj-$(CONFIG_BPF_PRELOAD) += preload/ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8ff419b632a6..c6c81eceb68f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -10,11 +10,13 @@ #include <linux/filter.h> #include <linux/perf_event.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -60,7 +62,11 @@ int array_map_alloc_check(union bpf_attr *attr) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && - attr->map_flags & BPF_F_MMAPABLE) + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) + return -EINVAL; + + if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && + attr->map_flags & BPF_F_PRESERVE_ELEMS) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) @@ -208,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ -static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; @@ -217,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { @@ -487,6 +496,15 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +static bool array_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + if (!bpf_map_meta_equal(meta0, meta1)) + return false; + return meta0->map_flags & BPF_F_INNER_MAP ? true : + meta0->max_entries == meta1->max_entries; +} + struct bpf_iter_seq_array_map_info { struct bpf_map *map; void *percpu_value_buf; @@ -625,6 +643,7 @@ static const struct bpf_iter_seq_info iter_seq_info = { static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { + .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, @@ -647,6 +666,7 @@ const struct bpf_map_ops array_map_ops = { static int percpu_array_map_btf_id; const struct bpf_map_ops percpu_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, @@ -888,6 +908,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { + u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -908,12 +929,13 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * there could be danger of use after free otherwise. * 2) Initially when we start tracking aux, the program * is not JITed yet and also does not have a kallsyms - * entry. We skip these as poke->ip_stable is not - * active yet. The JIT will do the final fixup before - * setting it stable. The various poke->ip_stable are - * successively activated, so tail call updates can - * arrive from here while JIT is still finishing its - * final fixup for non-activated poke entries. + * entry. We skip these as poke->tailcall_target_stable + * is not active yet. The JIT will do the final fixup + * before setting it stable. The various + * poke->tailcall_target_stable are successively + * activated, so tail call updates can arrive from here + * while JIT is still finishing its final fixup for + * non-activated poke entries. * 3) On program teardown, the program's kallsym entry gets * removed out of RCU callback, but we can only untrack * from sleepable context, therefore bpf_arch_text_poke() @@ -930,7 +952,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * 5) Any other error happening below from bpf_arch_text_poke() * is a unexpected bug. */ - if (!READ_ONCE(poke->ip_stable)) + if (!READ_ONCE(poke->tailcall_target_stable)) continue; if (poke->reason != BPF_POKE_REASON_TAIL_CALL) continue; @@ -938,12 +960,39 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, - old ? (u8 *)old->bpf_func + - poke->adj_off : NULL, - new ? (u8 *)new->bpf_func + - poke->adj_off : NULL); - BUG_ON(ret < 0 && ret != -EINVAL); + old_bypass_addr = old ? NULL : poke->bypass_addr; + old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; + new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; + + if (new) { + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, new_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + if (!old) { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + poke->bypass_addr, + NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } + } else { + ret = bpf_arch_text_poke(poke->tailcall_bypass, + BPF_MOD_JUMP, + old_bypass_addr, + poke->bypass_addr); + BUG_ON(ret < 0 && ret != -EINVAL); + /* let other CPUs finish the execution of program + * so that it will not possible to expose them + * to invalid nop, stack unwind, nop state + */ + if (!ret) + synchronize_rcu(); + ret = bpf_arch_text_poke(poke->tailcall_target, + BPF_MOD_JUMP, + old_addr, NULL); + BUG_ON(ret < 0 && ret != -EINVAL); + } } } } @@ -1003,6 +1052,11 @@ static void prog_array_map_free(struct bpf_map *map) fd_array_map_free(map); } +/* prog_array->aux->{type,jited} is a runtime binding. + * Doing static check alone in the verifier is not enough. + * Thus, prog_array_map cannot be used as an inner_map + * and map_meta_equal is not implemented. + */ static int prog_array_map_btf_id; const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, @@ -1090,6 +1144,9 @@ static void perf_event_fd_array_release(struct bpf_map *map, struct bpf_event_entry *ee; int i; + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + return; + rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); @@ -1099,11 +1156,19 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } +static void perf_event_fd_array_map_free(struct bpf_map *map) +{ + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_free = perf_event_fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, @@ -1137,6 +1202,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) static int cgroup_array_map_btf_id; const struct bpf_map_ops cgroup_array_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, @@ -1190,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 array_of_map_gen_lookup(struct bpf_map *map, +static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c new file mode 100644 index 000000000000..6edff97ad594 --- /dev/null +++ b/kernel/bpf/bpf_inode_storage.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. + */ + +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <linux/bpf_local_storage.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> +#include <linux/bpf_lsm.h> +#include <linux/btf_ids.h> +#include <linux/fdtable.h> + +DEFINE_BPF_STORAGE_CACHE(inode_cache); + +static struct bpf_local_storage __rcu ** +inode_storage_ptr(void *owner) +{ + struct inode *inode = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, + struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *inode_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + + inode_storage = rcu_dereference(bsb->storage); + if (!inode_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit); +} + +void bpf_inode_storage_free(struct inode *inode) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_inode_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_inode(inode); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Netiher the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_inode_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_inoode_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_inode_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return NULL; + + sdata = inode_storage_lookup(f->f_inode, map, true); + fput(f); + return sdata ? sdata->data : NULL; +} + +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f || !inode_storage_ptr(f->f_inode)) + return -EBADF; + + sdata = bpf_local_storage_update(f->f_inode, + (struct bpf_local_storage_map *)map, + value, map_flags); + fput(f); + return PTR_ERR_OR_ZERO(sdata); +} + +static int inode_storage_delete(struct inode *inode, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = inode_storage_lookup(inode, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct file *f; + int fd, err; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return -EBADF; + + err = inode_storage_delete(f->f_inode, map); + fput(f); + return err; +} + +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the inode_storage_ptr is not + * NULL as inode_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!inode_storage_ptr(inode)) + return (unsigned long)NULL; + + sdata = inode_storage_lookup(inode, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only called from where the inode is gurranteed + * to have a refcount and cannot be freed. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + inode, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_inode_storage_delete, + struct bpf_map *, map, struct inode *, inode) +{ + /* This helper must only called from where the inode is gurranteed + * to have a refcount and cannot be freed. + */ + return inode_storage_delete(inode, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); + return &smap->map; +} + +static void inode_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int inode_storage_map_btf_id; +const struct bpf_map_ops inode_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = inode_storage_map_alloc, + .map_free = inode_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_inode_storage_lookup_elem, + .map_update_elem = bpf_fd_inode_storage_update_elem, + .map_delete_elem = bpf_fd_inode_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &inode_storage_map_btf_id, + .map_owner_storage_ptr = inode_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode) + +const struct bpf_func_proto bpf_inode_storage_get_proto = { + .func = bpf_inode_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_inode_storage_delete_proto = { + .func = bpf_inode_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_inode_storage_btf_ids[0], +}; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8faa2ce89396..8f10e30ea0b0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -88,8 +88,8 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, mutex_lock(&seq->lock); if (!seq->buf) { - seq->size = PAGE_SIZE; - seq->buf = kmalloc(seq->size, GFP_KERNEL); + seq->size = PAGE_SIZE << 3; + seq->buf = kvmalloc(seq->size, GFP_KERNEL); if (!seq->buf) { err = -ENOMEM; goto done; @@ -390,10 +390,68 @@ out_unlock: return ret; } +static void bpf_iter_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + bpf_iter_show_fdinfo_t show_fdinfo; + + seq_printf(seq, + "target_name:\t%s\n", + iter_link->tinfo->reg_info->target); + + show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo; + if (show_fdinfo) + show_fdinfo(&iter_link->aux, seq); +} + +static int bpf_iter_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + char __user *ubuf = u64_to_user_ptr(info->iter.target_name); + bpf_iter_fill_link_info_t fill_link_info; + u32 ulen = info->iter.target_name_len; + const char *target_name; + u32 target_len; + + if (!ulen ^ !ubuf) + return -EINVAL; + + target_name = iter_link->tinfo->reg_info->target; + target_len = strlen(target_name); + info->iter.target_name_len = target_len + 1; + + if (ubuf) { + if (ulen >= target_len + 1) { + if (copy_to_user(ubuf, target_name, target_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, target_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + } + + fill_link_info = iter_link->tinfo->reg_info->fill_link_info; + if (fill_link_info) + return fill_link_info(&iter_link->aux, info); + + return 0; +} + static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, .update_prog = bpf_iter_link_replace, + .show_fdinfo = bpf_iter_link_show_fdinfo, + .fill_link_info = bpf_iter_link_fill_link_info, }; bool bpf_link_is_iter(struct bpf_link *link) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c new file mode 100644 index 000000000000..5d3a7af9ba9b --- /dev/null +++ b/kernel/bpf/bpf_local_storage.c @@ -0,0 +1,600 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <linux/bpf_local_storage.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> + +#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) + +static struct bpf_local_storage_map_bucket * +select_bucket(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) +{ + return &smap->buckets[hash_ptr(selem, smap->bucket_log)]; +} + +static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) +{ + struct bpf_map *map = &smap->map; + + if (!map->ops->map_local_storage_charge) + return 0; + + return map->ops->map_local_storage_charge(smap, owner, size); +} + +static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, + u32 size) +{ + struct bpf_map *map = &smap->map; + + if (map->ops->map_local_storage_uncharge) + map->ops->map_local_storage_uncharge(smap, owner, size); +} + +static struct bpf_local_storage __rcu ** +owner_storage(struct bpf_local_storage_map *smap, void *owner) +{ + struct bpf_map *map = &smap->map; + + return map->ops->map_owner_storage_ptr(owner); +} + +static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed(&selem->snode); +} + +static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) +{ + return !hlist_unhashed(&selem->map_node); +} + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, + void *value, bool charge_mem) +{ + struct bpf_local_storage_elem *selem; + + if (charge_mem && mem_charge(smap, owner, smap->elem_size)) + return NULL; + + selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (selem) { + if (value) + memcpy(SDATA(selem)->data, value, smap->map.value_size); + return selem; + } + + if (charge_mem) + mem_uncharge(smap, owner, smap->elem_size); + + return NULL; +} + +/* local_storage->lock must be held and selem->local_storage == local_storage. + * The caller must ensure selem->smap is still valid to be + * dereferenced for its smap->elem_size and smap->cache_idx. + */ +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem) +{ + struct bpf_local_storage_map *smap; + bool free_local_storage; + void *owner; + + smap = rcu_dereference(SDATA(selem)->smap); + owner = local_storage->owner; + + /* All uncharging on the owner must be done first. + * The owner may be freed once the last selem is unlinked + * from local_storage. + */ + if (uncharge_mem) + mem_uncharge(smap, owner, smap->elem_size); + + free_local_storage = hlist_is_singular_node(&selem->snode, + &local_storage->list); + if (free_local_storage) { + mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); + local_storage->owner = NULL; + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); + + /* local_storage is not freed now. local_storage->lock is + * still held and raw_spin_unlock_bh(&local_storage->lock) + * will be done by the caller. + * + * Although the unlock will be done under + * rcu_read_lock(), it is more intutivie to + * read if kfree_rcu(local_storage, rcu) is done + * after the raw_spin_unlock_bh(&local_storage->lock). + * + * Hence, a "bool free_local_storage" is returned + * to the caller which then calls the kfree_rcu() + * after unlock. + */ + } + hlist_del_init_rcu(&selem->snode); + if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) == + SDATA(selem)) + RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); + + kfree_rcu(selem, rcu); + + return free_local_storage; +} + +static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage *local_storage; + bool free_local_storage = false; + + if (unlikely(!selem_linked_to_storage(selem))) + /* selem has already been unlinked from sk */ + return; + + local_storage = rcu_dereference(selem->local_storage); + raw_spin_lock_bh(&local_storage->lock); + if (likely(selem_linked_to_storage(selem))) + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, true); + raw_spin_unlock_bh(&local_storage->lock); + + if (free_local_storage) + kfree_rcu(local_storage, rcu); +} + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) +{ + RCU_INIT_POINTER(selem->local_storage, local_storage); + hlist_add_head_rcu(&selem->snode, &local_storage->list); +} + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_map *smap; + struct bpf_local_storage_map_bucket *b; + + if (unlikely(!selem_linked_to_map(selem))) + /* selem has already be unlinked from smap */ + return; + + smap = rcu_dereference(SDATA(selem)->smap); + b = select_bucket(smap, selem); + raw_spin_lock_bh(&b->lock); + if (likely(selem_linked_to_map(selem))) + hlist_del_init_rcu(&selem->map_node); + raw_spin_unlock_bh(&b->lock); +} + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) +{ + struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + + raw_spin_lock_bh(&b->lock); + RCU_INIT_POINTER(SDATA(selem)->smap, smap); + hlist_add_head_rcu(&selem->map_node, &b->list); + raw_spin_unlock_bh(&b->lock); +} + +void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +{ + /* Always unlink from map before unlinking from local_storage + * because selem will be freed after successfully unlinked from + * the local_storage. + */ + bpf_selem_unlink_map(selem); + __bpf_selem_unlink_storage(selem); +} + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit) +{ + struct bpf_local_storage_data *sdata; + struct bpf_local_storage_elem *selem; + + /* Fast path (cache hit) */ + sdata = rcu_dereference(local_storage->cache[smap->cache_idx]); + if (sdata && rcu_access_pointer(sdata->smap) == smap) + return sdata; + + /* Slow path (cache miss) */ + hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + if (rcu_access_pointer(SDATA(selem)->smap) == smap) + break; + + if (!selem) + return NULL; + + sdata = SDATA(selem); + if (cacheit_lockit) { + /* spinlock is needed to avoid racing with the + * parallel delete. Otherwise, publishing an already + * deleted sdata to the cache will become a use-after-free + * problem in the next bpf_local_storage_lookup(). + */ + raw_spin_lock_bh(&local_storage->lock); + if (selem_linked_to_storage(selem)) + rcu_assign_pointer(local_storage->cache[smap->cache_idx], + sdata); + raw_spin_unlock_bh(&local_storage->lock); + } + + return sdata; +} + +static int check_flags(const struct bpf_local_storage_data *old_sdata, + u64 map_flags) +{ + if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + +int bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem) +{ + struct bpf_local_storage *prev_storage, *storage; + struct bpf_local_storage **owner_storage_ptr; + int err; + + err = mem_charge(smap, owner, sizeof(*storage)); + if (err) + return err; + + storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); + if (!storage) { + err = -ENOMEM; + goto uncharge; + } + + INIT_HLIST_HEAD(&storage->list); + raw_spin_lock_init(&storage->lock); + storage->owner = owner; + + bpf_selem_link_storage_nolock(storage, first_selem); + bpf_selem_link_map(smap, first_selem); + + owner_storage_ptr = + (struct bpf_local_storage **)owner_storage(smap, owner); + /* Publish storage to the owner. + * Instead of using any lock of the kernel object (i.e. owner), + * cmpxchg will work with any kernel object regardless what + * the running context is, bh, irq...etc. + * + * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) + * is protected by the storage->lock. Hence, when freeing + * the owner->storage, the storage->lock must be held before + * setting owner->storage ptr to NULL. + */ + prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); + if (unlikely(prev_storage)) { + bpf_selem_unlink_map(first_selem); + err = -EAGAIN; + goto uncharge; + + /* Note that even first_selem was linked to smap's + * bucket->list, first_selem can be freed immediately + * (instead of kfree_rcu) because + * bpf_local_storage_map_free() does a + * synchronize_rcu() before walking the bucket->list. + * Hence, no one is accessing selem from the + * bucket->list under rcu_read_lock(). + */ + } + + return 0; + +uncharge: + kfree(storage); + mem_uncharge(smap, owner, sizeof(*storage)); + return err; +} + +/* sk cannot be going away because it is linking new elem + * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0). + * Otherwise, it will become a leak (and other memory issues + * during map destruction). + */ +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *old_sdata = NULL; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + int err; + + /* BPF_EXIST and BPF_NOEXIST cannot be both set */ + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || + /* BPF_F_LOCK can only be used in a value with spin_lock */ + unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(&smap->map))) + return ERR_PTR(-EINVAL); + + local_storage = rcu_dereference(*owner_storage(smap, owner)); + if (!local_storage || hlist_empty(&local_storage->list)) { + /* Very first elem for the owner */ + err = check_flags(NULL, map_flags); + if (err) + return ERR_PTR(err); + + selem = bpf_selem_alloc(smap, owner, value, true); + if (!selem) + return ERR_PTR(-ENOMEM); + + err = bpf_local_storage_alloc(owner, smap, selem); + if (err) { + kfree(selem); + mem_uncharge(smap, owner, smap->elem_size); + return ERR_PTR(err); + } + + return SDATA(selem); + } + + if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) { + /* Hoping to find an old_sdata to do inline update + * such that it can avoid taking the local_storage->lock + * and changing the lists. + */ + old_sdata = + bpf_local_storage_lookup(local_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + return ERR_PTR(err); + if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) { + copy_map_value_locked(&smap->map, old_sdata->data, + value, false); + return old_sdata; + } + } + + raw_spin_lock_bh(&local_storage->lock); + + /* Recheck local_storage->list under local_storage->lock */ + if (unlikely(hlist_empty(&local_storage->list))) { + /* A parallel del is happening and local_storage is going + * away. It has just been checked before, so very + * unlikely. Return instead of retry to keep things + * simple. + */ + err = -EAGAIN; + goto unlock_err; + } + + old_sdata = bpf_local_storage_lookup(local_storage, smap, false); + err = check_flags(old_sdata, map_flags); + if (err) + goto unlock_err; + + if (old_sdata && (map_flags & BPF_F_LOCK)) { + copy_map_value_locked(&smap->map, old_sdata->data, value, + false); + selem = SELEM(old_sdata); + goto unlock; + } + + /* local_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during + * bpf_selem_unlink_storage_nolock(). + */ + selem = bpf_selem_alloc(smap, owner, value, !old_sdata); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } + + /* First, link the new selem to the map */ + bpf_selem_link_map(smap, selem); + + /* Second, link (and publish) the new selem to local_storage */ + bpf_selem_link_storage_nolock(local_storage, selem); + + /* Third, remove old selem, SELEM(old_sdata) */ + if (old_sdata) { + bpf_selem_unlink_map(SELEM(old_sdata)); + bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), + false); + } + +unlock: + raw_spin_unlock_bh(&local_storage->lock); + return SDATA(selem); + +unlock_err: + raw_spin_unlock_bh(&local_storage->lock); + return ERR_PTR(err); +} + +u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) +{ + u64 min_usage = U64_MAX; + u16 i, res = 0; + + spin_lock(&cache->idx_lock); + + for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) { + if (cache->idx_usage_counts[i] < min_usage) { + min_usage = cache->idx_usage_counts[i]; + res = i; + + /* Found a free cache_idx */ + if (!min_usage) + break; + } + } + cache->idx_usage_counts[res]++; + + spin_unlock(&cache->idx_lock); + + return res; +} + +void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx) +{ + spin_lock(&cache->idx_lock); + cache->idx_usage_counts[idx]--; + spin_unlock(&cache->idx_lock); +} + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map_bucket *b; + unsigned int i; + + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the owner->storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or when the storage is freed e.g. + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_local_storage_elem, map_node))) { + bpf_selem_unlink(selem); + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* While freeing the storage we may still need to access the map. + * + * e.g. when bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exit immediately. + * + * However, while freeing the storage one still needs to access the + * smap->elem_size to do the uncharging in + * bpf_selem_unlink_storage_nolock(). + * + * Hence, wait another rcu grace period for the storage to be freed. + */ + synchronize_rcu(); + + kvfree(smap->buckets); + kfree(smap); +} + +int bpf_local_storage_map_alloc_check(union bpf_attr *attr) +{ + if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->max_entries || + attr->key_size != sizeof(int) || !attr->value_size || + /* Enforce BTF for userspace sk dumping */ + !attr->btf_key_type_id || !attr->btf_value_type_id) + return -EINVAL; + + if (!bpf_capable()) + return -EPERM; + + if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE) + return -E2BIG; + + return 0; +} + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + unsigned int i; + u32 nbuckets; + u64 cost; + int ret; + + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + if (!smap) + return ERR_PTR(-ENOMEM); + bpf_map_init_from_attr(&smap->map, attr); + + nbuckets = roundup_pow_of_two(num_possible_cpus()); + /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); + cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); + + ret = bpf_map_charge_init(&smap->map.memory, cost); + if (ret < 0) { + kfree(smap); + return ERR_PTR(ret); + } + + smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, + GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + bpf_map_charge_finish(&smap->map.memory); + kfree(smap); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nbuckets; i++) { + INIT_HLIST_HEAD(&smap->buckets[i].list); + raw_spin_lock_init(&smap->buckets[i].lock); + } + + smap->elem_size = + sizeof(struct bpf_local_storage_elem) + attr->value_size; + + return smap; +} + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + u32 int_data; + + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + + return 0; +} diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fb278144e9fd..78ea8a7bd27f 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -11,6 +11,8 @@ #include <linux/bpf_lsm.h> #include <linux/kallsyms.h> #include <linux/bpf_verifier.h> +#include <net/bpf_sk_storage.h> +#include <linux/bpf_local_storage.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -45,10 +47,27 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return 0; } +static const struct bpf_func_proto * +bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_inode_storage_get: + return &bpf_inode_storage_get_proto; + case BPF_FUNC_inode_storage_delete: + return &bpf_inode_storage_delete_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; + default: + return tracing_prog_func_proto(func_id, prog); + } +} + const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = tracing_prog_func_proto, + .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 969c5d47f81f..4c3b543bb33b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -298,8 +298,7 @@ static int check_zero_holes(const struct btf_type *t, void *data) return -EINVAL; mtype = btf_type_by_id(btf_vmlinux, member->type); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - NULL, NULL); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); if (IS_ERR(mtype)) return PTR_ERR(mtype); prev_mend = moff + msize; @@ -396,8 +395,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, u32 msize; mtype = btf_type_by_id(btf_vmlinux, member->type); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - NULL, NULL); + mtype = btf_resolve_size(btf_vmlinux, mtype, &msize); if (IS_ERR(mtype)) { err = PTR_ERR(mtype); goto reset_unlock; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 91afdd4c82e3..ed7d02e8bc93 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -21,6 +21,8 @@ #include <linux/btf_ids.h> #include <linux/skmsg.h> #include <linux/perf_event.h> +#include <linux/bsearch.h> +#include <linux/btf_ids.h> #include <net/sock.h> /* BTF (BPF Type Format) is the meta data format which describes @@ -186,11 +188,6 @@ i < btf_type_vlen(struct_type); \ i++, member++) -#define for_each_vsi(i, struct_type, member) \ - for (i = 0, member = btf_type_var_secinfo(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -282,6 +279,91 @@ static const char *btf_type_str(const struct btf_type *t) return btf_kind_str[BTF_INFO_KIND(t->info)]; } +/* Chunk size we use in safe copy of data to be shown. */ +#define BTF_SHOW_OBJ_SAFE_SIZE 32 + +/* + * This is the maximum size of a base type value (equivalent to a + * 128-bit int); if we are at the end of our safe buffer and have + * less than 16 bytes space we can't be assured of being able + * to copy the next type safely, so in such cases we will initiate + * a new copy. + */ +#define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16 + +/* Type name size */ +#define BTF_SHOW_NAME_SIZE 80 + +/* + * Common data to all BTF show operations. Private show functions can add + * their own data to a structure containing a struct btf_show and consult it + * in the show callback. See btf_type_show() below. + * + * One challenge with showing nested data is we want to skip 0-valued + * data, but in order to figure out whether a nested object is all zeros + * we need to walk through it. As a result, we need to make two passes + * when handling structs, unions and arrays; the first path simply looks + * for nonzero data, while the second actually does the display. The first + * pass is signalled by show->state.depth_check being set, and if we + * encounter a non-zero value we set show->state.depth_to_show to + * the depth at which we encountered it. When we have completed the + * first pass, we will know if anything needs to be displayed if + * depth_to_show > depth. See btf_[struct,array]_show() for the + * implementation of this. + * + * Another problem is we want to ensure the data for display is safe to + * access. To support this, the anonymous "struct {} obj" tracks the data + * object and our safe copy of it. We copy portions of the data needed + * to the object "copy" buffer, but because its size is limited to + * BTF_SHOW_OBJ_COPY_LEN bytes, multiple copies may be required as we + * traverse larger objects for display. + * + * The various data type show functions all start with a call to + * btf_show_start_type() which returns a pointer to the safe copy + * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the + * raw data itself). btf_show_obj_safe() is responsible for + * using copy_from_kernel_nofault() to update the safe data if necessary + * as we traverse the object's data. skbuff-like semantics are + * used: + * + * - obj.head points to the start of the toplevel object for display + * - obj.size is the size of the toplevel object + * - obj.data points to the current point in the original data at + * which our safe data starts. obj.data will advance as we copy + * portions of the data. + * + * In most cases a single copy will suffice, but larger data structures + * such as "struct task_struct" will require many copies. The logic in + * btf_show_obj_safe() handles the logic that determines if a new + * copy_from_kernel_nofault() is needed. + */ +struct btf_show { + u64 flags; + void *target; /* target of show operation (seq file, buffer) */ + void (*showfn)(struct btf_show *show, const char *fmt, va_list args); + const struct btf *btf; + /* below are used during iteration */ + struct { + u8 depth; + u8 depth_to_show; + u8 depth_check; + u8 array_member:1, + array_terminated:1; + u16 array_encoding; + u32 type_id; + int status; /* non-zero for error */ + const struct btf_type *type; + const struct btf_member *member; + char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */ + } state; + struct { + u32 size; + void *head; + void *data; + u8 safe[BTF_SHOW_OBJ_SAFE_SIZE]; + } obj; +}; + struct btf_kind_operations { s32 (*check_meta)(struct btf_verifier_env *env, const struct btf_type *t, @@ -298,9 +380,9 @@ struct btf_kind_operations { const struct btf_type *member_type); void (*log_details)(struct btf_verifier_env *env, const struct btf_type *t); - void (*seq_show)(const struct btf *btf, const struct btf_type *t, + void (*show)(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offsets, - struct seq_file *m); + struct btf_show *show); }; static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; @@ -353,16 +435,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -/* union is only a special case of struct: - * all its offsetof(member) == 0 - */ -static bool btf_type_is_struct(const struct btf_type *t) -{ - u8 kind = BTF_INFO_KIND(t->info); - - return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; -} - static bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; @@ -373,11 +445,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_var(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; @@ -526,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t) return (const struct btf_var *)(t + 1); } -static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) -{ - return (const struct btf_var_secinfo *)(t + 1); -} - static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -677,6 +739,488 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, return true; } +/* Similar to btf_type_skip_modifiers() but does not skip typedefs. */ +static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf, + u32 id) +{ + const struct btf_type *t = btf_type_by_id(btf, id); + + while (btf_type_is_modifier(t) && + BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) { + id = t->type; + t = btf_type_by_id(btf, t->type); + } + + return t; +} + +#define BTF_SHOW_MAX_ITER 10 + +#define BTF_KIND_BIT(kind) (1ULL << kind) + +/* + * Populate show->state.name with type name information. + * Format of type name is + * + * [.member_name = ] (type_name) + */ +static const char *btf_show_name(struct btf_show *show) +{ + /* BTF_MAX_ITER array suffixes "[]" */ + const char *array_suffixes = "[][][][][][][][][][]"; + const char *array_suffix = &array_suffixes[strlen(array_suffixes)]; + /* BTF_MAX_ITER pointer suffixes "*" */ + const char *ptr_suffixes = "**********"; + const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; + const char *name = NULL, *prefix = "", *parens = ""; + const struct btf_member *m = show->state.member; + const struct btf_type *t = show->state.type; + const struct btf_array *array; + u32 id = show->state.type_id; + const char *member = NULL; + bool show_member = false; + u64 kinds = 0; + int i; + + show->state.name[0] = '\0'; + + /* + * Don't show type name if we're showing an array member; + * in that case we show the array type so don't need to repeat + * ourselves for each member. + */ + if (show->state.array_member) + return ""; + + /* Retrieve member name, if any. */ + if (m) { + member = btf_name_by_offset(show->btf, m->name_off); + show_member = strlen(member) > 0; + id = m->type; + } + + /* + * Start with type_id, as we have resolved the struct btf_type * + * via btf_modifier_show() past the parent typedef to the child + * struct, int etc it is defined as. In such cases, the type_id + * still represents the starting type while the struct btf_type * + * in our show->state points at the resolved type of the typedef. + */ + t = btf_type_by_id(show->btf, id); + if (!t) + return ""; + + /* + * The goal here is to build up the right number of pointer and + * array suffixes while ensuring the type name for a typedef + * is represented. Along the way we accumulate a list of + * BTF kinds we have encountered, since these will inform later + * display; for example, pointer types will not require an + * opening "{" for struct, we will just display the pointer value. + * + * We also want to accumulate the right number of pointer or array + * indices in the format string while iterating until we get to + * the typedef/pointee/array member target type. + * + * We start by pointing at the end of pointer and array suffix + * strings; as we accumulate pointers and arrays we move the pointer + * or array string backwards so it will show the expected number of + * '*' or '[]' for the type. BTF_SHOW_MAX_ITER of nesting of pointers + * and/or arrays and typedefs are supported as a precaution. + * + * We also want to get typedef name while proceeding to resolve + * type it points to so that we can add parentheses if it is a + * "typedef struct" etc. + */ + for (i = 0; i < BTF_SHOW_MAX_ITER; i++) { + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + if (!name) + name = btf_name_by_offset(show->btf, + t->name_off); + kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF); + id = t->type; + break; + case BTF_KIND_ARRAY: + kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY); + parens = "["; + if (!t) + return ""; + array = btf_type_array(t); + if (array_suffix > array_suffixes) + array_suffix -= 2; + id = array->type; + break; + case BTF_KIND_PTR: + kinds |= BTF_KIND_BIT(BTF_KIND_PTR); + if (ptr_suffix > ptr_suffixes) + ptr_suffix -= 1; + id = t->type; + break; + default: + id = 0; + break; + } + if (!id) + break; + t = btf_type_skip_qualifiers(show->btf, id); + } + /* We may not be able to represent this type; bail to be safe */ + if (i == BTF_SHOW_MAX_ITER) + return ""; + + if (!name) + name = btf_name_by_offset(show->btf, t->name_off); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ? + "struct" : "union"; + /* if it's an array of struct/union, parens is already set */ + if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY)))) + parens = "{"; + break; + case BTF_KIND_ENUM: + prefix = "enum"; + break; + default: + break; + } + + /* pointer does not require parens */ + if (kinds & BTF_KIND_BIT(BTF_KIND_PTR)) + parens = ""; + /* typedef does not require struct/union/enum prefix */ + if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF)) + prefix = ""; + + if (!name) + name = ""; + + /* Even if we don't want type name info, we want parentheses etc */ + if (show->flags & BTF_SHOW_NONAME) + snprintf(show->state.name, sizeof(show->state.name), "%s", + parens); + else + snprintf(show->state.name, sizeof(show->state.name), + "%s%s%s(%s%s%s%s%s%s)%s", + /* first 3 strings comprise ".member = " */ + show_member ? "." : "", + show_member ? member : "", + show_member ? " = " : "", + /* ...next is our prefix (struct, enum, etc) */ + prefix, + strlen(prefix) > 0 && strlen(name) > 0 ? " " : "", + /* ...this is the type name itself */ + name, + /* ...suffixed by the appropriate '*', '[]' suffixes */ + strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix, + array_suffix, parens); + + return show->state.name; +} + +static const char *__btf_show_indent(struct btf_show *show) +{ + const char *indents = " "; + const char *indent = &indents[strlen(indents)]; + + if ((indent - show->state.depth) >= indents) + return indent - show->state.depth; + return indents; +} + +static const char *btf_show_indent(struct btf_show *show) +{ + return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show); +} + +static const char *btf_show_newline(struct btf_show *show) +{ + return show->flags & BTF_SHOW_COMPACT ? "" : "\n"; +} + +static const char *btf_show_delim(struct btf_show *show) +{ + if (show->state.depth == 0) + return ""; + + if ((show->flags & BTF_SHOW_COMPACT) && show->state.type && + BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION) + return "|"; + + return ","; +} + +__printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...) +{ + va_list args; + + if (!show->state.depth_check) { + va_start(args, fmt); + show->showfn(show, fmt, args); + va_end(args); + } +} + +/* Macros are used here as btf_show_type_value[s]() prepends and appends + * format specifiers to the format specifier passed in; these do the work of + * adding indentation, delimiters etc while the caller simply has to specify + * the type value(s) in the format specifier + value(s). + */ +#define btf_show_type_value(show, fmt, value) \ + do { \ + if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \ + show->state.depth == 0) { \ + btf_show(show, "%s%s" fmt "%s%s", \ + btf_show_indent(show), \ + btf_show_name(show), \ + value, btf_show_delim(show), \ + btf_show_newline(show)); \ + if (show->state.depth > show->state.depth_to_show) \ + show->state.depth_to_show = show->state.depth; \ + } \ + } while (0) + +#define btf_show_type_values(show, fmt, ...) \ + do { \ + btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \ + btf_show_name(show), \ + __VA_ARGS__, btf_show_delim(show), \ + btf_show_newline(show)); \ + if (show->state.depth > show->state.depth_to_show) \ + show->state.depth_to_show = show->state.depth; \ + } while (0) + +/* How much is left to copy to safe buffer after @data? */ +static int btf_show_obj_size_left(struct btf_show *show, void *data) +{ + return show->obj.head + show->obj.size - data; +} + +/* Is object pointed to by @data of @size already copied to our safe buffer? */ +static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size) +{ + return data >= show->obj.data && + (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE); +} + +/* + * If object pointed to by @data of @size falls within our safe buffer, return + * the equivalent pointer to the same safe data. Assumes + * copy_from_kernel_nofault() has already happened and our safe buffer is + * populated. + */ +static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size) +{ + if (btf_show_obj_is_safe(show, data, size)) + return show->obj.safe + (data - show->obj.data); + return NULL; +} + +/* + * Return a safe-to-access version of data pointed to by @data. + * We do this by copying the relevant amount of information + * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault(). + * + * If BTF_SHOW_UNSAFE is specified, just return data as-is; no + * safe copy is needed. + * + * Otherwise we need to determine if we have the required amount + * of data (determined by the @data pointer and the size of the + * largest base type we can encounter (represented by + * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures + * that we will be able to print some of the current object, + * and if more is needed a copy will be triggered. + * Some objects such as structs will not fit into the buffer; + * in such cases additional copies when we iterate over their + * members may be needed. + * + * btf_show_obj_safe() is used to return a safe buffer for + * btf_show_start_type(); this ensures that as we recurse into + * nested types we always have safe data for the given type. + * This approach is somewhat wasteful; it's possible for example + * that when iterating over a large union we'll end up copying the + * same data repeatedly, but the goal is safety not performance. + * We use stack data as opposed to per-CPU buffers because the + * iteration over a type can take some time, and preemption handling + * would greatly complicate use of the safe buffer. + */ +static void *btf_show_obj_safe(struct btf_show *show, + const struct btf_type *t, + void *data) +{ + const struct btf_type *rt; + int size_left, size; + void *safe = NULL; + + if (show->flags & BTF_SHOW_UNSAFE) + return data; + + rt = btf_resolve_size(show->btf, t, &size); + if (IS_ERR(rt)) { + show->state.status = PTR_ERR(rt); + return NULL; + } + + /* + * Is this toplevel object? If so, set total object size and + * initialize pointers. Otherwise check if we still fall within + * our safe object data. + */ + if (show->state.depth == 0) { + show->obj.size = size; + show->obj.head = data; + } else { + /* + * If the size of the current object is > our remaining + * safe buffer we _may_ need to do a new copy. However + * consider the case of a nested struct; it's size pushes + * us over the safe buffer limit, but showing any individual + * struct members does not. In such cases, we don't need + * to initiate a fresh copy yet; however we definitely need + * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left + * in our buffer, regardless of the current object size. + * The logic here is that as we resolve types we will + * hit a base type at some point, and we need to be sure + * the next chunk of data is safely available to display + * that type info safely. We cannot rely on the size of + * the current object here because it may be much larger + * than our current buffer (e.g. task_struct is 8k). + * All we want to do here is ensure that we can print the + * next basic type, which we can if either + * - the current type size is within the safe buffer; or + * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in + * the safe buffer. + */ + safe = __btf_show_obj_safe(show, data, + min(size, + BTF_SHOW_OBJ_BASE_TYPE_SIZE)); + } + + /* + * We need a new copy to our safe object, either because we haven't + * yet copied and are intializing safe data, or because the data + * we want falls outside the boundaries of the safe object. + */ + if (!safe) { + size_left = btf_show_obj_size_left(show, data); + if (size_left > BTF_SHOW_OBJ_SAFE_SIZE) + size_left = BTF_SHOW_OBJ_SAFE_SIZE; + show->state.status = copy_from_kernel_nofault(show->obj.safe, + data, size_left); + if (!show->state.status) { + show->obj.data = data; + safe = show->obj.safe; + } + } + + return safe; +} + +/* + * Set the type we are starting to show and return a safe data pointer + * to be used for showing the associated data. + */ +static void *btf_show_start_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, void *data) +{ + show->state.type = t; + show->state.type_id = type_id; + show->state.name[0] = '\0'; + + return btf_show_obj_safe(show, t, data); +} + +static void btf_show_end_type(struct btf_show *show) +{ + show->state.type = NULL; + show->state.type_id = 0; + show->state.name[0] = '\0'; +} + +static void *btf_show_start_aggr_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, void *data) +{ + void *safe_data = btf_show_start_type(show, t, type_id, data); + + if (!safe_data) + return safe_data; + + btf_show(show, "%s%s%s", btf_show_indent(show), + btf_show_name(show), + btf_show_newline(show)); + show->state.depth++; + return safe_data; +} + +static void btf_show_end_aggr_type(struct btf_show *show, + const char *suffix) +{ + show->state.depth--; + btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix, + btf_show_delim(show), btf_show_newline(show)); + btf_show_end_type(show); +} + +static void btf_show_start_member(struct btf_show *show, + const struct btf_member *m) +{ + show->state.member = m; +} + +static void btf_show_start_array_member(struct btf_show *show) +{ + show->state.array_member = 1; + btf_show_start_member(show, NULL); +} + +static void btf_show_end_member(struct btf_show *show) +{ + show->state.member = NULL; +} + +static void btf_show_end_array_member(struct btf_show *show) +{ + show->state.array_member = 0; + btf_show_end_member(show); +} + +static void *btf_show_start_array_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, + u16 array_encoding, + void *data) +{ + show->state.array_encoding = array_encoding; + show->state.array_terminated = 0; + return btf_show_start_aggr_type(show, t, type_id, data); +} + +static void btf_show_end_array_type(struct btf_show *show) +{ + show->state.array_encoding = 0; + show->state.array_terminated = 0; + btf_show_end_aggr_type(show, "]"); +} + +static void *btf_show_start_struct_type(struct btf_show *show, + const struct btf_type *t, + u32 type_id, + void *data) +{ + return btf_show_start_aggr_type(show, t, type_id, data); +} + +static void btf_show_end_struct_type(struct btf_show *show) +{ + btf_show_end_aggr_type(show, "}"); +} + __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, const char *fmt, ...) { @@ -1079,23 +1623,27 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) * *type_size: (x * y * sizeof(u32)). Hence, *type_size always * corresponds to the return type. * *elem_type: u32 + * *elem_id: id of u32 * *total_nelems: (x * y). Hence, individual elem size is * (*type_size / *total_nelems) + * *type_id: id of type if it's changed within the function, 0 if not * * type: is not an array (e.g. const struct X) * return type: type "struct X" * *type_size: sizeof(struct X) * *elem_type: same as return type ("struct X") + * *elem_id: 0 * *total_nelems: 1 + * *type_id: id of type if it's changed within the function, 0 if not */ -const struct btf_type * -btf_resolve_size(const struct btf *btf, const struct btf_type *type, - u32 *type_size, const struct btf_type **elem_type, - u32 *total_nelems) +static const struct btf_type * +__btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size, const struct btf_type **elem_type, + u32 *elem_id, u32 *total_nelems, u32 *type_id) { const struct btf_type *array_type = NULL; - const struct btf_array *array; - u32 i, size, nelems = 1; + const struct btf_array *array = NULL; + u32 i, size, nelems = 1, id = 0; for (i = 0; i < MAX_RESOLVE_DEPTH; i++) { switch (BTF_INFO_KIND(type->info)) { @@ -1116,6 +1664,7 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + id = type->type; type = btf_type_by_id(btf, type->type); break; @@ -1146,10 +1695,21 @@ resolved: *total_nelems = nelems; if (elem_type) *elem_type = type; + if (elem_id) + *elem_id = array ? array->type : 0; + if (type_id && id) + *type_id = id; return array_type ? : type; } +const struct btf_type * +btf_resolve_size(const struct btf *btf, const struct btf_type *type, + u32 *type_size) +{ + return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); +} + /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) @@ -1250,11 +1810,11 @@ static int btf_df_resolve(struct btf_verifier_env *env, return -EINVAL; } -static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offsets, - struct seq_file *m) +static void btf_df_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct btf_show *show) { - seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); + btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); } static int btf_int_check_member(struct btf_verifier_env *env, @@ -1427,7 +1987,7 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } -static void btf_int128_print(struct seq_file *m, void *data) +static void btf_int128_print(struct btf_show *show, void *data) { /* data points to a __int128 number. * Suppose @@ -1446,9 +2006,10 @@ static void btf_int128_print(struct seq_file *m, void *data) lower_num = *(u64 *)data; #endif if (upper_num == 0) - seq_printf(m, "0x%llx", lower_num); + btf_show_type_value(show, "0x%llx", lower_num); else - seq_printf(m, "0x%llx%016llx", upper_num, lower_num); + btf_show_type_values(show, "0x%llx%016llx", upper_num, + lower_num); } static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, @@ -1492,8 +2053,8 @@ static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, #endif } -static void btf_bitfield_seq_show(void *data, u8 bits_offset, - u8 nr_bits, struct seq_file *m) +static void btf_bitfield_show(void *data, u8 bits_offset, + u8 nr_bits, struct btf_show *show) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; @@ -1513,14 +2074,14 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, right_shift_bits = BITS_PER_U128 - nr_bits; btf_int128_shift(print_num, left_shift_bits, right_shift_bits); - btf_int128_print(m, print_num); + btf_int128_print(show, print_num); } -static void btf_int_bits_seq_show(const struct btf *btf, - const struct btf_type *t, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_int_bits_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct btf_show *show) { u32 int_data = btf_type_int(t); u8 nr_bits = BTF_INT_BITS(int_data); @@ -1533,55 +2094,77 @@ static void btf_int_bits_seq_show(const struct btf *btf, total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); - btf_bitfield_seq_show(data, bits_offset, nr_bits, m); + btf_bitfield_show(data, bits_offset, nr_bits, show); } -static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_int_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { u32 int_data = btf_type_int(t); u8 encoding = BTF_INT_ENCODING(int_data); bool sign = encoding & BTF_INT_SIGNED; u8 nr_bits = BTF_INT_BITS(int_data); + void *safe_data; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; if (bits_offset || BTF_INT_OFFSET(int_data) || BITS_PER_BYTE_MASKED(nr_bits)) { - btf_int_bits_seq_show(btf, t, data, bits_offset, m); - return; + btf_int_bits_show(btf, t, safe_data, bits_offset, show); + goto out; } switch (nr_bits) { case 128: - btf_int128_print(m, data); + btf_int128_print(show, safe_data); break; case 64: if (sign) - seq_printf(m, "%lld", *(s64 *)data); + btf_show_type_value(show, "%lld", *(s64 *)safe_data); else - seq_printf(m, "%llu", *(u64 *)data); + btf_show_type_value(show, "%llu", *(u64 *)safe_data); break; case 32: if (sign) - seq_printf(m, "%d", *(s32 *)data); + btf_show_type_value(show, "%d", *(s32 *)safe_data); else - seq_printf(m, "%u", *(u32 *)data); + btf_show_type_value(show, "%u", *(u32 *)safe_data); break; case 16: if (sign) - seq_printf(m, "%d", *(s16 *)data); + btf_show_type_value(show, "%d", *(s16 *)safe_data); else - seq_printf(m, "%u", *(u16 *)data); + btf_show_type_value(show, "%u", *(u16 *)safe_data); break; case 8: + if (show->state.array_encoding == BTF_INT_CHAR) { + /* check for null terminator */ + if (show->state.array_terminated) + break; + if (*(char *)data == '\0') { + show->state.array_terminated = 1; + break; + } + if (isprint(*(char *)data)) { + btf_show_type_value(show, "'%c'", + *(char *)safe_data); + break; + } + } if (sign) - seq_printf(m, "%d", *(s8 *)data); + btf_show_type_value(show, "%d", *(s8 *)safe_data); else - seq_printf(m, "%u", *(u8 *)data); + btf_show_type_value(show, "%u", *(u8 *)safe_data); break; default: - btf_int_bits_seq_show(btf, t, data, bits_offset, m); + btf_int_bits_show(btf, t, safe_data, bits_offset, show); + break; } +out: + btf_show_end_type(show); } static const struct btf_kind_operations int_ops = { @@ -1590,7 +2173,7 @@ static const struct btf_kind_operations int_ops = { .check_member = btf_int_check_member, .check_kflag_member = btf_int_check_kflag_member, .log_details = btf_int_log, - .seq_show = btf_int_seq_show, + .show = btf_int_show, }; static int btf_modifier_check_member(struct btf_verifier_env *env, @@ -1854,34 +2437,44 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, return 0; } -static void btf_modifier_seq_show(const struct btf *btf, - const struct btf_type *t, - u32 type_id, void *data, - u8 bits_offset, struct seq_file *m) +static void btf_modifier_show(const struct btf *btf, + const struct btf_type *t, + u32 type_id, void *data, + u8 bits_offset, struct btf_show *show) { if (btf->resolved_ids) t = btf_type_id_resolve(btf, &type_id); else t = btf_type_skip_modifiers(btf, type_id, NULL); - btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); + btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } -static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_var_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { t = btf_type_id_resolve(btf, &type_id); - btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); + btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show); } -static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_ptr_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { - /* It is a hashed value */ - seq_printf(m, "%p", *(void **)data); + void *safe_data; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */ + if (show->flags & BTF_SHOW_PTR_RAW) + btf_show_type_value(show, "0x%px", *(void **)safe_data); + else + btf_show_type_value(show, "0x%p", *(void **)safe_data); + btf_show_end_type(show); } static void btf_ref_type_log(struct btf_verifier_env *env, @@ -1896,7 +2489,7 @@ static struct btf_kind_operations modifier_ops = { .check_member = btf_modifier_check_member, .check_kflag_member = btf_modifier_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_modifier_seq_show, + .show = btf_modifier_show, }; static struct btf_kind_operations ptr_ops = { @@ -1905,7 +2498,7 @@ static struct btf_kind_operations ptr_ops = { .check_member = btf_ptr_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_ptr_seq_show, + .show = btf_ptr_show, }; static s32 btf_fwd_check_meta(struct btf_verifier_env *env, @@ -1946,7 +2539,7 @@ static struct btf_kind_operations fwd_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_fwd_type_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static int btf_array_check_member(struct btf_verifier_env *env, @@ -2105,28 +2698,90 @@ static void btf_array_log(struct btf_verifier_env *env, array->type, array->index_type, array->nelems); } -static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void __btf_array_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_array *array = btf_type_array(t); const struct btf_kind_operations *elem_ops; const struct btf_type *elem_type; - u32 i, elem_size, elem_type_id; + u32 i, elem_size = 0, elem_type_id; + u16 encoding = 0; elem_type_id = array->type; - elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL); + if (elem_type && btf_type_has_size(elem_type)) + elem_size = elem_type->size; + + if (elem_type && btf_type_is_int(elem_type)) { + u32 int_type = btf_type_int(elem_type); + + encoding = BTF_INT_ENCODING(int_type); + + /* + * BTF_INT_CHAR encoding never seems to be set for + * char arrays, so if size is 1 and element is + * printable as a char, we'll do that. + */ + if (elem_size == 1) + encoding = BTF_INT_CHAR; + } + + if (!btf_show_start_array_type(show, t, type_id, encoding, data)) + return; + + if (!elem_type) + goto out; elem_ops = btf_type_ops(elem_type); - seq_puts(m, "["); + for (i = 0; i < array->nelems; i++) { - if (i) - seq_puts(m, ","); - elem_ops->seq_show(btf, elem_type, elem_type_id, data, - bits_offset, m); + btf_show_start_array_member(show); + + elem_ops->show(btf, elem_type, elem_type_id, data, + bits_offset, show); data += elem_size; + + btf_show_end_array_member(show); + + if (show->state.array_terminated) + break; + } +out: + btf_show_end_array_type(show); +} + +static void btf_array_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_member *m = show->state.member; + + /* + * First check if any members would be shown (are non-zero). + * See comments above "struct btf_show" definition for more + * details on how this works at a high-level. + */ + if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { + if (!show->state.depth_check) { + show->state.depth_check = show->state.depth + 1; + show->state.depth_to_show = 0; + } + __btf_array_show(btf, t, type_id, data, bits_offset, show); + show->state.member = m; + + if (show->state.depth_check != show->state.depth + 1) + return; + show->state.depth_check = 0; + + if (show->state.depth_to_show <= show->state.depth) + return; + /* + * Reaching here indicates we have recursed and found + * non-zero array member(s). + */ } - seq_puts(m, "]"); + __btf_array_show(btf, t, type_id, data, bits_offset, show); } static struct btf_kind_operations array_ops = { @@ -2135,7 +2790,7 @@ static struct btf_kind_operations array_ops = { .check_member = btf_array_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_array_log, - .seq_show = btf_array_seq_show, + .show = btf_array_show, }; static int btf_struct_check_member(struct btf_verifier_env *env, @@ -2358,15 +3013,18 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) return off; } -static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { - const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ","; const struct btf_member *member; + void *safe_data; u32 i; - seq_puts(m, "{"); + safe_data = btf_show_start_struct_type(show, t, type_id, data); + if (!safe_data) + return; + for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); @@ -2375,23 +3033,65 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, u32 bytes_offset; u8 bits8_offset; - if (i) - seq_puts(m, seq); + btf_show_start_member(show, member); member_offset = btf_member_bit_offset(t, member); bitfield_size = btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { - btf_bitfield_seq_show(data + bytes_offset, bits8_offset, - bitfield_size, m); + safe_data = btf_show_start_type(show, member_type, + member->type, + data + bytes_offset); + if (safe_data) + btf_bitfield_show(safe_data, + bits8_offset, + bitfield_size, show); + btf_show_end_type(show); } else { ops = btf_type_ops(member_type); - ops->seq_show(btf, member_type, member->type, - data + bytes_offset, bits8_offset, m); + ops->show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, show); } + + btf_show_end_member(show); } - seq_puts(m, "}"); + + btf_show_end_struct_type(show); +} + +static void btf_struct_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) +{ + const struct btf_member *m = show->state.member; + + /* + * First check if any members would be shown (are non-zero). + * See comments above "struct btf_show" definition for more + * details on how this works at a high-level. + */ + if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) { + if (!show->state.depth_check) { + show->state.depth_check = show->state.depth + 1; + show->state.depth_to_show = 0; + } + __btf_struct_show(btf, t, type_id, data, bits_offset, show); + /* Restore saved member data here */ + show->state.member = m; + if (show->state.depth_check != show->state.depth + 1) + return; + show->state.depth_check = 0; + + if (show->state.depth_to_show <= show->state.depth) + return; + /* + * Reaching here indicates we have recursed and found + * non-zero child values. + */ + } + + __btf_struct_show(btf, t, type_id, data, bits_offset, show); } static struct btf_kind_operations struct_ops = { @@ -2400,7 +3100,7 @@ static struct btf_kind_operations struct_ops = { .check_member = btf_struct_check_member, .check_kflag_member = btf_generic_check_kflag_member, .log_details = btf_struct_log, - .seq_show = btf_struct_seq_show, + .show = btf_struct_show, }; static int btf_enum_check_member(struct btf_verifier_env *env, @@ -2531,24 +3231,35 @@ static void btf_enum_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, - u32 type_id, void *data, u8 bits_offset, - struct seq_file *m) +static void btf_enum_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_enum *enums = btf_type_enum(t); u32 i, nr_enums = btf_type_vlen(t); - int v = *(int *)data; + void *safe_data; + int v; + + safe_data = btf_show_start_type(show, t, type_id, data); + if (!safe_data) + return; + + v = *(int *)safe_data; for (i = 0; i < nr_enums; i++) { - if (v == enums[i].val) { - seq_printf(m, "%s", - __btf_name_by_offset(btf, - enums[i].name_off)); - return; - } + if (v != enums[i].val) + continue; + + btf_show_type_value(show, "%s", + __btf_name_by_offset(btf, + enums[i].name_off)); + + btf_show_end_type(show); + return; } - seq_printf(m, "%d", v); + btf_show_type_value(show, "%d", v); + btf_show_end_type(show); } static struct btf_kind_operations enum_ops = { @@ -2557,7 +3268,7 @@ static struct btf_kind_operations enum_ops = { .check_member = btf_enum_check_member, .check_kflag_member = btf_enum_check_kflag_member, .log_details = btf_enum_log, - .seq_show = btf_enum_seq_show, + .show = btf_enum_show, }; static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, @@ -2644,7 +3355,7 @@ static struct btf_kind_operations func_proto_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_func_proto_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static s32 btf_func_check_meta(struct btf_verifier_env *env, @@ -2678,7 +3389,7 @@ static struct btf_kind_operations func_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, - .seq_show = btf_df_seq_show, + .show = btf_df_show, }; static s32 btf_var_check_meta(struct btf_verifier_env *env, @@ -2742,7 +3453,7 @@ static const struct btf_kind_operations var_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_var_log, - .seq_show = btf_var_seq_show, + .show = btf_var_show, }; static s32 btf_datasec_check_meta(struct btf_verifier_env *env, @@ -2868,24 +3579,28 @@ static void btf_datasec_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -static void btf_datasec_seq_show(const struct btf *btf, - const struct btf_type *t, u32 type_id, - void *data, u8 bits_offset, - struct seq_file *m) +static void btf_datasec_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct btf_show *show) { const struct btf_var_secinfo *vsi; const struct btf_type *var; u32 i; - seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + if (!btf_show_start_type(show, t, type_id, data)) + return; + + btf_show_type_value(show, "section (\"%s\") = {", + __btf_name_by_offset(btf, t->name_off)); for_each_vsi(i, t, vsi) { var = btf_type_by_id(btf, vsi->type); if (i) - seq_puts(m, ","); - btf_type_ops(var)->seq_show(btf, var, vsi->type, - data + vsi->offset, bits_offset, m); + btf_show(show, ","); + btf_type_ops(var)->show(btf, var, vsi->type, + data + vsi->offset, bits_offset, show); } - seq_puts(m, "}"); + btf_show_end_type(show); } static const struct btf_kind_operations datasec_ops = { @@ -2894,7 +3609,7 @@ static const struct btf_kind_operations datasec_ops = { .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_datasec_log, - .seq_show = btf_datasec_seq_show, + .show = btf_datasec_show, }; static int btf_func_proto_check(struct btf_verifier_env *env, @@ -3688,7 +4403,7 @@ errout: struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { - struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; if (tgt_prog) { return tgt_prog->aux->btf; @@ -3715,7 +4430,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const struct btf_type *t = prog->aux->attach_func_proto; - struct bpf_prog *tgt_prog = prog->aux->linked_prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; struct btf *btf = bpf_prog_get_target_btf(prog); const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; @@ -3842,7 +4557,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { - ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); + enum bpf_prog_type tgt_type; + + if (tgt_prog->type == BPF_PROG_TYPE_EXT) + tgt_type = tgt_prog->aux->saved_dst_prog_type; + else + tgt_type = tgt_prog->type; + + ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { info->btf_id = ret; return true; @@ -3870,16 +4592,22 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; } -int btf_struct_access(struct bpf_verifier_log *log, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, - u32 *next_btf_id) +enum bpf_struct_walk_result { + /* < 0 error */ + WALK_SCALAR = 0, + WALK_PTR, + WALK_STRUCT, +}; + +static int btf_struct_walk(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + u32 *next_btf_id) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname; - u32 vlen; + u32 vlen, elem_id, mid; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3915,14 +4643,13 @@ again: /* Only allow structure for now, can be relaxed for * other types later. */ - elem_type = btf_type_skip_modifiers(btf_vmlinux, - array_elem->type, NULL); - if (!btf_type_is_struct(elem_type)) + t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type, + NULL); + if (!btf_type_is_struct(t)) goto error; - off = (off - moff) % elem_type->size; - return btf_struct_access(log, elem_type, off, size, atype, - next_btf_id); + off = (off - moff) % t->size; + goto again; error: bpf_log(log, "access beyond struct %s at off %u size %u\n", @@ -3951,7 +4678,7 @@ error: */ if (off <= moff && BITS_ROUNDUP_BYTES(end_bit) <= off + size) - return SCALAR_VALUE; + return WALK_SCALAR; /* off may be accessing a following member * @@ -3973,11 +4700,13 @@ error: break; /* type of the field */ + mid = member->type; mtype = btf_type_by_id(btf_vmlinux, member->type); mname = __btf_name_by_offset(btf_vmlinux, member->name_off); - mtype = btf_resolve_size(btf_vmlinux, mtype, &msize, - &elem_type, &total_nelems); + mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, + &elem_type, &elem_id, &total_nelems, + &mid); if (IS_ERR(mtype)) { bpf_log(log, "field %s doesn't have size\n", mname); return -EFAULT; @@ -3991,7 +4720,7 @@ error: if (btf_type_is_array(mtype)) { u32 elem_idx; - /* btf_resolve_size() above helps to + /* __btf_resolve_size() above helps to * linearize a multi-dimensional array. * * The logic here is treating an array @@ -4039,6 +4768,7 @@ error: elem_idx = (off - moff) / msize; moff += elem_idx * msize; mtype = elem_type; + mid = elem_id; } /* the 'off' we're looking for is either equal to start @@ -4048,6 +4778,12 @@ error: /* our field must be inside that union or struct */ t = mtype; + /* return if the offset matches the member offset */ + if (off == moff) { + *next_btf_id = mid; + return WALK_STRUCT; + } + /* adjust offset we're looking for */ off -= moff; goto again; @@ -4063,11 +4799,10 @@ error: mname, moff, tname, off, size); return -EACCES; } - stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; - return PTR_TO_BTF_ID; + return WALK_PTR; } } @@ -4084,23 +4819,82 @@ error: return -EACCES; } - return SCALAR_VALUE; + return WALK_SCALAR; } bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off); return -EINVAL; } -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int arg) +int btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype __maybe_unused, + u32 *next_btf_id) +{ + int err; + u32 id; + + do { + err = btf_struct_walk(log, t, off, size, &id); + + switch (err) { + case WALK_PTR: + /* If we found the pointer or scalar on t+off, + * we're done. + */ + *next_btf_id = id; + return PTR_TO_BTF_ID; + case WALK_SCALAR: + return SCALAR_VALUE; + case WALK_STRUCT: + /* We found nested struct, so continue the search + * by diving in it. At this point the offset is + * aligned with the new type, so set it to 0. + */ + t = btf_type_by_id(btf_vmlinux, id); + off = 0; + break; + default: + /* It's either error or unknown return value.. + * scream and leave. + */ + if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value")) + return -EINVAL; + return err; + } + } while (t); + + return -EINVAL; +} + +bool btf_struct_ids_match(struct bpf_verifier_log *log, + int off, u32 id, u32 need_type_id) { - int id; + const struct btf_type *type; + int err; - if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux) - return -EINVAL; - id = fn->btf_id[arg]; - if (!id || id > btf_vmlinux->nr_types) - return -EINVAL; - return id; + /* Are we already done? */ + if (need_type_id == id && off == 0) + return true; + +again: + type = btf_type_by_id(btf_vmlinux, id); + if (!type) + return false; + err = btf_struct_walk(log, type, off, 1, &id); + if (err != WALK_STRUCT) + return false; + + /* We found nested struct object. If it matches + * the requested ID, we're done. Otherwise let's + * continue the search with offset 0 in the new + * type. + */ + if (need_type_id != id) { + off = 0; + goto again; + } + + return true; } static int __get_type_size(struct btf *btf, u32 btf_id, @@ -4298,7 +5092,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log, } /* Compare BTFs of given program with BTF of target program */ -int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, +int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf2, const struct btf_type *t2) { struct btf *btf1 = prog->aux->btf; @@ -4306,7 +5100,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, u32 btf_id = 0; if (!prog->aux->func_info) { - bpf_log(&env->log, "Program extension requires BTF\n"); + bpf_log(log, "Program extension requires BTF\n"); return -EINVAL; } @@ -4318,7 +5112,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, if (!t1 || !btf_type_is_func(t1)) return -EFAULT; - return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2); + return btf_check_func_type_match(log, btf1, t1, btf2, t2); } /* Compare BTF of a function with given bpf_reg_state. @@ -4469,7 +5263,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return -EFAULT; } if (prog_type == BPF_PROG_TYPE_EXT) - prog_type = prog->aux->linked_prog->type; + prog_type = prog->aux->dst_prog->type; t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { @@ -4516,12 +5310,93 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return 0; } +static void btf_type_show(const struct btf *btf, u32 type_id, void *obj, + struct btf_show *show) +{ + const struct btf_type *t = btf_type_by_id(btf, type_id); + + show->btf = btf; + memset(&show->state, 0, sizeof(show->state)); + memset(&show->obj, 0, sizeof(show->obj)); + + btf_type_ops(t)->show(btf, t, type_id, obj, 0, show); +} + +static void btf_seq_show(struct btf_show *show, const char *fmt, + va_list args) +{ + seq_vprintf((struct seq_file *)show->target, fmt, args); +} + +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, + void *obj, struct seq_file *m, u64 flags) +{ + struct btf_show sseq; + + sseq.target = m; + sseq.showfn = btf_seq_show; + sseq.flags = flags; + + btf_type_show(btf, type_id, obj, &sseq); + + return sseq.state.status; +} + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m) { - const struct btf_type *t = btf_type_by_id(btf, type_id); + (void) btf_type_seq_show_flags(btf, type_id, obj, m, + BTF_SHOW_NONAME | BTF_SHOW_COMPACT | + BTF_SHOW_ZERO | BTF_SHOW_UNSAFE); +} + +struct btf_show_snprintf { + struct btf_show show; + int len_left; /* space left in string */ + int len; /* length we would have written */ +}; + +static void btf_snprintf_show(struct btf_show *show, const char *fmt, + va_list args) +{ + struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show; + int len; - btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); + len = vsnprintf(show->target, ssnprintf->len_left, fmt, args); + + if (len < 0) { + ssnprintf->len_left = 0; + ssnprintf->len = len; + } else if (len > ssnprintf->len_left) { + /* no space, drive on to get length we would have written */ + ssnprintf->len_left = 0; + ssnprintf->len += len; + } else { + ssnprintf->len_left -= len; + ssnprintf->len += len; + show->target += len; + } +} + +int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, + char *buf, int len, u64 flags) +{ + struct btf_show_snprintf ssnprintf; + + ssnprintf.show.target = buf; + ssnprintf.show.flags = flags; + ssnprintf.show.showfn = btf_snprintf_show; + ssnprintf.len_left = len; + ssnprintf.len = 0; + + btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); + + /* If we encontered an error, return it. */ + if (ssnprintf.show.state.status) + return ssnprintf.show.state.status; + + /* Otherwise return length we would have written */ + return ssnprintf.len; } #ifdef CONFIG_PROC_FS @@ -4661,3 +5536,15 @@ u32 btf_id(const struct btf *btf) { return btf->id; } + +static int btf_id_cmp_func(const void *a, const void *b) +{ + const int *pa = a, *pb = b; + + return *pa - *pb; +} + +bool btf_id_set_contains(const struct btf_id_set *set, u32 id) +{ + return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e21de4f1754c..6ec088a96302 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1226,7 +1226,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void **buf, size_t *pcount, loff_t *ppos, + char **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ed0b3578867c..9268d77898b7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -25,7 +25,7 @@ #include <linux/moduleloader.h> #include <linux/bpf.h> #include <linux/btf.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> @@ -98,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); + mutex_init(&fp->aux->used_maps_mutex); + mutex_init(&fp->aux->dst_mutex); return fp; } @@ -253,6 +255,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { + mutex_destroy(&fp->aux->used_maps_mutex); + mutex_destroy(&fp->aux->dst_mutex); free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); @@ -773,7 +777,8 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, if (size > poke_tab_max) return -ENOSPC; - if (poke->ip || poke->ip_stable || poke->adj_off) + if (poke->tailcall_target || poke->tailcall_target_stable || + poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { @@ -1747,8 +1752,9 @@ bool bpf_prog_array_compatible(struct bpf_array *array, static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; - int i; + int i, ret = 0; + mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; struct bpf_array *array; @@ -1757,11 +1763,15 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) continue; array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) - return -EINVAL; + if (!bpf_prog_array_compatible(array, fp)) { + ret = -EINVAL; + goto out; + } } - return 0; +out: + mutex_unlock(&aux->used_maps_mutex); + return ret; } static void bpf_prog_select_func(struct bpf_prog *fp) @@ -2130,7 +2140,8 @@ static void bpf_prog_free_deferred(struct work_struct *work) if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif - bpf_trampoline_put(aux->trampoline); + if (aux->dst_trampoline) + bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { @@ -2146,8 +2157,8 @@ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; - if (aux->linked_prog) - bpf_prog_put(aux->linked_prog); + if (aux->dst_prog) + bpf_prog_put(aux->dst_prog); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } @@ -2208,6 +2219,8 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; +const struct bpf_func_proto bpf_snprintf_btf_proto __weak; +const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 6386b7bb98f2..c61a23b564aa 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -79,8 +79,6 @@ struct bpf_cpu_map { static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list); -static int bq_flush_to_queue(struct xdp_bulk_queue *bq); - static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; @@ -157,8 +155,7 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf, +static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, struct sk_buff *skb) { unsigned int hard_start_headroom; @@ -367,7 +364,7 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf, skb); + skb = cpu_map_build_skb(xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; @@ -658,6 +655,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = cpu_map_alloc, .map_free = cpu_map_free, .map_delete_elem = cpu_map_delete_elem, @@ -669,7 +667,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_btf_id = &cpu_map_btf_id, }; -static int bq_flush_to_queue(struct xdp_bulk_queue *bq) +static void bq_flush_to_queue(struct xdp_bulk_queue *bq) { struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; @@ -678,7 +676,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) int i; if (unlikely(!bq->count)) - return 0; + return; q = rcpu->queue; spin_lock(&q->producer_lock); @@ -701,13 +699,12 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq) /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); - return 0; } /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) +static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -728,8 +725,6 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) if (!bq->flush_node.prev) list_add(&bq->flush_node, flush_list); - - return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 10abb06065bb..2b5ca93c17de 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -341,14 +341,14 @@ bool dev_map_can_have_prog(struct bpf_map *map) return false; } -static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) +static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; int sent = 0, drops = 0, err = 0; int i; if (unlikely(!bq->count)) - return 0; + return; for (i = 0; i < bq->count; i++) { struct xdp_frame *xdpf = bq->q[i]; @@ -369,7 +369,7 @@ out: trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err); bq->dev_rx = NULL; __list_del_clearprev(&bq->flush_node); - return 0; + return; error: /* If ndo_xdp_xmit fails with an errno, no frames have been * xmit'ed and it's our responsibility to them free all. @@ -421,8 +421,8 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, - struct net_device *dev_rx) +static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, + struct net_device *dev_rx) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); @@ -441,8 +441,6 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, if (!bq->flush_node.prev) list_add(&bq->flush_node, flush_list); - - return 0; } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, @@ -462,7 +460,8 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, if (unlikely(!xdpf)) return -EOVERFLOW; - return bq_enqueue(dev, xdpf, dev_rx); + bq_enqueue(dev, xdpf, dev_rx); + return 0; } static struct xdp_buff *dev_map_run_prog(struct net_device *dev, @@ -751,6 +750,7 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, @@ -764,6 +764,7 @@ const struct bpf_map_ops dev_map_ops = { static int dev_map_hash_map_btf_id; const struct bpf_map_ops dev_map_hash_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7df28a45c66b..1815e97d4c9c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include <linux/rculist_nulls.h> #include <linux/random.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate_trace.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - /* Must be called with rcu_read_lock. */ - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) * bpf_prog * __htab_map_lookup_elem */ -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) return __htab_lru_map_lookup_elem(map, key, false); } -static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1803,6 +1803,7 @@ static const struct bpf_iter_seq_info iter_seq_info = { static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1820,6 +1821,7 @@ const struct bpf_map_ops htab_map_ops = { static int htab_lru_map_btf_id; const struct bpf_map_ops htab_lru_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1940,6 +1942,7 @@ static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, static int htab_percpu_map_btf_id; const struct bpf_map_ops htab_percpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -1956,6 +1959,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { static int htab_lru_percpu_map_btf_id; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, @@ -2066,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 htab_of_map_gen_lookup(struct bpf_map *map, +static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be43ab3e619f..25520f5eeaf6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,56 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, + const void __user *, user_ptr) +{ + int ret = copy_from_user(dst, user_ptr, size); + + if (unlikely(ret)) { + memset(dst, 0, size); + ret = -EFAULT; + } + + return ret; +} + +const struct bpf_func_proto bpf_copy_from_user_proto = { + .func = bpf_copy_from_user, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) +{ + if (cpu >= nr_cpu_ids) + return (unsigned long)NULL; + + return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); +} + +const struct bpf_func_proto bpf_per_cpu_ptr_proto = { + .func = bpf_per_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) +{ + return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); +} + +const struct bpf_func_proto bpf_this_cpu_ptr_proto = { + .func = bpf_this_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -661,8 +711,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) if (!perfmon_capable()) return NULL; return bpf_get_trace_printk_proto(); + case BPF_FUNC_snprintf_btf: + if (!perfmon_capable()) + return NULL; + return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 18f4969552ac..dd4b7fd60ee7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -20,6 +20,7 @@ #include <linux/filter.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> +#include "preload/bpf_preload.h" enum bpf_type { BPF_TYPE_UNSPEC = 0, @@ -371,9 +372,10 @@ static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future - * extensions. + * extensions. That allows popoulate_bpffs() create special files. */ - if (strchr(dentry->d_name.name, '.')) + if ((dir->i_mode & S_IALLUGO) && + strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); return simple_lookup(dir, dentry, flags); @@ -411,6 +413,27 @@ static const struct inode_operations bpf_dir_iops = { .unlink = simple_unlink, }; +/* pin iterator link into bpffs */ +static int bpf_iter_link_pin_kernel(struct dentry *parent, + const char *name, struct bpf_link *link) +{ + umode_t mode = S_IFREG | S_IRUSR; + struct dentry *dentry; + int ret; + + inode_lock(parent->d_inode); + dentry = lookup_one_len(name, parent, strlen(name)); + if (IS_ERR(dentry)) { + inode_unlock(parent->d_inode); + return PTR_ERR(dentry); + } + ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, + &bpf_iter_fops); + dput(dentry); + inode_unlock(parent->d_inode); + return ret; +} + static int bpf_obj_do_pin(const char __user *pathname, void *raw, enum bpf_type type) { @@ -640,6 +663,91 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } +struct bpf_preload_ops *bpf_preload_ops; +EXPORT_SYMBOL_GPL(bpf_preload_ops); + +static bool bpf_preload_mod_get(void) +{ + /* If bpf_preload.ko wasn't loaded earlier then load it now. + * When bpf_preload is built into vmlinux the module's __init + * function will populate it. + */ + if (!bpf_preload_ops) { + request_module("bpf_preload"); + if (!bpf_preload_ops) + return false; + } + /* And grab the reference, so the module doesn't disappear while the + * kernel is interacting with the kernel module and its UMD. + */ + if (!try_module_get(bpf_preload_ops->owner)) { + pr_err("bpf_preload module get failed.\n"); + return false; + } + return true; +} + +static void bpf_preload_mod_put(void) +{ + if (bpf_preload_ops) + /* now user can "rmmod bpf_preload" if necessary */ + module_put(bpf_preload_ops->owner); +} + +static DEFINE_MUTEX(bpf_preload_lock); + +static int populate_bpffs(struct dentry *parent) +{ + struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; + struct bpf_link *links[BPF_PRELOAD_LINKS] = {}; + int err = 0, i; + + /* grab the mutex to make sure the kernel interactions with bpf_preload + * UMD are serialized + */ + mutex_lock(&bpf_preload_lock); + + /* if bpf_preload.ko wasn't built into vmlinux then load it */ + if (!bpf_preload_mod_get()) + goto out; + + if (!bpf_preload_ops->info.tgid) { + /* preload() will start UMD that will load BPF iterator programs */ + err = bpf_preload_ops->preload(objs); + if (err) + goto out_put; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + links[i] = bpf_link_by_id(objs[i].link_id); + if (IS_ERR(links[i])) { + err = PTR_ERR(links[i]); + goto out_put; + } + } + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + err = bpf_iter_link_pin_kernel(parent, + objs[i].link_name, links[i]); + if (err) + goto out_put; + /* do not unlink successfully pinned links even + * if later link fails to pin + */ + links[i] = NULL; + } + /* finish() will tell UMD process to exit */ + err = bpf_preload_ops->finish(); + if (err) + goto out_put; + } +out_put: + bpf_preload_mod_put(); +out: + mutex_unlock(&bpf_preload_lock); + for (i = 0; i < BPF_PRELOAD_LINKS && err; i++) + if (!IS_ERR_OR_NULL(links[i])) + bpf_link_put(links[i]); + return err; +} + static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; @@ -656,8 +764,8 @@ static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; + populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; - return 0; } @@ -707,6 +815,8 @@ static int __init bpf_init(void) { int ret; + mutex_init(&bpf_preload_lock); + ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) return ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 44474bf3ab7a..00e32f2ec3e6 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -732,6 +732,7 @@ static int trie_check_btf(const struct bpf_map *map, static int trie_map_btf_id; const struct bpf_map_ops trie_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = trie_alloc, .map_free = trie_free, .map_get_next_key = trie_get_next_key, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 17738c93bec8..39ab0b68cade 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -17,23 +17,17 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) if (IS_ERR(inner_map)) return inner_map; - /* prog_array->aux->{type,jited} is a runtime binding. - * Doing static check alone in the verifier is not enough. - */ - if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE || - inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { - fdput(f); - return ERR_PTR(-ENOTSUPP); - } - /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) { fdput(f); return ERR_PTR(-EINVAL); } + if (!inner_map->ops->map_meta_equal) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + if (map_value_has_spin_lock(inner_map)) { fdput(f); return ERR_PTR(-ENOTSUPP); @@ -81,15 +75,14 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->map_flags == meta1->map_flags && - meta0->max_entries == meta1->max_entries; + meta0->map_flags == meta1->map_flags; } void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int ufd) { - struct bpf_map *inner_map; + struct bpf_map *inner_map, *inner_map_meta; struct fd f; f = fdget(ufd); @@ -97,7 +90,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, if (IS_ERR(inner_map)) return inner_map; - if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map_meta = map->inner_map_meta; + if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) bpf_map_inc(inner_map); else inner_map = ERR_PTR(-EINVAL); diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index a507bf6ef8b9..bcb7534afb3c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -11,8 +11,6 @@ struct bpf_map; struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); -bool bpf_map_meta_equal(const struct bpf_map *meta0, - const struct bpf_map *meta1); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); void bpf_map_fd_put_ptr(void *ptr); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index af86048e5afd..6a9542af4212 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -149,6 +149,19 @@ static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux) bpf_map_put_with_uref(aux->map); } +void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq) +{ + seq_printf(seq, "map_id:\t%u\n", aux->map->id); +} + +int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info) +{ + info->iter.map.map_id = aux->map->id; + return 0; +} + DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, struct bpf_map *map, void *key, void *value) @@ -156,6 +169,8 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .target = "bpf_map_elem", .attach_target = bpf_iter_attach_map, .detach_target = bpf_iter_detach_map, + .show_fdinfo = bpf_iter_map_show_fdinfo, + .fill_link_info = bpf_iter_map_fill_link_info, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index b367430e611c..3d897de89061 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s) raw_spin_lock_init(&head->lock); head->first = NULL; } + raw_spin_lock_init(&s->extralist.lock); + s->extralist.first = NULL; return 0; } @@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } +static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + if (!raw_spin_trylock(&s->extralist.lock)) + return false; + + pcpu_freelist_push_node(&s->extralist, node); + raw_spin_unlock(&s->extralist.lock); + return true; +} + +static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + int cpu, orig_cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + struct pcpu_freelist_head *head; + + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + pcpu_freelist_push_node(head, node); + raw_spin_unlock(&head->lock); + return; + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + + /* cannot lock any per cpu lock, try extralist */ + if (cpu == orig_cpu && + pcpu_freelist_try_push_extra(s, node)) + return; + } +} + void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { - struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - - ___pcpu_freelist_push(head, node); + if (in_nmi()) + ___pcpu_freelist_push_nmi(s, node); + else + ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); } void pcpu_freelist_push(struct pcpu_freelist *s, @@ -81,7 +121,7 @@ again: } } -struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; @@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) if (cpu >= nr_cpu_ids) cpu = 0; if (cpu == orig_cpu) - return NULL; + break; + } + + /* per cpu lists are all empty, try extralist */ + raw_spin_lock(&s->extralist.lock); + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +static struct pcpu_freelist_node * +___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + if (raw_spin_trylock(&head->lock)) { + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + } + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + break; } + + /* cannot pop from per cpu lists, try extralist */ + if (!raw_spin_trylock(&s->extralist.lock)) + return NULL; + node = s->extralist.first; + if (node) + s->extralist.first = node->next; + raw_spin_unlock(&s->extralist.lock); + return node; +} + +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +{ + if (in_nmi()) + return ___pcpu_freelist_pop_nmi(s); + return ___pcpu_freelist_pop(s); } struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index fbf8a8a28979..3c76553cfe57 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -13,6 +13,7 @@ struct pcpu_freelist_head { struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; + struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { diff --git a/kernel/bpf/preload/.gitignore b/kernel/bpf/preload/.gitignore new file mode 100644 index 000000000000..856a4c5ad0dd --- /dev/null +++ b/kernel/bpf/preload/.gitignore @@ -0,0 +1,4 @@ +/FEATURE-DUMP.libbpf +/bpf_helper_defs.h +/feature +/bpf_preload_umd diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig new file mode 100644 index 000000000000..ace49111d3a3 --- /dev/null +++ b/kernel/bpf/preload/Kconfig @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: GPL-2.0-only +config USERMODE_DRIVER + bool + default n + +menuconfig BPF_PRELOAD + bool "Preload BPF file system with kernel specific program and map iterators" + depends on BPF + # The dependency on !COMPILE_TEST prevents it from being enabled + # in allmodconfig or allyesconfig configurations + depends on !COMPILE_TEST + select USERMODE_DRIVER + help + This builds kernel module with several embedded BPF programs that are + pinned into BPF FS mount point as human readable files that are + useful in debugging and introspection of BPF programs and maps. + +if BPF_PRELOAD +config BPF_PRELOAD_UMD + tristate "bpf_preload kernel module with user mode driver" + depends on CC_CAN_LINK + depends on m || CC_CAN_LINK_STATIC + default m + help + This builds bpf_preload kernel module with embedded user mode driver. +endif diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile new file mode 100644 index 000000000000..23ee310b6eb4 --- /dev/null +++ b/kernel/bpf/preload/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 + +LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ +LIBBPF_A = $(obj)/libbpf.a +LIBBPF_OUT = $(abspath $(obj)) + +$(LIBBPF_A): + $(Q)$(MAKE) -C $(LIBBPF_SRCS) OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a + +userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ + -I $(srctree)/tools/lib/ -Wno-unused-result + +userprogs := bpf_preload_umd + +clean-files := $(userprogs) bpf_helper_defs.h FEATURE-DUMP.libbpf staticobjs/ feature/ + +bpf_preload_umd-objs := iterators/iterators.o +bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz + +$(obj)/bpf_preload_umd: $(LIBBPF_A) + +$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd + +obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o +bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h new file mode 100644 index 000000000000..2f9932276f2e --- /dev/null +++ b/kernel/bpf/preload/bpf_preload.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_PRELOAD_H +#define _BPF_PRELOAD_H + +#include <linux/usermode_driver.h> +#include "iterators/bpf_preload_common.h" + +struct bpf_preload_ops { + struct umd_info info; + int (*preload)(struct bpf_preload_info *); + int (*finish)(void); + struct module *owner; +}; +extern struct bpf_preload_ops *bpf_preload_ops; +#define BPF_PRELOAD_LINKS 2 +#endif diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c new file mode 100644 index 000000000000..79c5772465f1 --- /dev/null +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/module.h> +#include <linux/pid.h> +#include <linux/fs.h> +#include <linux/sched/signal.h> +#include "bpf_preload.h" + +extern char bpf_preload_umd_start; +extern char bpf_preload_umd_end; + +static int preload(struct bpf_preload_info *obj); +static int finish(void); + +static struct bpf_preload_ops umd_ops = { + .info.driver_name = "bpf_preload", + .preload = preload, + .finish = finish, + .owner = THIS_MODULE, +}; + +static int preload(struct bpf_preload_info *obj) +{ + int magic = BPF_PRELOAD_START; + loff_t pos = 0; + int i, err; + ssize_t n; + + err = fork_usermode_driver(&umd_ops.info); + if (err) + return err; + + /* send the start magic to let UMD proceed with loading BPF progs */ + n = kernel_write(umd_ops.info.pipe_to_umh, + &magic, sizeof(magic), &pos); + if (n != sizeof(magic)) + return -EPIPE; + + /* receive bpf_link IDs and names from UMD */ + pos = 0; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + n = kernel_read(umd_ops.info.pipe_from_umh, + &obj[i], sizeof(*obj), &pos); + if (n != sizeof(*obj)) + return -EPIPE; + } + return 0; +} + +static int finish(void) +{ + int magic = BPF_PRELOAD_END; + struct pid *tgid; + loff_t pos = 0; + ssize_t n; + + /* send the last magic to UMD. It will do a normal exit. */ + n = kernel_write(umd_ops.info.pipe_to_umh, + &magic, sizeof(magic), &pos); + if (n != sizeof(magic)) + return -EPIPE; + tgid = umd_ops.info.tgid; + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); + umd_ops.info.tgid = NULL; + return 0; +} + +static int __init load_umd(void) +{ + int err; + + err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start, + &bpf_preload_umd_end - &bpf_preload_umd_start); + if (err) + return err; + bpf_preload_ops = &umd_ops; + return err; +} + +static void __exit fini_umd(void) +{ + bpf_preload_ops = NULL; + /* kill UMD in case it's still there due to earlier error */ + kill_pid(umd_ops.info.tgid, SIGKILL, 1); + umd_ops.info.tgid = NULL; + umd_unload_blob(&umd_ops.info); +} +late_initcall(load_umd); +module_exit(fini_umd); +MODULE_LICENSE("GPL"); diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S new file mode 100644 index 000000000000..f1f40223b5c3 --- /dev/null +++ b/kernel/bpf/preload/bpf_preload_umd_blob.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + .section .init.rodata, "a" + .global bpf_preload_umd_start +bpf_preload_umd_start: + .incbin "kernel/bpf/preload/bpf_preload_umd" + .global bpf_preload_umd_end +bpf_preload_umd_end: diff --git a/kernel/bpf/preload/iterators/.gitignore b/kernel/bpf/preload/iterators/.gitignore new file mode 100644 index 000000000000..ffdb70230c8b --- /dev/null +++ b/kernel/bpf/preload/iterators/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/.output diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile new file mode 100644 index 000000000000..28fa8c1440f4 --- /dev/null +++ b/kernel/bpf/preload/iterators/Makefile @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: GPL-2.0 +OUTPUT := .output +CLANG ?= clang +LLC ?= llc +LLVM_STRIP ?= llvm-strip +DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool +BPFTOOL ?= $(DEFAULT_BPFTOOL) +LIBBPF_SRC := $(abspath ../../../../tools/lib/bpf) +BPFOBJ := $(OUTPUT)/libbpf.a +BPF_INCLUDE := $(OUTPUT) +INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../../../tools/lib) \ + -I$(abspath ../../../../tools/include/uapi) +CFLAGS := -g -Wall + +abs_out := $(abspath $(OUTPUT)) +ifeq ($(V),1) +Q = +msg = +else +Q = @ +msg = @printf ' %-8s %s%s\n' "$(1)" "$(notdir $(2))" "$(if $(3), $(3))"; +MAKEFLAGS += --no-print-directory +submake_extras := feature_display=0 +endif + +.DELETE_ON_ERROR: + +.PHONY: all clean + +all: iterators.skel.h + +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) iterators + +iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + + +$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES) \ + -c $(filter %.c,$^) -o $@ && \ + $(LLVM_STRIP) -g $@ + +$(OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $(OUTPUT) + +$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT) + $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \ + OUTPUT=$(abspath $(dir $@))/ $(abspath $@) + +$(DEFAULT_BPFTOOL): + $(Q)$(MAKE) $(submake_extras) -C ../../../../tools/bpf/bpftool \ + prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README new file mode 100644 index 000000000000..7fd6d39a9ad2 --- /dev/null +++ b/kernel/bpf/preload/iterators/README @@ -0,0 +1,4 @@ +WARNING: +If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h". +Make sure to have clang 10 installed. +See Documentation/bpf/bpf_devel_QA.rst diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h new file mode 100644 index 000000000000..8464d1a48c05 --- /dev/null +++ b/kernel/bpf/preload/iterators/bpf_preload_common.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_PRELOAD_COMMON_H +#define _BPF_PRELOAD_COMMON_H + +#define BPF_PRELOAD_START 0x5555 +#define BPF_PRELOAD_END 0xAAAA + +struct bpf_preload_info { + char link_name[16]; + int link_id; +}; + +#endif diff --git a/kernel/bpf/preload/iterators/iterators.bpf.c b/kernel/bpf/preload/iterators/iterators.bpf.c new file mode 100644 index 000000000000..52aa7b38e8b8 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.bpf.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record) +struct seq_file; +struct bpf_iter_meta { + struct seq_file *seq; + __u64 session_id; + __u64 seq_num; +}; + +struct bpf_map { + __u32 id; + char name[16]; + __u32 max_entries; +}; + +struct bpf_iter__bpf_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; +}; + +struct btf_type { + __u32 name_off; +}; + +struct btf_header { + __u32 str_len; +}; + +struct btf { + const char *strings; + struct btf_type **types; + struct btf_header hdr; +}; + +struct bpf_prog_aux { + __u32 id; + char name[16]; + const char *attach_func_name; + struct bpf_prog *dst_prog; + struct bpf_func_info *func_info; + struct btf *btf; +}; + +struct bpf_prog { + struct bpf_prog_aux *aux; +}; + +struct bpf_iter__bpf_prog { + struct bpf_iter_meta *meta; + struct bpf_prog *prog; +}; +#pragma clang attribute pop + +static const char *get_name(struct btf *btf, long btf_id, const char *fallback) +{ + struct btf_type **types, *t; + unsigned int name_off; + const char *str; + + if (!btf) + return fallback; + str = btf->strings; + types = btf->types; + bpf_probe_read_kernel(&t, sizeof(t), types + btf_id); + name_off = BPF_CORE_READ(t, name_off); + if (name_off >= btf->hdr.str_len) + return fallback; + return str + name_off; +} + +SEC("iter/bpf_map") +int dump_bpf_map(struct bpf_iter__bpf_map *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_map *map = ctx->map; + + if (!map) + return 0; + + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id name max_entries\n"); + + BPF_SEQ_PRINTF(seq, "%4u %-16s%6d\n", map->id, map->name, map->max_entries); + return 0; +} + +SEC("iter/bpf_prog") +int dump_bpf_prog(struct bpf_iter__bpf_prog *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u64 seq_num = ctx->meta->seq_num; + struct bpf_prog *prog = ctx->prog; + struct bpf_prog_aux *aux; + + if (!prog) + return 0; + + aux = prog->aux; + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, " id name attached\n"); + + BPF_SEQ_PRINTF(seq, "%4u %-16s %s %s\n", aux->id, + get_name(aux->btf, aux->func_info[0].type_id, aux->name), + aux->attach_func_name, aux->dst_prog->aux->name); + return 0; +} +char LICENSE[] SEC("license") = "GPL"; diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c new file mode 100644 index 000000000000..b7ff87939172 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <argp.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/resource.h> +#include <bpf/libbpf.h> +#include <bpf/bpf.h> +#include <sys/mount.h> +#include "iterators.skel.h" +#include "bpf_preload_common.h" + +int to_kernel = -1; +int from_kernel = 0; + +static int send_link_to_kernel(struct bpf_link *link, const char *link_name) +{ + struct bpf_preload_info obj = {}; + struct bpf_link_info info = {}; + __u32 info_len = sizeof(info); + int err; + + err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); + if (err) + return err; + obj.link_id = info.id; + if (strlen(link_name) >= sizeof(obj.link_name)) + return -E2BIG; + strcpy(obj.link_name, link_name); + if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj)) + return -EPIPE; + return 0; +} + +int main(int argc, char **argv) +{ + struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY }; + struct iterators_bpf *skel; + int err, magic; + int debug_fd; + + debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC); + if (debug_fd < 0) + return 1; + to_kernel = dup(1); + close(1); + dup(debug_fd); + /* now stdin and stderr point to /dev/console */ + + read(from_kernel, &magic, sizeof(magic)); + if (magic != BPF_PRELOAD_START) { + printf("bad start magic %d\n", magic); + return 1; + } + setrlimit(RLIMIT_MEMLOCK, &rlim); + /* libbpf opens BPF object and loads it into the kernel */ + skel = iterators_bpf__open_and_load(); + if (!skel) { + /* iterators.skel.h is little endian. + * libbpf doesn't support automatic little->big conversion + * of BPF bytecode yet. + * The program load will fail in such case. + */ + printf("Failed load could be due to wrong endianness\n"); + return 1; + } + err = iterators_bpf__attach(skel); + if (err) + goto cleanup; + + /* send two bpf_link IDs with names to the kernel */ + err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug"); + if (err) + goto cleanup; + err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug"); + if (err) + goto cleanup; + + /* The kernel will proceed with pinnging the links in bpffs. + * UMD will wait on read from pipe. + */ + read(from_kernel, &magic, sizeof(magic)); + if (magic != BPF_PRELOAD_END) { + printf("bad final magic %d\n", magic); + err = -EINVAL; + } +cleanup: + iterators_bpf__destroy(skel); + + return err != 0; +} diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h new file mode 100644 index 000000000000..cf9a6a94b3a4 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.skel.h @@ -0,0 +1,412 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* THIS FILE IS AUTOGENERATED! */ +#ifndef __ITERATORS_BPF_SKEL_H__ +#define __ITERATORS_BPF_SKEL_H__ + +#include <stdlib.h> +#include <bpf/libbpf.h> + +struct iterators_bpf { + struct bpf_object_skeleton *skeleton; + struct bpf_object *obj; + struct { + struct bpf_map *rodata; + } maps; + struct { + struct bpf_program *dump_bpf_map; + struct bpf_program *dump_bpf_prog; + } progs; + struct { + struct bpf_link *dump_bpf_map; + struct bpf_link *dump_bpf_prog; + } links; + struct iterators_bpf__rodata { + char dump_bpf_map____fmt[35]; + char dump_bpf_map____fmt_1[14]; + char dump_bpf_prog____fmt[32]; + char dump_bpf_prog____fmt_2[17]; + } *rodata; +}; + +static void +iterators_bpf__destroy(struct iterators_bpf *obj) +{ + if (!obj) + return; + if (obj->skeleton) + bpf_object__destroy_skeleton(obj->skeleton); + free(obj); +} + +static inline int +iterators_bpf__create_skeleton(struct iterators_bpf *obj); + +static inline struct iterators_bpf * +iterators_bpf__open_opts(const struct bpf_object_open_opts *opts) +{ + struct iterators_bpf *obj; + + obj = (struct iterators_bpf *)calloc(1, sizeof(*obj)); + if (!obj) + return NULL; + if (iterators_bpf__create_skeleton(obj)) + goto err; + if (bpf_object__open_skeleton(obj->skeleton, opts)) + goto err; + + return obj; +err: + iterators_bpf__destroy(obj); + return NULL; +} + +static inline struct iterators_bpf * +iterators_bpf__open(void) +{ + return iterators_bpf__open_opts(NULL); +} + +static inline int +iterators_bpf__load(struct iterators_bpf *obj) +{ + return bpf_object__load_skeleton(obj->skeleton); +} + +static inline struct iterators_bpf * +iterators_bpf__open_and_load(void) +{ + struct iterators_bpf *obj; + + obj = iterators_bpf__open(); + if (!obj) + return NULL; + if (iterators_bpf__load(obj)) { + iterators_bpf__destroy(obj); + return NULL; + } + return obj; +} + +static inline int +iterators_bpf__attach(struct iterators_bpf *obj) +{ + return bpf_object__attach_skeleton(obj->skeleton); +} + +static inline void +iterators_bpf__detach(struct iterators_bpf *obj) +{ + return bpf_object__detach_skeleton(obj->skeleton); +} + +static inline int +iterators_bpf__create_skeleton(struct iterators_bpf *obj) +{ + struct bpf_object_skeleton *s; + + s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s)); + if (!s) + return -1; + obj->skeleton = s; + + s->sz = sizeof(*s); + s->name = "iterators_bpf"; + s->obj = &obj->obj; + + /* maps */ + s->map_cnt = 1; + s->map_skel_sz = sizeof(*s->maps); + s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz); + if (!s->maps) + goto err; + + s->maps[0].name = "iterator.rodata"; + s->maps[0].map = &obj->maps.rodata; + s->maps[0].mmaped = (void **)&obj->rodata; + + /* programs */ + s->prog_cnt = 2; + s->prog_skel_sz = sizeof(*s->progs); + s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz); + if (!s->progs) + goto err; + + s->progs[0].name = "dump_bpf_map"; + s->progs[0].prog = &obj->progs.dump_bpf_map; + s->progs[0].link = &obj->links.dump_bpf_map; + + s->progs[1].name = "dump_bpf_prog"; + s->progs[1].prog = &obj->progs.dump_bpf_prog; + s->progs[1].link = &obj->links.dump_bpf_prog; + + s->data_sz = 7176; + s->data = (void *)"\ +\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ +\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\ +\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ +\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\ +\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\ +\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\ +\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\ +\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\ +\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\ +\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\ +\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\ +\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\ +\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\ +\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\ +\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\ +\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\ +\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\ +\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\ +\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\ +\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\ +\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ +\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\ +\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\ +\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\ +\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\ +\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\ +\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\ +\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\ +\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\ +\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\ +\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\ +\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\ +\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\ +\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\ +\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\ +\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\ +\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\ +\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\ +\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\ +\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\ +\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\ +\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ +\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\ +\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\ +\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ +\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\ +\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\ +\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\ +\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\ +\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ +\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\ +\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\ +\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\ +\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ +\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\ +\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\ +\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\ +\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ +\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ +\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\ +\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ +\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\ +\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ +\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\ +\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\ +\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ +\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\ +\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\ +\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\ +\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\ +\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\ +\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\ +\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\ +\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\ +\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\ +\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\ +\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\ +\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\ +\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\ +\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\ +\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ +\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\ +\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\ +\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ +\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ +\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ +\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ +\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ +\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ +\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ +\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ +\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ +\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ +\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ +\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ +\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ +\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ +\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ +\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ +\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ +\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ +\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ +\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ +\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ +\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ +\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ +\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ +\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ +\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ +\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ +\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ +\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ +\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ +\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\ +\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\ +\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\ +\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\ +\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\ +\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\ +\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\ +\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\ +\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\ +\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\ +\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\ +\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\ +\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\ +\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\ +\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\ +\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\ +\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\ +\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\ +\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\ +\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\ +\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\ +\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\ +\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\ +\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\ +\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\ +\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\ +\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\ +\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\ +\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\ +\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\ +\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\ +\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\ +\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\ +\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\ +\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\ +\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\ +\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\ +\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\ +\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\ +\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\ +\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\ +\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\ +\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\ +\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\ +\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\ +\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\ +\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\ +\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\ +\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\ +\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\ +\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\ +\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\ +\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\ +\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\ +\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\ +\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\ +\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\ +\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\ +\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\ +\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\ +\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\ +\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\ +\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\ +\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ +\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\ +\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\ +\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\ +\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\ +\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\ +\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\ +\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\ +\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\ +\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\ +\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\ +\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\ +\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\ +\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\ +\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\ +\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ +\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\ +\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\ +\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + + return 0; +err: + bpf_object__destroy_skeleton(s); + return -1; +} + +#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 44184f82916a..0ee2347ba510 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -257,6 +257,7 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, static int queue_map_btf_id; const struct bpf_map_ops queue_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, @@ -273,6 +274,7 @@ const struct bpf_map_ops queue_map_ops = { static int stack_map_btf_id; const struct bpf_map_ops stack_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 90b29c5b1da7..a55cd542f2ce 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -191,7 +191,7 @@ int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, rcu_read_lock(); sk = reuseport_array_lookup_elem(map, key); if (sk) { - *(u64 *)value = sock_gen_cookie(sk); + *(u64 *)value = __sock_gen_cookie(sk); err = 0; } else { err = -ENOENT; @@ -351,6 +351,7 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key, static int reuseport_array_map_btf_id; const struct bpf_map_ops reuseport_array_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = reuseport_array_alloc_check, .map_alloc = reuseport_array_alloc, .map_free = reuseport_array_free, diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 002f8a5c9e51..31cb04a4dd2d 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -287,6 +287,7 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp, static int ringbuf_map_btf_id; const struct bpf_map_ops ringbuf_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index cfed0ac44d38..06065fa27124 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -665,18 +665,17 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, return __bpf_get_stack(regs, task, NULL, buf, size, flags); } -BTF_ID_LIST(bpf_get_task_stack_btf_ids) -BTF_ID(struct, task_struct) +BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct) const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_get_task_stack_btf_ids[0], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, - .btf_id = bpf_get_task_stack_btf_ids, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, @@ -839,6 +838,7 @@ static void stack_map_free(struct bpf_map *map) static int stack_trace_map_btf_id; const struct bpf_map_ops stack_trace_map_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b999e7ff2583..8f50c9c19f1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4,6 +4,7 @@ #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> +#include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> @@ -29,6 +30,7 @@ #include <linux/bpf_lsm.h> #include <linux/poll.h> #include <linux/bpf-netns.h> +#include <linux/rcupdate_trace.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -90,6 +92,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, } const struct bpf_map_ops bpf_map_offload_ops = { + .map_meta_equal = bpf_map_meta_equal, .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, @@ -157,10 +160,11 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, if (bpf_map_is_dev_bound(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH || - map->map_type == BPF_MAP_TYPE_SOCKMAP || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { + return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { return bpf_fd_array_map_update_elem(map, f.file, key, value, flags); @@ -768,7 +772,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE) + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { @@ -1728,10 +1733,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); - if (deferred) - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); - else + if (deferred) { + if (prog->aux->sleepable) + call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); + else + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } else { __bpf_prog_put_rcu(&prog->aux->rcu); + } } static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) @@ -2101,6 +2110,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | + BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32)) return -EINVAL; @@ -2145,17 +2155,18 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; if (attr->attach_prog_fd) { - struct bpf_prog *tgt_prog; + struct bpf_prog *dst_prog; - tgt_prog = bpf_prog_get(attr->attach_prog_fd); - if (IS_ERR(tgt_prog)) { - err = PTR_ERR(tgt_prog); + dst_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(dst_prog)) { + err = PTR_ERR(dst_prog); goto free_prog_nouncharge; } - prog->aux->linked_prog = tgt_prog; + prog->aux->dst_prog = dst_prog; } prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; err = security_bpf_prog_alloc(prog->aux); if (err) @@ -2488,11 +2499,23 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) struct bpf_tracing_link { struct bpf_link link; enum bpf_attach_type attach_type; + struct bpf_trampoline *trampoline; + struct bpf_prog *tgt_prog; }; static void bpf_tracing_link_release(struct bpf_link *link) { - WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog)); + struct bpf_tracing_link *tr_link = + container_of(link, struct bpf_tracing_link, link); + + WARN_ON_ONCE(bpf_trampoline_unlink_prog(link->prog, + tr_link->trampoline)); + + bpf_trampoline_put(tr_link->trampoline); + + /* tgt_prog is NULL if target is a kernel function */ + if (tr_link->tgt_prog) + bpf_prog_put(tr_link->tgt_prog); } static void bpf_tracing_link_dealloc(struct bpf_link *link) @@ -2532,10 +2555,15 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { .fill_link_info = bpf_tracing_link_fill_link_info, }; -static int bpf_tracing_prog_attach(struct bpf_prog *prog) +static int bpf_tracing_prog_attach(struct bpf_prog *prog, + int tgt_prog_fd, + u32 btf_id) { struct bpf_link_primer link_primer; + struct bpf_prog *tgt_prog = NULL; + struct bpf_trampoline *tr = NULL; struct bpf_tracing_link *link; + u64 key = 0; int err; switch (prog->type) { @@ -2564,6 +2592,28 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) goto out_put_prog; } + if (!!tgt_prog_fd != !!btf_id) { + err = -EINVAL; + goto out_put_prog; + } + + if (tgt_prog_fd) { + /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + if (prog->type != BPF_PROG_TYPE_EXT) { + err = -EINVAL; + goto out_put_prog; + } + + tgt_prog = bpf_prog_get(tgt_prog_fd); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + tgt_prog = NULL; + goto out_put_prog; + } + + key = bpf_trampoline_compute_key(tgt_prog, btf_id); + } + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; @@ -2573,20 +2623,100 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; - err = bpf_link_prime(&link->link, &link_primer); - if (err) { - kfree(link); - goto out_put_prog; + mutex_lock(&prog->aux->dst_mutex); + + /* There are a few possible cases here: + * + * - if prog->aux->dst_trampoline is set, the program was just loaded + * and not yet attached to anything, so we can use the values stored + * in prog->aux + * + * - if prog->aux->dst_trampoline is NULL, the program has already been + * attached to a target and its initial target was cleared (below) + * + * - if tgt_prog != NULL, the caller specified tgt_prog_fd + + * target_btf_id using the link_create API. + * + * - if tgt_prog == NULL when this function was called using the old + * raw_tracepoint_open API, and we need a target from prog->aux + * + * The combination of no saved target in prog->aux, and no target + * specified on load is illegal, and we reject that here. + */ + if (!prog->aux->dst_trampoline && !tgt_prog) { + err = -ENOENT; + goto out_unlock; + } + + if (!prog->aux->dst_trampoline || + (key && key != prog->aux->dst_trampoline->key)) { + /* If there is no saved target, or the specified target is + * different from the destination specified at load time, we + * need a new trampoline and a check for compatibility + */ + struct bpf_attach_target_info tgt_info = {}; + + err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, + &tgt_info); + if (err) + goto out_unlock; + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto out_unlock; + } + } else { + /* The caller didn't specify a target, or the target was the + * same as the destination supplied during program load. This + * means we can reuse the trampoline and reference from program + * load time, and there is no need to allocate a new one. This + * can only happen once for any program, as the saved values in + * prog->aux are cleared below. + */ + tr = prog->aux->dst_trampoline; + tgt_prog = prog->aux->dst_prog; } - err = bpf_trampoline_link_prog(prog); + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto out_unlock; + + err = bpf_trampoline_link_prog(prog, tr); if (err) { bpf_link_cleanup(&link_primer); - goto out_put_prog; + link = NULL; + goto out_unlock; } + link->tgt_prog = tgt_prog; + link->trampoline = tr; + + /* Always clear the trampoline and target prog from prog->aux to make + * sure the original attach destination is not kept alive after a + * program is (re-)attached to another target. + */ + if (prog->aux->dst_prog && + (tgt_prog_fd || tr != prog->aux->dst_trampoline)) + /* got extra prog ref from syscall, or attaching to different prog */ + bpf_prog_put(prog->aux->dst_prog); + if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) + /* we allocated a new trampoline, so free the old one */ + bpf_trampoline_put(prog->aux->dst_trampoline); + + prog->aux->dst_prog = NULL; + prog->aux->dst_trampoline = NULL; + mutex_unlock(&prog->aux->dst_mutex); + return bpf_link_settle(&link_primer); +out_unlock: + if (tr && tr != prog->aux->dst_trampoline) + bpf_trampoline_put(tr); + mutex_unlock(&prog->aux->dst_mutex); + kfree(link); out_put_prog: + if (tgt_prog_fd && tgt_prog) + bpf_prog_put(tgt_prog); bpf_prog_put(prog); return err; } @@ -2700,7 +2830,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog); + return bpf_tracing_prog_attach(prog, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, @@ -2783,7 +2913,6 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return BPF_PROG_TYPE_CGROUP_SKB; - break; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: @@ -2969,7 +3098,7 @@ static int bpf_prog_query(const union bpf_attr *attr, } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out +#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -3152,21 +3281,25 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, const struct bpf_map *map; int i; + mutex_lock(&prog->aux->used_maps_mutex); for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { map = prog->aux->used_maps[i]; if (map == (void *)addr) { *type = BPF_PSEUDO_MAP_FD; - return map; + goto out; } if (!map->ops->map_direct_value_meta) continue; if (!map->ops->map_direct_value_meta(map, addr, off)) { *type = BPF_PSEUDO_MAP_VALUE; - return map; + goto out; } } + map = NULL; - return NULL; +out: + mutex_unlock(&prog->aux->used_maps_mutex); + return map; } static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, @@ -3284,6 +3417,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + mutex_lock(&prog->aux->used_maps_mutex); ulen = info.nr_map_ids; info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); @@ -3293,9 +3427,12 @@ static int bpf_prog_get_info_by_fd(struct file *file, for (i = 0; i < ulen; i++) if (put_user(prog->aux->used_maps[i]->id, - &user_map_ids[i])) + &user_map_ids[i])) { + mutex_unlock(&prog->aux->used_maps_mutex); return -EFAULT; + } } + mutex_unlock(&prog->aux->used_maps_mutex); err = set_info_rec_size(&info); if (err) @@ -3876,10 +4013,15 @@ err_put: static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { - if (attr->link_create.attach_type == BPF_TRACE_ITER && - prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) + return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + else if (prog->type == BPF_PROG_TYPE_EXT) + return bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id); return -EINVAL; } @@ -3893,18 +4035,25 @@ static int link_create(union bpf_attr *attr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC) - return -EINVAL; - - prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_attach_check_attach_type(prog, attr->link_create.attach_type); if (ret) - goto err_out; + goto out; + + if (prog->type == BPF_PROG_TYPE_EXT) { + ret = tracing_bpf_link_attach(attr, prog); + goto out; + } + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } switch (ptype) { case BPF_PROG_TYPE_CGROUP_SKB: @@ -3932,7 +4081,7 @@ static int link_create(union bpf_attr *attr) ret = -EINVAL; } -err_out: +out: if (ret < 0) bpf_prog_put(prog); return ret; @@ -4014,40 +4163,50 @@ static int link_detach(union bpf_attr *attr) return ret; } -static int bpf_link_inc_not_zero(struct bpf_link *link) +static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { - return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; + return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); } -#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id - -static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +struct bpf_link *bpf_link_by_id(u32 id) { struct bpf_link *link; - u32 id = attr->link_id; - int fd, err; - - if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!id) + return ERR_PTR(-ENOENT); spin_lock_bh(&link_idr_lock); - link = idr_find(&link_idr, id); /* before link is "settled", ID is 0, pretend it doesn't exist yet */ + link = idr_find(&link_idr, id); if (link) { if (link->id) - err = bpf_link_inc_not_zero(link); + link = bpf_link_inc_not_zero(link); else - err = -EAGAIN; + link = ERR_PTR(-EAGAIN); } else { - err = -ENOENT; + link = ERR_PTR(-ENOENT); } spin_unlock_bh(&link_idr_lock); + return link; +} - if (err) - return err; +#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id + +static int bpf_link_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_link *link; + u32 id = attr->link_id; + int fd; + + if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + link = bpf_link_by_id(id); + if (IS_ERR(link)) + return PTR_ERR(link); fd = bpf_link_new_fd(link); if (fd < 0) @@ -4133,6 +4292,68 @@ static int bpf_iter_create(union bpf_attr *attr) return err; } +#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags + +static int bpf_prog_bind_map(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + struct bpf_map **used_maps_old, **used_maps_new; + int i, ret = 0; + + if (CHECK_ATTR(BPF_PROG_BIND_MAP)) + return -EINVAL; + + if (attr->prog_bind_map.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_bind_map.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + map = bpf_map_get(attr->prog_bind_map.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto out_prog_put; + } + + mutex_lock(&prog->aux->used_maps_mutex); + + used_maps_old = prog->aux->used_maps; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (used_maps_old[i] == map) { + bpf_map_put(map); + goto out_unlock; + } + + used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, + sizeof(used_maps_new[0]), + GFP_KERNEL); + if (!used_maps_new) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(used_maps_new, used_maps_old, + sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); + used_maps_new[prog->aux->used_map_cnt] = map; + + prog->aux->used_map_cnt++; + prog->aux->used_maps = used_maps_new; + + kfree(used_maps_old); + +out_unlock: + mutex_unlock(&prog->aux->used_maps_mutex); + + if (ret) + bpf_map_put(map); +out_prog_put: + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4266,6 +4487,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_DETACH: err = link_detach(&attr); break; + case BPF_PROG_BIND_MAP: + err = bpf_prog_bind_map(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 99af4cea1102..5b6af30bfbcd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -22,7 +22,8 @@ struct bpf_iter_seq_task_info { }; static struct task_struct *task_seq_get_next(struct pid_namespace *ns, - u32 *tid) + u32 *tid, + bool skip_if_dup_files) { struct task_struct *task = NULL; struct pid *pid; @@ -36,6 +37,12 @@ retry: if (!task) { ++*tid; goto retry; + } else if (skip_if_dup_files && task->tgid != task->pid && + task->files == task->group_leader->files) { + put_task_struct(task); + task = NULL; + ++*tid; + goto retry; } } rcu_read_unlock(); @@ -48,7 +55,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos) struct bpf_iter_seq_task_info *info = seq->private; struct task_struct *task; - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -65,7 +72,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; ++info->tid; put_task_struct((struct task_struct *)v); - task = task_seq_get_next(info->common.ns, &info->tid); + task = task_seq_get_next(info->common.ns, &info->tid, false); if (!task) return NULL; @@ -148,7 +155,7 @@ again: curr_files = *fstruct; curr_fd = info->fd; } else { - curr_task = task_seq_get_next(ns, &curr_tid); + curr_task = task_seq_get_next(ns, &curr_tid, true); if (!curr_task) return NULL; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9be85aa4ec5f..35c5887d82ff 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -7,6 +7,8 @@ #include <linux/rbtree_latch.h> #include <linux/perf_event.h> #include <linux/btf.h> +#include <linux/rcupdate_trace.h> +#include <linux/rcupdate_wait.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -63,7 +65,7 @@ static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) bpf_image_ksym_add(tr->image, ksym); } -struct bpf_trampoline *bpf_trampoline_lookup(u64 key) +static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; @@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * updates to trampoline would change the code from underneath the * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. + * The same trampoline can hold both sleepable and non-sleepable progs. + * synchronize_rcu_tasks_trace() is needed to make sure all sleepable + * programs finish executing. + * Wait for these two grace periods together. */ - - synchronize_rcu_tasks(); + synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, tprogs, @@ -256,14 +261,12 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) } } -int bpf_trampoline_link_prog(struct bpf_prog *prog) +int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; - struct bpf_trampoline *tr; int err = 0; int cnt; - tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (tr->extension_prog) { @@ -296,7 +299,7 @@ int bpf_trampoline_link_prog(struct bpf_prog *prog) } hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; - err = bpf_trampoline_update(prog->aux->trampoline); + err = bpf_trampoline_update(tr); if (err) { hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; @@ -307,13 +310,11 @@ out: } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; - struct bpf_trampoline *tr; int err; - tr = prog->aux->trampoline; kind = bpf_attach_type_to_tramp(prog); mutex_lock(&tr->mutex); if (kind == BPF_TRAMP_REPLACE) { @@ -325,12 +326,32 @@ int bpf_trampoline_unlink_prog(struct bpf_prog *prog) } hlist_del(&prog->aux->tramp_hlist); tr->progs_cnt[kind]--; - err = bpf_trampoline_update(prog->aux->trampoline); + err = bpf_trampoline_update(tr); out: mutex_unlock(&tr->mutex); return err; } +struct bpf_trampoline *bpf_trampoline_get(u64 key, + struct bpf_attach_target_info *tgt_info) +{ + struct bpf_trampoline *tr; + + tr = bpf_trampoline_lookup(key); + if (!tr) + return NULL; + + mutex_lock(&tr->mutex); + if (tr->func.addr) + goto out; + + memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); + tr->func.addr = (void *)tgt_info->tgt_addr; +out: + mutex_unlock(&tr->mutex); + return tr; +} + void bpf_trampoline_put(struct bpf_trampoline *tr) { if (!tr) @@ -344,7 +365,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; bpf_image_ksym_del(&tr->ksym); - /* wait for tasks to get out of trampoline before freeing it */ + /* This code will be executed when all bpf progs (both sleepable and + * non-sleepable) went through + * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). + * Hence no need for another synchronize_rcu_tasks_trace() here, + * but synchronize_rcu_tasks() is still needed, since trampoline + * may not have had any sleepable programs and we need to wait + * for tasks to get out of trampoline code before freeing it. + */ synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); @@ -394,6 +422,17 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) rcu_read_unlock(); } +void notrace __bpf_prog_enter_sleepable(void) +{ + rcu_read_lock_trace(); + might_fault(); +} + +void notrace __bpf_prog_exit_sleepable(void) +{ + rcu_read_unlock_trace(); +} + int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fba52d9ec8fc..6200519582a6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,7 @@ #include <linux/ctype.h> #include <linux/error-injection.h> #include <linux/bpf_lsm.h> +#include <linux/btf_ids.h> #include "disasm.h" @@ -238,6 +239,7 @@ struct bpf_call_arg_meta { int ref_obj_id; int func_id; u32 btf_id; + u32 ret_btf_id; }; struct btf *btf_vmlinux; @@ -435,6 +437,15 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } +static bool arg_type_may_be_null(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_CTX_OR_NULL || + type == ARG_PTR_TO_SOCKET_OR_NULL || + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; +} + /* Determine whether the function releases some resources allocated by another * function call. The first reference type argument will be assumed to be * released by release_reference(). @@ -477,7 +488,12 @@ static bool is_acquire_function(enum bpf_func_id func_id, static bool is_ptr_cast_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_tcp_sock || - func_id == BPF_FUNC_sk_fullsock; + func_id == BPF_FUNC_sk_fullsock || + func_id == BPF_FUNC_skc_to_tcp_sock || + func_id == BPF_FUNC_skc_to_tcp6_sock || + func_id == BPF_FUNC_skc_to_udp6_sock || + func_id == BPF_FUNC_skc_to_tcp_timewait_sock || + func_id == BPF_FUNC_skc_to_tcp_request_sock; } /* string representation of 'enum bpf_reg_type' */ @@ -503,6 +519,7 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", [PTR_TO_RDONLY_BUF] = "rdonly_buf", @@ -569,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) + if (t == PTR_TO_BTF_ID || + t == PTR_TO_BTF_ID_OR_NULL || + t == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -991,14 +1010,9 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void __mark_reg_not_init(const struct bpf_verifier_env *env, struct bpf_reg_state *reg); -/* Mark the unknown part of a register (variable offset or scalar value) as - * known to have the value @imm. - */ -static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +/* This helper doesn't clear reg->id */ +static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { - /* Clear id, off, and union(map_ptr, range) */ - memset(((u8 *)reg) + sizeof(reg->type), 0, - offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->var_off = tnum_const(imm); reg->smin_value = (s64)imm; reg->smax_value = (s64)imm; @@ -1011,6 +1025,17 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) reg->u32_max_value = (u32)imm; } +/* Mark the unknown part of a register (variable offset or scalar value) as + * known to have the value @imm. + */ +static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +{ + /* Clear id, off, and union(map_ptr, range) */ + memset(((u8 *)reg) + sizeof(reg->type), 0, + offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); + ___mark_reg_known(reg, imm); +} + static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); @@ -1489,6 +1514,13 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; + if (code == (BPF_JMP | BPF_CALL) && + insn[i].imm == BPF_FUNC_tail_call && + insn[i].src_reg != BPF_PSEUDO_CALL) + subprog[cur_subprog].has_tail_call = true; + if (BPF_CLASS(code) == BPF_LD && + (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND)) + subprog[cur_subprog].has_ld_abs = true; if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) @@ -2183,6 +2215,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_PERCPU_BTF_ID: return true; default: return false; @@ -2200,6 +2233,20 @@ static bool register_is_const(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); } +static bool __is_scalar_unbounded(struct bpf_reg_state *reg) +{ + return tnum_is_unknown(reg->var_off) && + reg->smin_value == S64_MIN && reg->smax_value == S64_MAX && + reg->umin_value == 0 && reg->umax_value == U64_MAX && + reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX && + reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX; +} + +static bool register_is_bounded(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg); +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -2251,7 +2298,7 @@ static int check_stack_write(struct bpf_verifier_env *env, if (value_regno >= 0) reg = &cur->regs[value_regno]; - if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit @@ -2625,11 +2672,18 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, #define MAX_PACKET_OFF 0xffff +static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +{ + return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; +} + static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) { - switch (env->prog->type) { + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + + switch (prog_type) { /* Program types only with direct read access go here! */ case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: @@ -2639,7 +2693,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_CGROUP_SKB: if (t == BPF_WRITE) return false; - /* fallthrough */ + fallthrough; /* Program types with direct read + write access go here! */ case BPF_PROG_TYPE_SCHED_CLS: @@ -2970,10 +3024,37 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; + bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; + int j; process_func: + /* protect against potential stack overflow that might happen when + * bpf2bpf calls get combined with tailcalls. Limit the caller's stack + * depth for such case down to 256 so that the worst case scenario + * would result in 8k stack size (32 which is tailcall limit * 256 = + * 8k). + * + * To get the idea what might happen, see an example: + * func1 -> sub rsp, 128 + * subfunc1 -> sub rsp, 256 + * tailcall1 -> add rsp, 256 + * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) + * subfunc2 -> sub rsp, 64 + * subfunc22 -> sub rsp, 128 + * tailcall2 -> add rsp, 128 + * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) + * + * tailcall will unwind the current stack frame but it will not get rid + * of caller's stack as shown on the example above. + */ + if (idx && subprog[idx].has_tail_call && depth >= 256) { + verbose(env, + "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n", + depth); + return -EACCES; + } /* round up to 32-bytes, since this is granularity * of interpreter stack size */ @@ -3002,6 +3083,10 @@ continue_func: i); return -EFAULT; } + + if (subprog[idx].has_tail_call) + tail_call_reachable = true; + frame++; if (frame >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep !\n", @@ -3010,6 +3095,15 @@ continue_func: } goto process_func; } + /* if tail call got detected across bpf2bpf calls then mark each of the + * currently present subprog frames as tail call reachable subprogs; + * this info will be utilized by JIT so that we will be preserving the + * tail call counter throughout bpf2bpf calls combined with tailcalls + */ + if (tail_call_reachable) + for (j = 0; j < frame; j++) + subprog[ret_prog[j]].tail_call_reachable = true; + /* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT */ @@ -3585,18 +3679,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; - if (reg->type != PTR_TO_STACK) { - /* Allow zero-byte read from NULL, regardless of pointer type */ - if (zero_size_allowed && access_size == 0 && - register_is_null(reg)) - return 0; - - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); - return -EACCES; - } - if (tnum_is_const(reg->var_off)) { min_off = max_off = reg->var_off.value + reg->off; err = __check_stack_boundary(env, regno, min_off, access_size, @@ -3741,9 +3823,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, access_size, zero_size_allowed, "rdwr", &env->prog->aux->max_rdwr_access); - default: /* scalar_value|ptr_to_stack or invalid ptr */ + case PTR_TO_STACK: return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); + default: /* scalar_value or invalid ptr */ + /* Allow zero-byte read from NULL, regardless of pointer type */ + if (zero_size_allowed && access_size == 0 && + register_is_null(reg)) + return 0; + + verbose(env, "R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], + reg_type_str[PTR_TO_STACK]); + return -EACCES; } } @@ -3775,10 +3867,6 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; - if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "R%d is not a pointer to map_value\n", regno); - return -EINVAL; - } if (!is_const) { verbose(env, "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", @@ -3845,12 +3933,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } -static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type) -{ - return type == ARG_PTR_TO_ALLOC_MEM || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; -} - static bool arg_type_is_alloc_size(enum bpf_arg_type type) { return type == ARG_CONST_ALLOC_SIZE_OR_ZERO; @@ -3872,14 +3954,194 @@ static int int_ptr_type_to_size(enum bpf_arg_type type) return -EINVAL; } +static int resolve_map_arg_type(struct bpf_verifier_env *env, + const struct bpf_call_arg_meta *meta, + enum bpf_arg_type *arg_type) +{ + if (!meta->map_ptr) { + /* kernel subsystem misconfigured verifier */ + verbose(env, "invalid map_ptr to access map->type\n"); + return -EACCES; + } + + switch (meta->map_ptr->map_type) { + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_SOCKHASH: + if (*arg_type == ARG_PTR_TO_MAP_VALUE) { + *arg_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON; + } else { + verbose(env, "invalid arg_type for sockmap/sockhash\n"); + return -EINVAL; + } + break; + + default: + break; + } + return 0; +} + +struct bpf_reg_types { + const enum bpf_reg_type types[10]; + u32 *btf_id; +}; + +static const struct bpf_reg_types map_key_value_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types sock_types = { + .types = { + PTR_TO_SOCK_COMMON, + PTR_TO_SOCKET, + PTR_TO_TCP_SOCK, + PTR_TO_XDP_SOCK, + }, +}; + +#ifdef CONFIG_NET +static const struct bpf_reg_types btf_id_sock_common_types = { + .types = { + PTR_TO_SOCK_COMMON, + PTR_TO_SOCKET, + PTR_TO_TCP_SOCK, + PTR_TO_XDP_SOCK, + PTR_TO_BTF_ID, + }, + .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], +}; +#endif + +static const struct bpf_reg_types mem_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + PTR_TO_MEM, + PTR_TO_RDONLY_BUF, + PTR_TO_RDWR_BUF, + }, +}; + +static const struct bpf_reg_types int_ptr_types = { + .types = { + PTR_TO_STACK, + PTR_TO_PACKET, + PTR_TO_PACKET_META, + PTR_TO_MAP_VALUE, + }, +}; + +static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; +static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; +static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; +static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; +static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; + +static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { + [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, + [ARG_CONST_SIZE] = &scalar_types, + [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, + [ARG_CONST_MAP_PTR] = &const_map_ptr_types, + [ARG_PTR_TO_CTX] = &context_types, + [ARG_PTR_TO_CTX_OR_NULL] = &context_types, + [ARG_PTR_TO_SOCK_COMMON] = &sock_types, +#ifdef CONFIG_NET + [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, +#endif + [ARG_PTR_TO_SOCKET] = &fullsock_types, + [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, + [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, + [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, + [ARG_PTR_TO_MEM] = &mem_types, + [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, + [ARG_PTR_TO_UNINIT_MEM] = &mem_types, + [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, + [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, + [ARG_PTR_TO_INT] = &int_ptr_types, + [ARG_PTR_TO_LONG] = &int_ptr_types, + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, +}; + +static int check_reg_type(struct bpf_verifier_env *env, u32 regno, + enum bpf_arg_type arg_type, + const u32 *arg_btf_id) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + enum bpf_reg_type expected, type = reg->type; + const struct bpf_reg_types *compatible; + int i, j; + + compatible = compatible_reg_types[arg_type]; + if (!compatible) { + verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + return -EFAULT; + } + + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { + expected = compatible->types[i]; + if (expected == NOT_INIT) + break; + + if (type == expected) + goto found; + } + + verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + for (j = 0; j + 1 < i; j++) + verbose(env, "%s, ", reg_type_str[compatible->types[j]]); + verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + return -EACCES; + +found: + if (type == PTR_TO_BTF_ID) { + if (!arg_btf_id) { + if (!compatible->btf_id) { + verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); + return -EFAULT; + } + arg_btf_id = compatible->btf_id; + } + + if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, + *arg_btf_id)) { + verbose(env, "R%d is of type %s but %s is expected\n", + regno, kernel_type_name(reg->btf_id), + kernel_type_name(*arg_btf_id)); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", + regno); + return -EACCES; + } + } + + return 0; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) { u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - enum bpf_reg_type expected_type, type = reg->type; enum bpf_arg_type arg_type = fn->arg_type[arg]; + enum bpf_reg_type type = reg->type; int err = 0; if (arg_type == ARG_DONTCARE) @@ -3904,120 +4166,32 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE || + if (arg_type == ARG_PTR_TO_MAP_VALUE || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { - expected_type = PTR_TO_STACK; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO || - arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) { - expected_type = SCALAR_VALUE; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_CONST_MAP_PTR) { - expected_type = CONST_PTR_TO_MAP; - if (type != expected_type) - goto err_type; - } else if (arg_type == ARG_PTR_TO_CTX || - arg_type == ARG_PTR_TO_CTX_OR_NULL) { - expected_type = PTR_TO_CTX; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_CTX_OR_NULL)) { - if (type != expected_type) - goto err_type; - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; - } - } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { - expected_type = PTR_TO_SOCK_COMMON; - /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ - if (!type_is_sk_pointer(type)) - goto err_type; - if (reg->ref_obj_id) { - if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; - } - } else if (arg_type == ARG_PTR_TO_SOCKET || - arg_type == ARG_PTR_TO_SOCKET_OR_NULL) { - expected_type = PTR_TO_SOCKET; - if (!(register_is_null(reg) && - arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) { - if (type != expected_type) - goto err_type; - } - } else if (arg_type == ARG_PTR_TO_BTF_ID) { - expected_type = PTR_TO_BTF_ID; - if (type != expected_type) - goto err_type; - if (!fn->check_btf_id) { - if (reg->btf_id != meta->btf_id) { - verbose(env, "Helper has type %s got %s in R%d\n", - kernel_type_name(meta->btf_id), - kernel_type_name(reg->btf_id), regno); - - return -EACCES; - } - } else if (!fn->check_btf_id(reg->btf_id, arg)) { - verbose(env, "Helper does not support %s in R%d\n", - kernel_type_name(reg->btf_id), regno); + err = resolve_map_arg_type(env, meta, &arg_type); + if (err) + return err; + } - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value || reg->off) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } - } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { - if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; - } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; - } else { - verbose(env, "verifier internal error\n"); - return -EFAULT; - } - } else if (arg_type_is_mem_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - /* One exception here. In case function allows for NULL to be - * passed in as argument, it's a SCALAR_VALUE type. Final test - * happens during stack boundary checking. + if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + /* A NULL register has a SCALAR_VALUE type, so skip + * type checking. */ - if (register_is_null(reg) && - (arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)) - /* final test in check_stack_boundary() */; - else if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != PTR_TO_MEM && - type != PTR_TO_RDONLY_BUF && - type != PTR_TO_RDWR_BUF && - type != expected_type) - goto err_type; - meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; - } else if (arg_type_is_alloc_mem_ptr(arg_type)) { - expected_type = PTR_TO_MEM; - if (register_is_null(reg) && - arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL) - /* final test in check_stack_boundary() */; - else if (type != expected_type) - goto err_type; + goto skip_type_check; + + err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg]); + if (err) + return err; + + if (type == PTR_TO_CTX) { + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + } + +skip_type_check: + if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", regno, reg->ref_obj_id, @@ -4025,15 +4199,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; - } else if (arg_type_is_int_ptr(arg_type)) { - expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && - type != PTR_TO_MAP_VALUE && - type != expected_type) - goto err_type; - } else { - verbose(env, "unsupported arg_type %d\n", arg_type); - return -EFAULT; } if (arg_type == ARG_CONST_MAP_PTR) { @@ -4072,6 +4237,28 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + if (!reg->btf_id) { + verbose(env, "Helper has invalid btf_id in R%d\n", regno); + return -EACCES; + } + meta->ret_btf_id = reg->btf_id; + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); + return -EFAULT; + } + } else if (arg_type_is_mem_ptr(arg_type)) { + /* The access to this pointer is only checked when we hit the + * next is_mem_size argument below. + */ + meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MEM); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); @@ -4137,10 +4324,43 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, } return err; -err_type: - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[type], reg_type_str[expected_type]); - return -EACCES; +} + +static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) +{ + enum bpf_attach_type eatype = env->prog->expected_attach_type; + enum bpf_prog_type type = resolve_prog_type(env->prog); + + if (func_id != BPF_FUNC_map_update_elem) + return false; + + /* It's not possible to get access to a locked struct sock in these + * contexts, so updating is safe. + */ + switch (type) { + case BPF_PROG_TYPE_TRACING: + if (eatype == BPF_TRACE_ITER) + return true; + break; + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_SK_LOOKUP: + return true; + default: + break; + } + + verbose(env, "cannot update sockmap in this context\n"); + return false; +} + +static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) +{ + return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64); } static int check_map_func_compatibility(struct bpf_verifier_env *env, @@ -4214,7 +4434,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && func_id != BPF_FUNC_sk_select_reuseport && - func_id != BPF_FUNC_map_lookup_elem) + func_id != BPF_FUNC_map_lookup_elem && + !may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -4223,7 +4444,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && func_id != BPF_FUNC_sk_select_reuseport && - func_id != BPF_FUNC_map_lookup_elem) + func_id != BPF_FUNC_map_lookup_elem && + !may_update_sockmap(env, func_id)) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4242,6 +4464,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sk_storage_delete) goto error; break; + case BPF_MAP_TYPE_INODE_STORAGE: + if (func_id != BPF_FUNC_inode_storage_get && + func_id != BPF_FUNC_inode_storage_delete) + goto error; + break; default: break; } @@ -4251,8 +4478,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt > 1) { - verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); return -EINVAL; } break; @@ -4315,6 +4542,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto error; break; + case BPF_FUNC_inode_storage_get: + case BPF_FUNC_inode_storage_delete: + if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + goto error; + break; default: break; } @@ -4402,10 +4634,26 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) return count <= 1; } +static bool check_btf_id_ok(const struct bpf_func_proto *fn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + if (fn->arg_type[i] == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) + return false; + + if (fn->arg_type[i] != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i]) + return false; + } + + return true; +} + static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && + check_btf_id_ok(fn) && check_refcount_ok(fn, func_id) ? 0 : -EINVAL; } @@ -4775,6 +5023,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (fn->allowed && !fn->allowed(env->prog)) { + verbose(env, "helper call is not allowed in probe\n"); + return -EINVAL; + } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { @@ -4796,11 +5049,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ for (i = 0; i < 5; i++) { - if (!fn->check_btf_id) { - err = btf_resolve_helper_id(&env->log, fn, i); - if (err > 0) - meta.btf_id = err; - } err = check_func_arg(env, i, &meta, fn); if (err) return err; @@ -4885,25 +5133,49 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].id = ++env->id_gen; } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + const struct btf_type *t; + + mark_reg_known_zero(env, regs, BPF_REG_0); + t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + if (!btf_type_is_struct(t)) { + u32 tsize; + const struct btf_type *ret; + const char *tname; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].mem_size = tsize; + } else { + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { int ret_btf_id; @@ -4922,6 +5194,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (reg_type_may_be_null(regs[BPF_REG_0].type)) + regs[BPF_REG_0].id = ++env->id_gen; + if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; @@ -5219,6 +5494,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst, reg_type_str[ptr_reg->type]); return -EACCES; case CONST_PTR_TO_MAP: + /* smin_val represents the known value */ + if (known && smin_val == 0 && opcode == BPF_ADD) + break; + fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: @@ -5634,8 +5913,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, u64 umax_val = src_reg->umax_value; if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value & - src_reg->var_off.value); + __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } @@ -5705,8 +5983,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, u64 umin_val = src_reg->umin_value; if (src_known && dst_known) { - __mark_reg_known(dst_reg, dst_reg->var_off.value | - src_reg->var_off.value); + __mark_reg_known(dst_reg, dst_reg->var_off.value); return; } @@ -5732,6 +6009,67 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, __update_reg_bounds(dst_reg); } +static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_subreg_is_const(src_reg->var_off); + bool dst_known = tnum_subreg_is_const(dst_reg->var_off); + struct tnum var32_off = tnum_subreg(dst_reg->var_off); + s32 smin_val = src_reg->s32_min_value; + + /* Assuming scalar64_min_max_xor will be called so it is safe + * to skip updating register for known case. + */ + if (src_known && dst_known) + return; + + /* We get both minimum and maximum from the var32_off. */ + dst_reg->u32_min_value = var32_off.value; + dst_reg->u32_max_value = var32_off.value | var32_off.mask; + + if (dst_reg->s32_min_value >= 0 && smin_val >= 0) { + /* XORing two positive sign numbers gives a positive, + * so safe to cast u32 result into s32. + */ + dst_reg->s32_min_value = dst_reg->u32_min_value; + dst_reg->s32_max_value = dst_reg->u32_max_value; + } else { + dst_reg->s32_min_value = S32_MIN; + dst_reg->s32_max_value = S32_MAX; + } +} + +static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg) +{ + bool src_known = tnum_is_const(src_reg->var_off); + bool dst_known = tnum_is_const(dst_reg->var_off); + s64 smin_val = src_reg->smin_value; + + if (src_known && dst_known) { + /* dst_reg->var_off.value has been updated earlier */ + __mark_reg_known(dst_reg, dst_reg->var_off.value); + return; + } + + /* We get both minimum and maximum from the var_off. */ + dst_reg->umin_value = dst_reg->var_off.value; + dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + + if (dst_reg->smin_value >= 0 && smin_val >= 0) { + /* XORing two positive sign numbers gives a positive, + * so safe to cast u64 result into s64. + */ + dst_reg->smin_value = dst_reg->umin_value; + dst_reg->smax_value = dst_reg->umax_value; + } else { + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; + } + + __update_reg_bounds(dst_reg); +} + static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { @@ -6040,6 +6378,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar32_min_max_or(dst_reg, &src_reg); scalar_min_max_or(dst_reg, &src_reg); break; + case BPF_XOR: + dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off); + scalar32_min_max_xor(dst_reg, &src_reg); + scalar_min_max_xor(dst_reg, &src_reg); + break; case BPF_LSH: if (umax_val >= insn_bitness) { /* Shifts greater than 31 or 63 are undefined. @@ -6111,6 +6454,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, src_reg = NULL; if (dst_reg->type != SCALAR_VALUE) ptr_reg = dst_reg; + else + /* Make sure ID is cleared otherwise dst_reg min/max could be + * incorrectly propagated into other registers by find_equal_scalars() + */ + dst_reg->id = 0; if (BPF_SRC(insn->code) == BPF_X) { src_reg = ®s[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { @@ -6244,6 +6592,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) /* case: R1 = R2 * copy register state to dest reg */ + if (src_reg->type == SCALAR_VALUE && !src_reg->id) + /* Assign src and dst registers the same ID + * that will be used by find_equal_scalars() + * to propagate min/max range. + */ + src_reg->id = ++env->id_gen; *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; @@ -6256,6 +6610,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; + /* Make sure ID is cleared otherwise + * dst_reg min/max could be incorrectly + * propagated into src_reg by find_equal_scalars() + */ + dst_reg->id = 0; dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { @@ -6646,14 +7005,18 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *reg = opcode == BPF_JEQ ? true_reg : false_reg; - /* For BPF_JEQ, if this is false we know nothing Jon Snow, but - * if it is true we know the value for sure. Likewise for - * BPF_JNE. + /* JEQ/JNE comparison doesn't change the register equivalence. + * r1 = r2; + * if (r1 == 42) goto label; + * ... + * label: // here both r1 and r2 are known to be 42. + * + * Hence when marking register as known preserve it's ID. */ if (is_jmp32) __mark_reg32_known(reg, val32); else - __mark_reg_known(reg, val); + ___mark_reg_known(reg, val); break; } case BPF_JSET: @@ -6847,7 +7210,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id) { + if (reg_type_may_be_null(reg->type) && reg->id == id && + !WARN_ON_ONCE(!reg->id)) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. @@ -7044,6 +7408,30 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, return true; } +static void find_equal_scalars(struct bpf_verifier_state *vstate, + struct bpf_reg_state *known_reg) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= vstate->curframe; i++) { + state = vstate->frame[i]; + for (j = 0; j < MAX_BPF_REG; j++) { + reg = &state->regs[j]; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + + bpf_for_each_spilled_reg(j, state, reg) { + if (!reg) + continue; + if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) + *reg = *known_reg; + } + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -7172,6 +7560,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], src_reg, dst_reg, opcode); + if (src_reg->id && + !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { + find_equal_scalars(this_branch, src_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]); + } + } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], @@ -7179,6 +7573,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, opcode, is_jmp32); } + if (dst_reg->type == SCALAR_VALUE && dst_reg->id && + !WARN_ON_ONCE(dst_reg->id != other_branch_regs[insn->dst_reg].id)) { + find_equal_scalars(this_branch, dst_reg); + find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. @@ -7210,6 +7610,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *dst_reg; struct bpf_map *map; int err; @@ -7226,25 +7627,45 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + dst_reg = ®s[insn->dst_reg]; if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = SCALAR_VALUE; + dst_reg->type = SCALAR_VALUE; __mark_reg_known(®s[insn->dst_reg], imm); return 0; } + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { + mark_reg_known_zero(env, regs, insn->dst_reg); + + dst_reg->type = aux->btf_var.reg_type; + switch (dst_reg->type) { + case PTR_TO_MEM: + dst_reg->mem_size = aux->btf_var.mem_size; + break; + case PTR_TO_BTF_ID: + case PTR_TO_PERCPU_BTF_ID: + dst_reg->btf_id = aux->btf_var.btf_id; + break; + default: + verbose(env, "bpf verifier is misconfigured\n"); + return -EFAULT; + } + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); - regs[insn->dst_reg].map_ptr = map; + dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].off = aux->map_off; + dst_reg->type = PTR_TO_MAP_VALUE; + dst_reg->off = aux->map_off; if (map_value_has_spin_lock(map)) - regs[insn->dst_reg].id = ++env->id_gen; + dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -7287,7 +7708,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) u8 mode = BPF_MODE(insn->code); int i, err; - if (!may_access_skb(env->prog->type)) { + if (!may_access_skb(resolve_prog_type(env->prog))) { verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } @@ -7297,18 +7718,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt > 1) { - /* when program has LD_ABS insn JITs and interpreter assume - * that r1 == ctx == skb which is not the case for callees - * that can have arbitrary arguments. It's problematic - * for main prog as well since JITs would need to analyze - * all functions in order to make proper register save/restore - * decisions in the main prog. Hence disallow LD_ABS with calls - */ - verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); - return -EINVAL; - } - if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -7375,11 +7784,12 @@ static int check_return_code(struct bpf_verifier_env *env) const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; /* LSM and struct_ops func-ptr's return type could be "void" */ - if ((env->prog->type == BPF_PROG_TYPE_STRUCT_OPS || - env->prog->type == BPF_PROG_TYPE_LSM) && + if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS || + prog_type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -7398,7 +7808,7 @@ static int check_return_code(struct bpf_verifier_env *env) return -EACCES; } - switch (env->prog->type) { + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || @@ -7718,6 +8128,23 @@ err_free: return ret; } +static int check_abnormal_return(struct bpf_verifier_env *env) +{ + int i; + + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + if (env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + } + return 0; +} + /* The minimum supported BTF func info size */ #define MIN_BPF_FUNCINFO_SIZE 8 #define MAX_FUNCINFO_REC_SIZE 252 @@ -7726,20 +8153,24 @@ static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, union bpf_attr __user *uattr) { + const struct btf_type *type, *func_proto, *ret_type; u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; struct bpf_func_info_aux *info_aux = NULL; - const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; void __user *urecord; u32 prev_offset = 0; + bool scalar_return; int ret = -ENOMEM; nfuncs = attr->func_info_cnt; - if (!nfuncs) + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } if (nfuncs != env->subprog_cnt) { verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); @@ -7787,25 +8218,23 @@ static int check_btf_func(struct bpf_verifier_env *env, } /* check insn_off */ + ret = -EINVAL; if (i == 0) { if (krecord[i].insn_off) { verbose(env, "nonzero insn_off %u for the first func info record", krecord[i].insn_off); - ret = -EINVAL; goto err_free; } } else if (krecord[i].insn_off <= prev_offset) { verbose(env, "same or smaller insn offset (%u) than previous func info record (%u)", krecord[i].insn_off, prev_offset); - ret = -EINVAL; goto err_free; } if (env->subprog_info[i].start != krecord[i].insn_off) { verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - ret = -EINVAL; goto err_free; } @@ -7814,10 +8243,26 @@ static int check_btf_func(struct bpf_verifier_env *env, if (!type || !btf_type_is_func(type)) { verbose(env, "invalid type id %d in func info", krecord[i].type_id); - ret = -EINVAL; goto err_free; } info_aux[i].linkage = BTF_INFO_VLEN(type->info); + + func_proto = btf_type_by_id(btf, type->type); + if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) + /* btf_func_check() already verified it during BTF load */ + goto err_free; + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); + scalar_return = + btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type); + if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); + goto err_free; + } + if (i && !scalar_return && env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); + goto err_free; + } + prev_offset = krecord[i].insn_off; urecord += urec_size; } @@ -7978,8 +8423,11 @@ static int check_btf_info(struct bpf_verifier_env *env, struct btf *btf; int err; - if (!attr->func_info_cnt && !attr->line_info_cnt) + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; return 0; + } btf = btf_get_by_fd(attr->prog_btf_fd); if (IS_ERR(btf)) @@ -9119,6 +9567,92 @@ process_bpf_exit: return 0; } +/* replace pseudo btf_id with kernel symbol address */ +static int check_pseudo_btf_id(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux) +{ + u32 datasec_id, type, id = insn->imm; + const struct btf_var_secinfo *vsi; + const struct btf_type *datasec; + const struct btf_type *t; + const char *sym_name; + bool percpu = false; + u64 addr; + int i; + + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + + if (insn[1].imm != 0) { + verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); + return -EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, id); + if (!t) { + verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); + return -ENOENT; + } + + if (!btf_type_is_var(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", + id); + return -EINVAL; + } + + sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + addr = kallsyms_lookup_name(sym_name); + if (!addr) { + verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", + sym_name); + return -ENOENT; + } + + datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", + BTF_KIND_DATASEC); + if (datasec_id > 0) { + datasec = btf_type_by_id(btf_vmlinux, datasec_id); + for_each_vsi(i, datasec, vsi) { + if (vsi->type == id) { + percpu = true; + break; + } + } + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + type = t->type; + t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + if (percpu) { + aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf_id = type; + } else if (!btf_type_is_struct(t)) { + const struct btf_type *ret; + const char *tname; + u32 tsize; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.mem_size = tsize; + } else { + aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf_id = type; + } + return 0; +} + static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && @@ -9154,6 +9688,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_prog *prog) { + enum bpf_prog_type prog_type = resolve_prog_type(prog); /* * Validate that trace type programs use preallocated hash maps. * @@ -9171,8 +9706,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, * now, but warnings are emitted so developers are made aware of * the unsafety and can fix their programs before this is enforced. */ - if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (is_tracing_prog_type(prog_type) && !is_preallocated_map(map)) { + if (prog_type == BPF_PROG_TYPE_PERF_EVENT) { verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } @@ -9184,8 +9719,8 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog->type) || - prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && + if ((is_tracing_prog_type(prog_type) || + prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && map_value_has_spin_lock(map)) { verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -9202,6 +9737,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (prog->aux->sleepable) + switch (map->map_type) { + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_ARRAY: + if (!is_preallocated_map(map)) { + verbose(env, + "Sleepable programs can only use preallocated hash maps\n"); + return -EINVAL; + } + break; + default: + verbose(env, + "Sleepable programs can only use array and hash maps\n"); + return -EINVAL; + } + return 0; } @@ -9211,10 +9763,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map) map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } -/* look for pseudo eBPF instructions that access map FDs and - * replace them with actual map pointers +/* find and rewrite pseudo imm in ld_imm64 instructions: + * + * 1. if it accesses map FD, replace it with actual map pointer. + * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. + * + * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ -static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) +static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -9255,6 +9811,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; + if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { + aux = &env->insn_aux_data[i]; + err = check_pseudo_btf_id(env, insn, aux); + if (err) + return err; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. */ @@ -9436,6 +10000,18 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } +static void adjust_poke_descs(struct bpf_prog *prog, u32 len) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + int i, sz = prog->aux->size_poke_tab; + struct bpf_jit_poke_descriptor *desc; + + for (i = 0; i < sz; i++) { + desc = &tab[i]; + desc->insn_idx += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -9452,6 +10028,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); + adjust_poke_descs(new_prog, len); return new_prog; } @@ -9897,7 +10474,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); env->prog->aux->num_exentries++; - } else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) { + } else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) { verbose(env, "Writes through BTF pointers are not allowed\n"); return -EINVAL; } @@ -9982,6 +10559,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_map *map_ptr; struct bpf_insn *insn; void *old_bpf_func; int err, num_exentries; @@ -10049,6 +10627,31 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + for (j = 0; j < prog->aux->size_poke_tab; j++) { + u32 insn_idx = prog->aux->poke_tab[j].insn_idx; + int ret; + + if (!(insn_idx >= subprog_start && + insn_idx <= subprog_end)) + continue; + + ret = bpf_jit_add_poke_descriptor(func[i], + &prog->aux->poke_tab[j]); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + goto out_free; + } + + func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; + + map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; + ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + goto out_free; + } + } + /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names */ @@ -10067,6 +10670,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) num_exentries++; } func[i]->aux->num_exentries = num_exentries; + func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -10074,6 +10678,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) } cond_resched(); } + + /* Untrack main program's aux structs so that during map_poke_run() + * we will not stumble upon the unfilled poke descriptors; each + * of the main program's poke descs got distributed across subprogs + * and got tracked onto map, so we are sure that none of them will + * be missed after the operation below + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -10142,9 +10759,16 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_free_unused_jited_linfo(prog); return 0; out_free: - for (i = 0; i < env->subprog_cnt; i++) - if (func[i]) - bpf_jit_free(func[i]); + for (i = 0; i < env->subprog_cnt; i++) { + if (!func[i]) + continue; + + for (j = 0; j < func[i]->aux->size_poke_tab; j++) { + map_ptr = func[i]->aux->poke_tab[j].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); + } + bpf_jit_free(func[i]); + } kfree(func); out_undo_insn: /* cleanup main prog to be interpreted */ @@ -10178,6 +10802,13 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { + /* When JIT fails the progs with bpf2bpf calls and tail_calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + return -EINVAL; + } for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) @@ -10341,8 +10972,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; - env->prog->aux->stack_depth = MAX_BPF_STACK; - env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; + if (!allow_tail_call_in_subprogs(env)) + prog->aux->stack_depth = MAX_BPF_STACK; + prog->aux->max_pkt_offset = MAX_PACKET_OFF; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal @@ -10362,6 +10994,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) .reason = BPF_POKE_REASON_TAIL_CALL, .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), .tail_call.key = bpf_map_key_immediate(aux), + .insn_idx = i + delta, }; ret = bpf_jit_add_poke_descriptor(prog, &desc); @@ -10427,7 +11060,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -10457,7 +11092,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); - +patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - @@ -10810,59 +11445,79 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) +static int check_attach_modify_return(unsigned long addr, const char *func_name) { if (within_error_injection_list(addr) || - !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, - sizeof(SECURITY_PREFIX) - 1)) + !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; return -EINVAL; } -static int check_attach_btf_id(struct bpf_verifier_env *env) +/* non exhaustive list of sleepable bpf_lsm_*() functions */ +BTF_SET_START(btf_sleepable_lsm_hooks) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_lsm_bprm_committed_creds) +#else +BTF_ID_UNUSED +#endif +BTF_SET_END(btf_sleepable_lsm_hooks) + +static int check_sleepable_lsm_hook(u32 btf_id) +{ + return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); +} + +/* list of non-sleepable functions that are otherwise on + * ALLOW_ERROR_INJECTION list + */ +BTF_SET_START(btf_non_sleepable_error_inject) +/* Three functions below can be called from sleepable and non-sleepable context. + * Assume non-sleepable from bpf safety point of view. + */ +BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, should_fail_alloc_page) +BTF_ID(func, should_failslab) +BTF_SET_END(btf_non_sleepable_error_inject) + +static int check_non_sleepable_error_inject(u32 btf_id) +{ + return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); +} + +int bpf_check_attach_target(struct bpf_verifier_log *log, + const struct bpf_prog *prog, + const struct bpf_prog *tgt_prog, + u32 btf_id, + struct bpf_attach_target_info *tgt_info) { - struct bpf_prog *prog = env->prog; bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; - struct bpf_prog *tgt_prog = prog->aux->linked_prog; - u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; - struct btf_func_model fmodel; int ret = 0, subprog = -1, i; - struct bpf_trampoline *tr; const struct btf_type *t; bool conservative = true; const char *tname; struct btf *btf; - long addr; - u64 key; - - if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) - return check_struct_ops_btf_id(env); - - if (prog->type != BPF_PROG_TYPE_TRACING && - prog->type != BPF_PROG_TYPE_LSM && - !prog_extension) - return 0; + long addr = 0; if (!btf_id) { - verbose(env, "Tracing programs must provide btf_id\n"); + bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } - btf = bpf_prog_get_target_btf(prog); + btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux; if (!btf) { - verbose(env, + bpf_log(log, "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); return -EINVAL; } t = btf_type_by_id(btf, btf_id); if (!t) { - verbose(env, "attach_btf_id %u is invalid\n", btf_id); + bpf_log(log, "attach_btf_id %u is invalid\n", btf_id); return -EINVAL; } tname = btf_name_by_offset(btf, t->name_off); if (!tname) { - verbose(env, "attach_btf_id %u doesn't have a name\n", btf_id); + bpf_log(log, "attach_btf_id %u doesn't have a name\n", btf_id); return -EINVAL; } if (tgt_prog) { @@ -10874,26 +11529,24 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) break; } if (subprog == -1) { - verbose(env, "Subprog %s doesn't exist\n", tname); + bpf_log(log, "Subprog %s doesn't exist\n", tname); return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; if (prog_extension) { if (conservative) { - verbose(env, + bpf_log(log, "Cannot replace static functions\n"); return -EINVAL; } if (!prog->jit_requested) { - verbose(env, + bpf_log(log, "Extension programs should be JITed\n"); return -EINVAL; } - env->ops = bpf_verifier_ops[tgt_prog->type]; - prog->expected_attach_type = tgt_prog->expected_attach_type; } if (!tgt_prog->jited) { - verbose(env, "Can attach to only JITed progs\n"); + bpf_log(log, "Can attach to only JITed progs\n"); return -EINVAL; } if (tgt_prog->type == prog->type) { @@ -10901,7 +11554,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) * Cannot attach program extension to another extension. * It's ok to attach fentry/fexit to extension program. */ - verbose(env, "Cannot recursively attach\n"); + bpf_log(log, "Cannot recursively attach\n"); return -EINVAL; } if (tgt_prog->type == BPF_PROG_TYPE_TRACING && @@ -10923,32 +11576,30 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) * reasonable stack size. Hence extending fentry is not * allowed. */ - verbose(env, "Cannot extend fentry/fexit\n"); + bpf_log(log, "Cannot extend fentry/fexit\n"); return -EINVAL; } - key = ((u64)aux->id) << 32 | btf_id; } else { if (prog_extension) { - verbose(env, "Cannot replace kernel functions\n"); + bpf_log(log, "Cannot replace kernel functions\n"); return -EINVAL; } - key = btf_id; } switch (prog->expected_attach_type) { case BPF_TRACE_RAW_TP: if (tgt_prog) { - verbose(env, + bpf_log(log, "Only FENTRY/FEXIT progs are attachable to another BPF prog\n"); return -EINVAL; } if (!btf_type_is_typedef(t)) { - verbose(env, "attach_btf_id %u is not a typedef\n", + bpf_log(log, "attach_btf_id %u is not a typedef\n", btf_id); return -EINVAL; } if (strncmp(prefix, tname, sizeof(prefix) - 1)) { - verbose(env, "attach_btf_id %u points to wrong type name %s\n", + bpf_log(log, "attach_btf_id %u points to wrong type name %s\n", btf_id, tname); return -EINVAL; } @@ -10962,29 +11613,20 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) /* should never happen in valid vmlinux build */ return -EINVAL; - /* remember two read only pointers that are valid for - * the life time of the kernel - */ - prog->aux->attach_func_name = tname; - prog->aux->attach_func_proto = t; - prog->aux->attach_btf_trace = true; - return 0; + break; case BPF_TRACE_ITER: if (!btf_type_is_func(t)) { - verbose(env, "attach_btf_id %u is not a function\n", + bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); return -EINVAL; } t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - prog->aux->attach_func_name = tname; - prog->aux->attach_func_proto = t; - if (!bpf_iter_prog_supported(prog)) - return -EINVAL; - ret = btf_distill_func_proto(&env->log, btf, t, - tname, &fmodel); - return ret; + ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel); + if (ret) + return ret; + break; default: if (!prog_extension) return -EINVAL; @@ -10993,42 +11635,30 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) case BPF_LSM_MAC: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - prog->aux->attach_func_name = tname; - if (prog->type == BPF_PROG_TYPE_LSM) { - ret = bpf_lsm_verify_prog(&env->log, prog); - if (ret < 0) - return ret; - } - if (!btf_type_is_func(t)) { - verbose(env, "attach_btf_id %u is not a function\n", + bpf_log(log, "attach_btf_id %u is not a function\n", btf_id); return -EINVAL; } if (prog_extension && - btf_check_type_match(env, prog, btf, t)) + btf_check_type_match(log, prog, btf, t)) return -EINVAL; t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) return -EINVAL; - tr = bpf_trampoline_lookup(key); - if (!tr) - return -ENOMEM; - /* t is either vmlinux type or another program's type */ - prog->aux->attach_func_proto = t; - mutex_lock(&tr->mutex); - if (tr->func.addr) { - prog->aux->trampoline = tr; - goto out; - } - if (tgt_prog && conservative) { - prog->aux->attach_func_proto = NULL; + + if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) && + (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type || + prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type)) + return -EINVAL; + + if (tgt_prog && conservative) t = NULL; - } - ret = btf_distill_func_proto(&env->log, btf, t, - tname, &tr->func.model); + + ret = btf_distill_func_proto(log, btf, t, tname, &tgt_info->fmodel); if (ret < 0) - goto out; + return ret; + if (tgt_prog) { if (subprog == 0) addr = (long) tgt_prog->bpf_func; @@ -11037,31 +11667,137 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } else { addr = kallsyms_lookup_name(tname); if (!addr) { - verbose(env, + bpf_log(log, "The address of function %s cannot be found\n", tname); - ret = -ENOENT; - goto out; + return -ENOENT; } } - if (prog->expected_attach_type == BPF_MODIFY_RETURN) { - ret = check_attach_modify_return(prog, addr); - if (ret) - verbose(env, "%s() is not modifiable\n", - prog->aux->attach_func_name); + if (prog->aux->sleepable) { + ret = -EINVAL; + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + /* fentry/fexit/fmod_ret progs can be sleepable only if they are + * attached to ALLOW_ERROR_INJECTION and are not in denylist. + */ + if (!check_non_sleepable_error_inject(btf_id) && + within_error_injection_list(addr)) + ret = 0; + break; + case BPF_PROG_TYPE_LSM: + /* LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. + */ + if (check_sleepable_lsm_hook(btf_id)) + ret = 0; + break; + default: + break; + } + if (ret) { + bpf_log(log, "%s is not sleepable\n", tname); + return ret; + } + } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + if (tgt_prog) { + bpf_log(log, "can't modify return codes of BPF programs\n"); + return -EINVAL; + } + ret = check_attach_modify_return(addr, tname); + if (ret) { + bpf_log(log, "%s() is not modifiable\n", tname); + return ret; + } } - if (ret) - goto out; - tr->func.addr = (void *)addr; - prog->aux->trampoline = tr; -out: - mutex_unlock(&tr->mutex); - if (ret) - bpf_trampoline_put(tr); + break; + } + tgt_info->tgt_addr = addr; + tgt_info->tgt_name = tname; + tgt_info->tgt_type = t; + return 0; +} + +static int check_attach_btf_id(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_prog *tgt_prog = prog->aux->dst_prog; + struct bpf_attach_target_info tgt_info = {}; + u32 btf_id = prog->aux->attach_btf_id; + struct bpf_trampoline *tr; + int ret; + u64 key; + + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM) { + verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + return -EINVAL; + } + + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) + return check_struct_ops_btf_id(env); + + if (prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM && + prog->type != BPF_PROG_TYPE_EXT) + return 0; + + ret = bpf_check_attach_target(&env->log, prog, tgt_prog, btf_id, &tgt_info); + if (ret) return ret; + + if (tgt_prog && prog->type == BPF_PROG_TYPE_EXT) { + /* to make freplace equivalent to their targets, they need to + * inherit env->ops and expected_attach_type for the rest of the + * verification + */ + env->ops = bpf_verifier_ops[tgt_prog->type]; + prog->expected_attach_type = tgt_prog->expected_attach_type; + } + + /* store info about the attachment target that will be used later */ + prog->aux->attach_func_proto = tgt_info.tgt_type; + prog->aux->attach_func_name = tgt_info.tgt_name; + + if (tgt_prog) { + prog->aux->saved_dst_prog_type = tgt_prog->type; + prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type; } + + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { + prog->aux->attach_btf_trace = true; + return 0; + } else if (prog->expected_attach_type == BPF_TRACE_ITER) { + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + return 0; + } + + if (prog->type == BPF_PROG_TYPE_LSM) { + ret = bpf_lsm_verify_prog(&env->log, prog); + if (ret < 0) + return ret; + } + + key = bpf_trampoline_compute_key(tgt_prog, btf_id); + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) + return -ENOMEM; + + prog->aux->dst_trampoline = tr; + return 0; +} + +struct btf *bpf_get_btf_vmlinux(void) +{ + if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + mutex_lock(&bpf_verifier_lock); + if (!btf_vmlinux) + btf_vmlinux = btf_parse_vmlinux(); + mutex_unlock(&bpf_verifier_lock); + } + return btf_vmlinux; } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, @@ -11097,12 +11833,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->ops = bpf_verifier_ops[env->prog->type]; is_priv = bpf_capable(); - if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { - mutex_lock(&bpf_verifier_lock); - if (!btf_vmlinux) - btf_vmlinux = btf_parse_vmlinux(); - mutex_unlock(&bpf_verifier_lock); - } + bpf_get_btf_vmlinux(); /* grab the mutex to protect few globals used by verifier */ if (!is_priv) @@ -11145,10 +11876,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) @@ -11174,6 +11901,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret) goto skip_full_check; + ret = resolve_pseudo_ldimm64(env); + if (ret < 0) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/capability.c b/kernel/capability.c index 7c59b096c98a..de7eac903a2a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -418,7 +418,7 @@ EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a - * setid syscall. + * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dd247747ec14..e41c21819ba0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2006,7 +2006,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); - kernfs_activate(root_cgrp->kn); ret = 0; goto out; @@ -3682,6 +3681,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, struct cgroup_subsys_state *css; int ret; + if (!nbytes) + return 0; + /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 642415b8c3c9..57b5b5d0a5fd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * The top cpuset doesn't have any online cpu as a * consequence of a race between cpuset_hotplug_work * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to to be + * cpuset's effective_cpus is on its way to be * identical to cpu_online_mask. */ cpumask_copy(pmask, cpu_online_mask); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index b16dbc1bf056..1e75a8923a8d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -80,7 +80,7 @@ static int exception_level; struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); -/* Action for the reboot notifiter, a global allow kdb to change it */ +/* Action for the reboot notifier, a global allow kdb to change it */ static int kgdbreboot; /* kgdb console driver is loaded */ static int kgdb_con_registered; @@ -94,14 +94,6 @@ int dbg_switch_cpu; /* Use kdb or gdbserver mode */ int dbg_kdb_mode = 1; -static int __init opt_kgdb_con(char *str) -{ - kgdb_use_con = 1; - return 0; -} - -early_param("kgdbcon", opt_kgdb_con); - module_param(kgdb_use_con, int, 0644); module_param(kgdbreboot, int, 0644); @@ -163,7 +155,7 @@ early_param("nokgdbroundup", opt_nokgdbroundup); /* * Weak aliases for breakpoint management, - * can be overriden by architectures when needed: + * can be overridden by architectures when needed: */ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { @@ -177,17 +169,23 @@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); return err; } +NOKPROBE_SYMBOL(kgdb_arch_set_breakpoint); int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { return copy_to_kernel_nofault((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } +NOKPROBE_SYMBOL(kgdb_arch_remove_breakpoint); int __weak kgdb_validate_break_address(unsigned long addr) { struct kgdb_bkpt tmp; int err; + + if (kgdb_within_blocklist(addr)) + return -EINVAL; + /* Validate setting the breakpoint and then removing it. If the * remove fails, the kernel needs to emit a bad message because we * are deep trouble not being able to put things back the way we @@ -208,6 +206,7 @@ unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); } +NOKPROBE_SYMBOL(kgdb_arch_pc); int __weak kgdb_arch_init(void) { @@ -218,6 +217,7 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) { return 0; } +NOKPROBE_SYMBOL(kgdb_skipexception); #ifdef CONFIG_SMP @@ -239,6 +239,7 @@ void __weak kgdb_call_nmi_hook(void *ignored) */ kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); } +NOKPROBE_SYMBOL(kgdb_call_nmi_hook); void __weak kgdb_roundup_cpus(void) { @@ -272,6 +273,7 @@ void __weak kgdb_roundup_cpus(void) kgdb_info[cpu].rounding_up = false; } } +NOKPROBE_SYMBOL(kgdb_roundup_cpus); #endif @@ -298,6 +300,7 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } +NOKPROBE_SYMBOL(kgdb_flush_swbreak_addr); /* * SW breakpoint management: @@ -325,6 +328,7 @@ int dbg_activate_sw_breakpoints(void) } return ret; } +NOKPROBE_SYMBOL(dbg_activate_sw_breakpoints); int dbg_set_sw_break(unsigned long addr) { @@ -388,6 +392,7 @@ int dbg_deactivate_sw_breakpoints(void) } return ret; } +NOKPROBE_SYMBOL(dbg_deactivate_sw_breakpoints); int dbg_remove_sw_break(unsigned long addr) { @@ -509,6 +514,7 @@ static int kgdb_io_ready(int print_wait) } return 1; } +NOKPROBE_SYMBOL(kgdb_io_ready); static int kgdb_reenter_check(struct kgdb_state *ks) { @@ -556,6 +562,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks) return 1; } +NOKPROBE_SYMBOL(kgdb_reenter_check); static void dbg_touch_watchdogs(void) { @@ -563,6 +570,7 @@ static void dbg_touch_watchdogs(void) clocksource_touch_watchdog(); rcu_cpu_stall_reset(); } +NOKPROBE_SYMBOL(dbg_touch_watchdogs); static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, int exception_state) @@ -752,6 +760,8 @@ cpu_master_loop: } } + dbg_activate_sw_breakpoints(); + /* Call the I/O driver's post_exception routine */ if (dbg_io_ops->post_exception) dbg_io_ops->post_exception(); @@ -794,6 +804,7 @@ kgdb_restore: return kgdb_info[cpu].ret_state; } +NOKPROBE_SYMBOL(kgdb_cpu_enter); /* * kgdb_handle_exception() - main entry point from a kernel exception @@ -838,6 +849,7 @@ out: arch_kgdb_ops.enable_nmi(1); return ret; } +NOKPROBE_SYMBOL(kgdb_handle_exception); /* * GDB places a breakpoint at this function to know dynamically loaded objects. @@ -872,6 +884,7 @@ int kgdb_nmicallback(int cpu, void *regs) #endif return 1; } +NOKPROBE_SYMBOL(kgdb_nmicallback); int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, atomic_t *send_ready) @@ -897,6 +910,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, #endif return 1; } +NOKPROBE_SYMBOL(kgdb_nmicallin); static void kgdb_console_write(struct console *co, const char *s, unsigned count) @@ -920,6 +934,20 @@ static struct console kgdbcons = { .index = -1, }; +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + + if (kgdb_io_module_registered && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + #ifdef CONFIG_MAGIC_SYSRQ static void sysrq_handle_dbg(int key) { diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index cc3c43dfec44..a77df59d9ca5 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -725,7 +725,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) } } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); ptr = pack_threadid(ptr, thref); @@ -735,7 +735,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) finished = 1; } i++; - } while_each_thread(g, p); + } *(--ptr) = '\0'; break; @@ -1061,7 +1061,6 @@ int gdb_serial_stub(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - dbg_activate_sw_breakpoints(); fallthrough; /* to default processing */ default: default_handle: diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index d7ebb2c79cb8..ec4940146612 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -307,6 +307,15 @@ static int kdb_bp(int argc, const char **argv) return KDB_BADINT; /* + * This check is redundant (since the breakpoint machinery should + * be doing the same check during kdb_bp_install) but gives the + * user immediate feedback. + */ + diag = kgdb_validate_break_address(template.bp_addr); + if (diag) + return diag; + + /* * Find an empty bp structure to allocate */ for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 18e03aba2cfc..1f9f0e47aeda 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -149,14 +149,14 @@ kdb_bt(int argc, const char **argv) return 0; } /* Now the inactive tasks */ - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (KDB_FLAG(CMD_INTERRUPT)) return 0; if (task_curr(p)) continue; if (kdb_bt1(p, mask, btaprompt)) return 0; - } kdb_while_each_thread(g, p); + } } else if (strcmp(argv[0], "btp") == 0) { struct task_struct *p; unsigned long pid; diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 53a0df6e4d92..0220afda3200 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -147,7 +147,6 @@ int kdb_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } kdb_bp_install(ks->linux_regs); - dbg_activate_sw_breakpoints(); /* Set the exit state to a single step or a continue */ if (KDB_STATE(DOING_SS)) gdbstub_state(ks, "s"); @@ -167,7 +166,6 @@ int kdb_stub(struct kgdb_state *ks) * differently vs the gdbstub */ kgdb_single_step = 0; - dbg_deactivate_sw_breakpoints(); return DBG_SWITCH_CPU_EVENT; } return kgdb_info[ks->cpu].ret_state; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9d847ab851db..6735ac36b718 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -545,18 +545,18 @@ static int kdb_search_string(char *searched, char *searchfor) static void kdb_msg_write(const char *msg, int msg_len) { struct console *c; + const char *cp; + int len; if (msg_len == 0) return; - if (dbg_io_ops) { - const char *cp = msg; - int len = msg_len; + cp = msg; + len = msg_len; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; } for_each_console(c) { @@ -706,12 +706,16 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) size_avail = sizeof(kdb_buffer) - len; goto kdb_print_out; } - if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) + if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) { /* * This was a interactive search (using '/' at more - * prompt) and it has completed. Clear the flag. + * prompt) and it has completed. Replace the \0 with + * its original value to ensure multi-line strings + * are handled properly, and return to normal mode. */ + *cphold = replaced_byte; kdb_grepping_flag = 0; + } /* * at this point the string is a full line and * should be printed, up to the null. diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 5c7949061671..930ac1b25ec7 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2299,10 +2299,10 @@ void kdb_ps_suppressed(void) if (kdb_task_state(p, mask_I)) ++idle; } - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (kdb_task_state(p, mask_M)) ++daemon; - } kdb_while_each_thread(g, p); + } if (idle || daemon) { if (idle) kdb_printf("%d idle process%s (state I)%s\n", @@ -2370,12 +2370,12 @@ static int kdb_ps(int argc, const char **argv) } kdb_printf("\n"); /* Now the real tasks */ - kdb_do_each_thread(g, p) { + for_each_process_thread(g, p) { if (KDB_FLAG(CMD_INTERRUPT)) return 0; if (kdb_task_state(p, mask)) kdb_ps1(p); - } kdb_while_each_thread(g, p); + } return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 2e296e4a234c..a4281fb99299 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -230,10 +230,6 @@ extern struct task_struct *kdb_curr_task(int); #define kdb_task_has_cpu(p) (task_curr(p)) -/* Simplify coexistence with NPTL */ -#define kdb_do_each_thread(g, p) do_each_thread(g, p) -#define kdb_while_each_thread(g, p) while_each_thread(g, p) - #define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) extern void *debug_kmalloc(size_t size, gfp_t flags); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 847a9d1fa634..c99de4a21458 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -9,6 +9,7 @@ config HAS_DMA default y config DMA_OPS + depends on HAS_DMA bool # @@ -43,6 +44,12 @@ config ARCH_HAS_DMA_SET_MASK config ARCH_HAS_DMA_WRITE_COMBINE bool +# +# Select if the architectures provides the arch_dma_mark_clean hook +# +config ARCH_HAS_DMA_MARK_CLEAN + bool + config DMA_DECLARE_COHERENT bool @@ -68,9 +75,6 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool -config DMA_NONCOHERENT_CACHE_SYNC - bool - config DMA_VIRT_OPS bool depends on HAS_DMA @@ -114,10 +118,21 @@ config DMA_CMA You can disable CMA by specifying "cma=0" on the kernel's command line. - For more information see <include/linux/dma-contiguous.h>. + For more information see <kernel/dma/contiguous.c>. If unsure, say "n". if DMA_CMA + +config DMA_PERNUMA_CMA + bool "Enable separate DMA Contiguous Memory Area for each NUMA Node" + default NUMA && ARM64 + help + Enable this option to get pernuma CMA areas so that devices like + ARM64 SMMU can get local memory by DMA coherent APIs. + + You can set the size of pernuma CMA by specifying "cma_pernuma=size" + on the kernel's command line. + comment "Default contiguous memory area size:" config CMA_SIZE_MBYTES @@ -162,7 +177,7 @@ endchoice config CMA_ALIGNMENT int "Maximum PAGE_SIZE order of alignment for contiguous buffers" - range 4 12 + range 2 12 default 8 help DMA mapping framework by default aligns all buffers to the smallest diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 32c7c1942bbd..dc755ab68aab 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_HAS_DMA) += mapping.o direct.o +obj-$(CONFIG_DMA_OPS) += ops_helpers.o obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 2a0c4985f38e..5b5b6c7ec7f2 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -7,7 +7,8 @@ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/dma-mapping.h> +#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> struct dma_coherent_mem { void *virt_base; @@ -32,9 +33,8 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, struct dma_coherent_mem * mem) { if (mem->use_dev_dma_pfn_offset) - return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT; - else - return mem->device_base; + return phys_to_dma(dev, PFN_PHYS(mem->pfn_base)); + return mem->device_base; } static int dma_init_coherent_memory(phys_addr_t phys_addr, @@ -107,6 +107,23 @@ static int dma_assign_coherent_memory(struct device *dev, return 0; } +/* + * Declare a region of memory to be handed out by dma_alloc_coherent() when it + * is asked for coherent memory for this device. This shall only be used + * from platform code, usually based on the device tree description. + * + * phys_addr is the CPU physical address to which the memory is currently + * assigned (this will be ioremapped so the CPU can access the region). + * + * device_addr is the DMA address the device needs to be programmed with to + * actually address this memory (this will be handed out as the dma_addr_t in + * dma_alloc_coherent()). + * + * size is the size of the area (must be a multiple of PAGE_SIZE). + * + * As a simplification for the platforms, only *one* such region of memory may + * be declared per device. + */ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size) { diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index cff7e60968b9..16b95ff12e4d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -5,6 +5,34 @@ * Written by: * Marek Szyprowski <m.szyprowski@samsung.com> * Michal Nazarewicz <mina86@mina86.com> + * + * Contiguous Memory Allocator + * + * The Contiguous Memory Allocator (CMA) makes it possible to + * allocate big contiguous chunks of memory after the system has + * booted. + * + * Why is it needed? + * + * Various devices on embedded systems have no scatter-getter and/or + * IO map support and require contiguous blocks of memory to + * operate. They include devices such as cameras, hardware video + * coders, etc. + * + * Such devices often require big memory buffers (a full HD frame + * is, for instance, more then 2 mega pixels large, i.e. more than 6 + * MB of memory), which makes mechanisms such as kmalloc() or + * alloc_page() ineffective. + * + * At the same time, a solution where a big memory region is + * reserved for a device is suboptimal since often more memory is + * reserved then strictly required and, moreover, the memory is + * inaccessible to page system even if device drivers don't use it. + * + * CMA tries to solve this issue by operating on memory regions + * where only movable pages can be allocated from. This way, kernel + * can use the memory for pagecache and when device driver requests + * it, allocated pages can be migrated. */ #define pr_fmt(fmt) "cma: " fmt @@ -16,12 +44,11 @@ #endif #include <asm/page.h> -#include <asm/dma-contiguous.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/sizes.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/cma.h> #ifdef CONFIG_CMA_SIZE_MBYTES @@ -69,20 +96,24 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); +#ifdef CONFIG_DMA_PERNUMA_CMA + +static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES]; +static phys_addr_t pernuma_size_bytes __initdata; + +static int __init early_cma_pernuma(char *p) +{ + pernuma_size_bytes = memparse(p, &p); + return 0; +} +early_param("cma_pernuma", early_cma_pernuma); +#endif + #ifdef CONFIG_CMA_SIZE_PERCENTAGE static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) { - struct memblock_region *reg; - unsigned long total_pages = 0; - - /* - * We cannot use memblock_phys_mem_size() here, because - * memblock_analyze() has not been called yet. - */ - for_each_memblock(memory, reg) - total_pages += memblock_region_memory_end_pfn(reg) - - memblock_region_memory_base_pfn(reg); + unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size()); return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; } @@ -96,6 +127,34 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) #endif +#ifdef CONFIG_DMA_PERNUMA_CMA +void __init dma_pernuma_cma_reserve(void) +{ + int nid; + + if (!pernuma_size_bytes) + return; + + for_each_online_node(nid) { + int ret; + char name[CMA_MAX_NAME]; + struct cma **cma = &dma_contiguous_pernuma_area[nid]; + + snprintf(name, sizeof(name), "pernuma%d", nid); + ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0, + 0, false, name, cma, nid); + if (ret) { + pr_warn("%s: reservation failed: err %d, node %d", __func__, + ret, nid); + continue; + } + + pr_debug("%s: reserved %llu MiB on node %d\n", __func__, + (unsigned long long)pernuma_size_bytes / SZ_1M, nid); + } +} +#endif + /** * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling * @limit: End address of the reserved memory (optional, 0 for any). @@ -143,6 +202,11 @@ void __init dma_contiguous_reserve(phys_addr_t limit) } } +void __weak +dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) +{ +} + /** * dma_contiguous_reserve_area() - reserve custom contiguous area * @size: Size of the reserved area (in bytes), @@ -228,23 +292,44 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) * @size: Requested allocation size. * @gfp: Allocation flags. * - * This function allocates contiguous memory buffer for specified device. It - * tries to use device specific contiguous memory area if available, or the - * default global one. + * tries to use device specific contiguous memory area if available, or it + * tries to use per-numa cma, if the allocation fails, it will fallback to + * try default global one. * - * Note that it byapss one-page size of allocations from the global area as - * the addresses within one page are always contiguous, so there is no need - * to waste CMA pages for that kind; it also helps reduce fragmentations. + * Note that it bypass one-page size of allocations from the per-numa and + * global area as the addresses within one page are always contiguous, so + * there is no need to waste CMA pages for that kind; it also helps reduce + * fragmentations. */ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) { +#ifdef CONFIG_DMA_PERNUMA_CMA + int nid = dev_to_node(dev); +#endif + /* CMA can be used only in the context which permits sleeping */ if (!gfpflags_allow_blocking(gfp)) return NULL; if (dev->cma_area) return cma_alloc_aligned(dev->cma_area, size, gfp); - if (size <= PAGE_SIZE || !dma_contiguous_default_area) + if (size <= PAGE_SIZE) + return NULL; + +#ifdef CONFIG_DMA_PERNUMA_CMA + if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) { + struct cma *cma = dma_contiguous_pernuma_area[nid]; + struct page *page; + + if (cma) { + page = cma_alloc_aligned(cma, size, gfp); + if (page) + return page; + } + } +#endif + if (!dma_contiguous_default_area) return NULL; + return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); } @@ -261,9 +346,27 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) */ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) { - if (!cma_release(dev_get_cma_area(dev), page, - PAGE_ALIGN(size) >> PAGE_SHIFT)) - __free_pages(page, get_order(size)); + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + /* if dev has its own cma, free page from there */ + if (dev->cma_area) { + if (cma_release(dev->cma_area, page, count)) + return; + } else { + /* + * otherwise, page is from either per-numa cma or default cma + */ +#ifdef CONFIG_DMA_PERNUMA_CMA + if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)], + page, count)) + return; +#endif + if (cma_release(dma_contiguous_default_area, page, count)) + return; + } + + /* not in any cma, free from buddy */ + __free_pages(page, get_order(size)); } /* @@ -279,14 +382,14 @@ void dma_free_contiguous(struct device *dev, struct page *page, size_t size) static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) { - dev_set_cma_area(dev, rmem->priv); + dev->cma_area = rmem->priv; return 0; } static void rmem_cma_device_release(struct reserved_mem *rmem, struct device *dev) { - dev_set_cma_area(dev, NULL); + dev->cma_area = NULL; } static const struct reserved_mem_ops rmem_cma_ops = { @@ -327,7 +430,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) dma_contiguous_early_fixup(rmem->base, rmem->size); if (default_cma) - dma_contiguous_set_default(cma); + dma_contiguous_default_area = cma; rmem->ops = &rmem_cma_ops; rmem->priv = cma; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 8e9f7b301c6d..14de1271463f 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -9,10 +9,9 @@ #include <linux/sched/task_stack.h> #include <linux/scatterlist.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/sched/task.h> #include <linux/stacktrace.h> -#include <linux/dma-debug.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/debugfs.h> @@ -24,8 +23,8 @@ #include <linux/ctype.h> #include <linux/list.h> #include <linux/slab.h> - #include <asm/sections.h> +#include "debug.h" #define HASH_SIZE 16384ULL #define HASH_FN_SHIFT 13 @@ -1219,7 +1218,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, entry->dev = dev; entry->type = dma_debug_single; entry->pfn = page_to_pfn(page); - entry->offset = offset, + entry->offset = offset; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; @@ -1235,7 +1234,6 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, add_dma_entry(entry); } -EXPORT_SYMBOL(debug_dma_map_page); void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -1290,7 +1288,6 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, return; check_unmap(&ref); } -EXPORT_SYMBOL(debug_dma_unmap_page); void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int mapped_ents, int direction) @@ -1310,7 +1307,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->type = dma_debug_sg; entry->dev = dev; entry->pfn = page_to_pfn(sg_page(s)); - entry->offset = s->offset, + entry->offset = s->offset; entry->size = sg_dma_len(s); entry->dev_addr = sg_dma_address(s); entry->direction = direction; @@ -1328,7 +1325,6 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, add_dma_entry(entry); } } -EXPORT_SYMBOL(debug_dma_map_sg); static int get_nr_mapped_entries(struct device *dev, struct dma_debug_entry *ref) @@ -1380,7 +1376,6 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, check_unmap(&ref); } } -EXPORT_SYMBOL(debug_dma_unmap_sg); void debug_dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t dma_addr, void *virt) @@ -1466,7 +1461,6 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, add_dma_entry(entry); } -EXPORT_SYMBOL(debug_dma_map_resource); void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) @@ -1484,7 +1478,6 @@ void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, check_unmap(&ref); } -EXPORT_SYMBOL(debug_dma_unmap_resource); void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) @@ -1503,7 +1496,6 @@ void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, check_sync(dev, &ref, true); } -EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); void debug_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, @@ -1523,7 +1515,6 @@ void debug_dma_sync_single_for_device(struct device *dev, check_sync(dev, &ref, false); } -EXPORT_SYMBOL(debug_dma_sync_single_for_device); void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, int direction) @@ -1556,7 +1547,6 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, check_sync(dev, &ref, true); } } -EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction) @@ -1588,7 +1578,6 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, check_sync(dev, &ref, false); } } -EXPORT_SYMBOL(debug_dma_sync_sg_for_device); static int __init dma_debug_driver_setup(char *str) { diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h new file mode 100644 index 000000000000..83643b3010b2 --- /dev/null +++ b/kernel/dma/debug.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2008 Advanced Micro Devices, Inc. + * + * Author: Joerg Roedel <joerg.roedel@amd.com> + */ + +#ifndef _KERNEL_DMA_DEBUG_H +#define _KERNEL_DMA_DEBUG_H + +#ifdef CONFIG_DMA_API_DEBUG +extern void debug_dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction, dma_addr_t dma_addr); + +extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction); + +extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction); + +extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir); + +extern void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt); + +extern void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr); + +extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, + size_t size, int direction, + dma_addr_t dma_addr); + +extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction); + +extern void debug_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, size_t size, + int direction); + +extern void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction); + +extern void debug_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, + int nelems, int direction); + +extern void debug_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, + int nelems, int direction); +#else /* CONFIG_DMA_API_DEBUG */ +static inline void debug_dma_map_page(struct device *dev, struct page *page, + size_t offset, size_t size, + int direction, dma_addr_t dma_addr) +{ +} + +static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction) +{ +} + +static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction) +{ +} + +static inline void debug_dma_unmap_sg(struct device *dev, + struct scatterlist *sglist, + int nelems, int dir) +{ +} + +static inline void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt) +{ +} + +static inline void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr) +{ +} + +static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} + +static inline void debug_dma_unmap_resource(struct device *dev, + dma_addr_t dma_addr, size_t size, + int direction) +{ +} + +static inline void debug_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction) +{ +} + +static inline void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, + size_t size, int direction) +{ +} + +static inline void debug_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sg, + int nelems, int direction) +{ +} + +static inline void debug_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, + int nelems, int direction) +{ +} +#endif /* CONFIG_DMA_API_DEBUG */ +#endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index db6ef07aec3b..06c111544f61 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -1,21 +1,22 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2018 Christoph Hellwig. + * Copyright (C) 2018-2020 Christoph Hellwig. * * DMA operations that map physical memory directly without using an IOMMU. */ #include <linux/memblock.h> /* for max_pfn */ #include <linux/export.h> #include <linux/mm.h> -#include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <linux/scatterlist.h> -#include <linux/dma-contiguous.h> #include <linux/pfn.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> +#include <linux/slab.h> +#include "direct.h" /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ @@ -25,7 +26,7 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { if (force_dma_unencrypted(dev)) - return __phys_to_dma(dev, phys); + return phys_to_dma_unencrypted(dev, phys); return phys_to_dma(dev, phys); } @@ -48,11 +49,6 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, { u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); - if (force_dma_unencrypted(dev)) - *phys_limit = __dma_to_phys(dev, dma_limit); - else - *phys_limit = dma_to_phys(dev, dma_limit); - /* * Optimistically try the zone that the physical address mask falls * into first. If that returns memory that isn't actually addressable @@ -61,6 +57,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ + *phys_limit = dma_to_phys(dev, dma_limit); if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; if (*phys_limit <= DMA_BIT_MASK(32)) @@ -70,45 +67,16 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { - return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); -} - -/* - * Decrypting memory is allowed to block, so if this device requires - * unencrypted memory it must come from atomic pools. - */ -static inline bool dma_should_alloc_from_pool(struct device *dev, gfp_t gfp, - unsigned long attrs) -{ - if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) - return false; - if (gfpflags_allow_blocking(gfp)) - return false; - if (force_dma_unencrypted(dev)) - return true; - if (!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) - return false; - if (dma_alloc_need_uncached(dev, attrs)) - return true; - return false; -} + dma_addr_t dma_addr = phys_to_dma_direct(dev, phys); -static inline bool dma_should_free_from_pool(struct device *dev, - unsigned long attrs) -{ - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL)) - return true; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev)) + if (dma_addr == DMA_MAPPING_ERROR) return false; - if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) - return true; - return false; + return dma_addr + size - 1 <= + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp, unsigned long attrs) + gfp_t gfp) { int node = dev_to_node(dev); struct page *page = NULL; @@ -116,11 +84,6 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, WARN_ON_ONCE(!PAGE_ALIGNED(size)); - if (attrs & DMA_ATTR_NO_WARN) - gfp |= __GFP_NOWARN; - - /* we always manually zero the memory once we are done: */ - gfp &= ~__GFP_ZERO; gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); @@ -151,7 +114,23 @@ again: return page; } -void *dma_direct_alloc_pages(struct device *dev, size_t size, +static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct page *page; + u64 phys_mask; + void *ret; + + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); + page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); + if (!page) + return NULL; + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return ret; +} + +void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { struct page *page; @@ -159,35 +138,44 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, int err; size = PAGE_ALIGN(size); - - if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - u64 phys_mask; - - gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); - page = dma_alloc_from_pool(dev, size, &ret, gfp, - dma_coherent_ok); - if (!page) - return NULL; - goto done; - } - - page = __dma_direct_alloc_pages(dev, size, gfp, attrs); - if (!page) - return NULL; + if (attrs & DMA_ATTR_NO_WARN) + gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - ret = page; - goto done; + return page; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + + /* + * Remapping or decrypting memory may block. If either is required and + * we can't block, allocate the memory from the atomic pools. + */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + !gfpflags_allow_blocking(gfp) && + (force_dma_unencrypted(dev) || + (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev)))) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + + /* we always manually zero the memory once we are done */ + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) || + !dev_is_dma_coherent(dev)) || (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); @@ -230,17 +218,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - dma_alloc_need_uncached(dev, attrs)) { + !dev_is_dma_coherent(dev)) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) goto out_encrypt_pages; } done: - if (force_dma_unencrypted(dev)) - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - else - *dma_handle = phys_to_dma(dev, page_to_phys(page)); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; out_encrypt_pages: @@ -256,16 +241,11 @@ out_free_pages: return NULL; } -void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) +void dma_direct_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int page_order = get_order(size); - /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ - if (dma_should_free_from_pool(dev, attrs) && - dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) - return; - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ @@ -273,6 +253,18 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !dev_is_dma_coherent(dev)) { + arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); + return; + } + + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) + return; + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); @@ -284,25 +276,60 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size); } -void *dma_direct_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +struct page *dma_direct_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); - return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + struct page *page; + void *ret; + + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp)) + return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); + + page = __dma_direct_alloc_pages(dev, size, gfp); + if (!page) + return NULL; + if (PageHighMem(page)) { + /* + * Depending on the cma= arguments and per-arch setup + * dma_alloc_contiguous could return highmem pages. + * Without remapping there is no way to return them here, + * so log an error and fail. + */ + dev_info(dev, "Rejecting highmem page from CMA.\n"); + goto out_free_pages; + } + + ret = page_address(page); + if (force_dma_unencrypted(dev)) { + if (set_memory_decrypted((unsigned long)ret, + 1 << get_order(size))) + goto out_free_pages; + } + memset(ret, 0, size); + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return page; +out_free_pages: + dma_free_contiguous(dev, page, size); + return NULL; } -void dma_direct_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) +void dma_direct_free_pages(struct device *dev, size_t size, + struct page *page, dma_addr_t dma_addr, + enum dma_data_direction dir) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - dma_alloc_need_uncached(dev, attrs)) - arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); - else - dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); + unsigned int page_order = get_order(size); + void *vaddr = page_address(page); + + /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ + if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && + dma_free_from_pool(dev, vaddr, size)) + return; + + if (force_dma_unencrypted(dev)) + set_memory_encrypted((unsigned long)vaddr, 1 << page_order); + + dma_free_contiguous(dev, page, size); } #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ @@ -345,6 +372,9 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, SYNC_FOR_CPU); + + if (dir == DMA_FROM_DEVICE) + arch_dma_mark_clean(paddr, sg->length); } if (!dev_is_dma_coherent(dev)) @@ -453,13 +483,13 @@ int dma_direct_supported(struct device *dev, u64 mask) return 1; /* - * This check needs to be against the actual bit mask value, so - * use __phys_to_dma() here so that the SME encryption mask isn't + * This check needs to be against the actual bit mask value, so use + * phys_to_dma_unencrypted() here so that the SME encryption mask isn't * part of the check. */ if (IS_ENABLED(CONFIG_ZONE_DMA)) min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); - return mask >= __phys_to_dma(dev, min_mask); + return mask >= phys_to_dma_unencrypted(dev, min_mask); } size_t dma_direct_max_mapping_size(struct device *dev) @@ -476,3 +506,45 @@ bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) return !dev_is_dma_coherent(dev) || is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); } + +/** + * dma_direct_set_offset - Assign scalar offset for a single DMA range. + * @dev: device pointer; needed to "own" the alloced memory. + * @cpu_start: beginning of memory region covered by this offset. + * @dma_start: beginning of DMA/PCI region covered by this offset. + * @size: size of the region. + * + * This is for the simple case of a uniform offset which cannot + * be discovered by "dma-ranges". + * + * It returns -ENOMEM if out of memory, -EINVAL if a map + * already exists, 0 otherwise. + * + * Note: any call to this from a driver is a bug. The mapping needs + * to be described by the device tree or other firmware interfaces. + */ +int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, + dma_addr_t dma_start, u64 size) +{ + struct bus_dma_region *map; + u64 offset = (u64)cpu_start - (u64)dma_start; + + if (dev->dma_range_map) { + dev_err(dev, "attempt to add DMA range to existing map\n"); + return -EINVAL; + } + + if (!offset) + return 0; + + map = kcalloc(2, sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + map[0].cpu_start = cpu_start; + map[0].dma_start = dma_start; + map[0].offset = offset; + map[0].size = size; + dev->dma_range_map = map; + return 0; +} +EXPORT_SYMBOL_GPL(dma_direct_set_offset); diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h new file mode 100644 index 000000000000..b98615578737 --- /dev/null +++ b/kernel/dma/direct.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Christoph Hellwig. + * + * DMA operations that map physical memory directly without using an IOMMU. + */ +#ifndef _KERNEL_DMA_DIRECT_H +#define _KERNEL_DMA_DIRECT_H + +#include <linux/dma-direct.h> + +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_can_mmap(struct device *dev); +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs); +size_t dma_direct_max_mapping_size(struct device *dev); + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_SWIOTLB) +void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs); +void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir); +#else +static inline void dma_direct_unmap_sg(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline void dma_direct_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ +} +#endif + +static inline void dma_direct_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_for_device(paddr, size, dir); +} + +static inline void dma_direct_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = dma_to_phys(dev, addr); + + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); + } + + if (unlikely(is_swiotlb_buffer(paddr))) + swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); + + if (dir == DMA_FROM_DEVICE) + arch_dma_mark_clean(paddr, size); +} + +static inline dma_addr_t dma_direct_map_page(struct device *dev, + struct page *page, unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dma_addr = phys_to_dma(dev, phys); + + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; + } + + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(phys, size, dir); + return dma_addr; +} + +static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + phys_addr_t phys = dma_to_phys(dev, addr); + + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_direct_sync_single_for_cpu(dev, addr, size, dir); + + if (unlikely(is_swiotlb_buffer(phys))) + swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); +} +#endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index 05607642c888..eacd4c5b10bf 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -2,7 +2,7 @@ /* * Dummy DMA ops that always fail. */ -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> static int dma_dummy_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, @@ -36,4 +36,3 @@ const struct dma_map_ops dma_dummy_ops = { .map_sg = dma_dummy_map_sg, .dma_supported = dma_dummy_supported, }; -EXPORT_SYMBOL(dma_dummy_ops); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 0d129421e75f..51bb8fa8eb89 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -7,13 +7,14 @@ */ #include <linux/memblock.h> /* for max_pfn */ #include <linux/acpi.h> -#include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/of_device.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include "debug.h" +#include "direct.h" /* * Managed DMA API @@ -144,6 +145,10 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + if (dma_map_direct(dev, ops)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else @@ -179,6 +184,10 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, int ents; BUG_ON(!valid_dma_direction(dir)); + + if (WARN_ON_ONCE(!dev->dma_mask)) + return 0; + if (dma_map_direct(dev, ops)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else @@ -213,6 +222,9 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, BUG_ON(!valid_dma_direction(dir)); + if (WARN_ON_ONCE(!dev->dma_mask)) + return DMA_MAPPING_ERROR; + /* Don't allow RAM to be mapped */ if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; @@ -296,22 +308,6 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, EXPORT_SYMBOL(dma_sync_sg_for_device); /* - * Create scatter-list for the already allocated DMA buffer. - */ -int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - struct page *page = virt_to_page(cpu_addr); - int ret; - - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); - if (!ret) - sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); - return ret; -} - -/* * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems * that the intention is to allow exporting memory allocated via the * coherent DMA APIs through the dma_buf API, which only accepts a @@ -346,9 +342,7 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { if (force_dma_unencrypted(dev)) prot = pgprot_decrypted(prot); - if (dev_is_dma_coherent(dev) || - (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) && - (attrs & DMA_ATTR_NON_CONSISTENT))) + if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE if (attrs & DMA_ATTR_WRITE_COMBINE) @@ -358,35 +352,6 @@ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) } #endif /* CONFIG_MMU */ -/* - * Create userspace mapping for the DMA-coherent memory. - */ -int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ -#ifdef CONFIG_MMU - unsigned long user_count = vma_pages(vma); - unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; - int ret = -ENXIO; - - vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); - - if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) - return ret; - - if (off >= count || user_count > count - off) - return -ENXIO; - - return remap_pfn_range(vma, vma->vm_start, - page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, - user_count << PAGE_SHIFT, vma->vm_page_prot); -#else - return -ENXIO; -#endif /* CONFIG_MMU */ -} - /** * dma_can_mmap - check if a given device supports dma_mmap_* * @dev: device to check @@ -506,6 +471,86 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + if (WARN_ON_ONCE(!dev->coherent_dma_mask)) + return NULL; + if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) + return NULL; + + size = PAGE_ALIGN(size); + if (dma_alloc_direct(dev, ops)) + page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + else if (ops->alloc_pages) + page = ops->alloc_pages(dev, size, dma_handle, dir, gfp); + else + return NULL; + + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); + + return page; +} +EXPORT_SYMBOL_GPL(dma_alloc_pages); + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + size = PAGE_ALIGN(size); + debug_dma_unmap_page(dev, dma_handle, size, dir); + + if (dma_alloc_direct(dev, ops)) + dma_direct_free_pages(dev, size, page, dma_handle, dir); + else if (ops->free_pages) + ops->free_pages(dev, size, page, dma_handle, dir); +} +EXPORT_SYMBOL_GPL(dma_free_pages); + +void *dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + void *vaddr; + + if (!ops || !ops->alloc_noncoherent) { + struct page *page; + + page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); + if (!page) + return NULL; + return page_address(page); + } + + size = PAGE_ALIGN(size); + vaddr = ops->alloc_noncoherent(dev, size, dma_handle, dir, gfp); + if (vaddr) + debug_dma_map_page(dev, virt_to_page(vaddr), 0, size, dir, + *dma_handle); + return vaddr; +} +EXPORT_SYMBOL_GPL(dma_alloc_noncoherent); + +void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (!ops || !ops->free_noncoherent) { + dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); + return; + } + + size = PAGE_ALIGN(size); + debug_dma_unmap_page(dev, dma_handle, size, dir); + ops->free_noncoherent(dev, size, vaddr, dma_handle, dir); +} +EXPORT_SYMBOL_GPL(dma_free_noncoherent); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -563,20 +608,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) EXPORT_SYMBOL(dma_set_coherent_mask); #endif -void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - - if (dma_alloc_direct(dev, ops)) - arch_dma_cache_sync(dev, vaddr, size, dir); - else if (ops->cache_sync) - ops->cache_sync(dev, vaddr, size, dir); -} -EXPORT_SYMBOL(dma_cache_sync); - size_t dma_max_mapping_size(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c new file mode 100644 index 000000000000..910ae69cae77 --- /dev/null +++ b/kernel/dma/ops_helpers.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for DMA ops implementations. These generally rely on the fact that + * the allocated memory contains normal pages in the direct kernel mapping. + */ +#include <linux/dma-map-ops.h> + +/* + * Create scatter-list for the already allocated DMA buffer. + */ +int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + struct page *page = virt_to_page(cpu_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} + +/* + * Create userspace mapping for the DMA-coherent memory. + */ +int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ +#ifdef CONFIG_MMU + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long off = vma->vm_pgoff; + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (off >= count || user_count > count - off) + return -ENXIO; + + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +#else + return -ENXIO; +#endif /* CONFIG_MMU */ +} + +struct page *dma_common_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct page *page; + + page = dma_alloc_contiguous(dev, size, gfp); + if (!page) + page = alloc_pages_node(dev_to_node(dev), gfp, get_order(size)); + if (!page) + return NULL; + + *dma_handle = ops->map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + if (*dma_handle == DMA_MAPPING_ERROR) { + dma_free_contiguous(dev, page, size); + return NULL; + } + + memset(page_address(page), 0, size); + return page; +} + +void dma_common_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops->unmap_page) + ops->unmap_page(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + dma_free_contiguous(dev, page, size); +} diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 1281c0f0442b..d4637f72239b 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -5,9 +5,8 @@ */ #include <linux/cma.h> #include <linux/debugfs.h> -#include <linux/dma-contiguous.h> +#include <linux/dma-map-ops.h> #include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> #include <linux/init.h> #include <linux/genalloc.h> #include <linux/set_memory.h> @@ -115,7 +114,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, #endif /* * Memory in the atomic DMA pools must be unencrypted, the pools do not - * shrink so no re-encryption occurs in dma_direct_free_pages(). + * shrink so no re-encryption occurs in dma_direct_free(). */ ret = set_memory_decrypted((unsigned long)page_to_virt(page), 1 << order); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 78b23f089cf1..905c3fa005f1 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -2,7 +2,7 @@ /* * Copyright (c) 2014 The Linux Foundation */ -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/slab.h> #include <linux/vmalloc.h> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c19379fabd20..b4eea0abc3f0 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,7 +22,7 @@ #include <linux/cache.h> #include <linux/dma-direct.h> -#include <linux/dma-noncoherent.h> +#include <linux/dma-map-ops.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/spinlock.h> @@ -93,7 +93,7 @@ static unsigned int io_tlb_index; * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). */ -unsigned int max_segment; +static unsigned int max_segment; /* * We need to save away the original address corresponding to a mapped entry @@ -172,9 +172,7 @@ void swiotlb_print_info(void) return; } - pr_info("mapped [mem %#010llx-%#010llx] (%luMB)\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, bytes >> 20); } @@ -670,13 +668,13 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, swiotlb_force); swiotlb_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), + phys_to_dma_unencrypted(dev, io_tlb_start), paddr, size, size, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - dma_addr = __phys_to_dma(dev, swiotlb_addr); + dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c index ebe128833af7..59d32317dd57 100644 --- a/kernel/dma/virt.c +++ b/kernel/dma/virt.c @@ -4,7 +4,7 @@ */ #include <linux/export.h> #include <linux/mm.h> -#include <linux/dma-mapping.h> +#include <linux/dma-map-ops.h> #include <linux/scatterlist.h> static void *dma_virt_alloc(struct device *dev, size_t size, @@ -55,5 +55,7 @@ const struct dma_map_ops dma_virt_ops = { .free = dma_virt_free, .map_page = dma_virt_map_page, .map_sg = dma_virt_map_sg, + .alloc_pages = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, }; EXPORT_SYMBOL(dma_virt_ops); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 145ab11b8318..2b8366693d5c 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -161,7 +161,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, arch_do_signal(regs); if (ti_work & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); } @@ -304,7 +303,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * terminate a grace period, if and only if the timer interrupt is * not nested into another interrupt. * - * Checking for __rcu_is_watching() here would prevent the nesting + * Checking for rcu_is_watching() here would prevent the nesting * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index eb1a8a4c867c..b6678a5e3cf6 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -16,10 +16,8 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NEED_RESCHED) schedule(); - if (ti_work & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(NULL); - } ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e18aaf23a7b..00b0358739ab 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1823,7 +1823,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); - task_work_add(t, &t->utask->dup_xol_work, true); + task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* diff --git a/kernel/exit.c b/kernel/exit.c index 733e80f334e7..87a2d515de0d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1474,23 +1474,6 @@ end: return retval; } -static struct pid *pidfd_get_pid(unsigned int fd) -{ - struct fd f; - struct pid *pid; - - f = fdget(fd); - if (!f.file) - return ERR_PTR(-EBADF); - - pid = pidfd_pid(f.file); - if (!IS_ERR(pid)) - get_pid(pid); - - fdput(f); - return pid; -} - static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { @@ -1498,6 +1481,7 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, struct pid *pid = NULL; enum pid_type type; long ret; + unsigned int f_flags = 0; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) @@ -1531,9 +1515,10 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, if (upid < 0) return -EINVAL; - pid = pidfd_get_pid(upid); + pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); + break; default: return -EINVAL; @@ -1544,7 +1529,12 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_flags = options; wo.wo_info = infop; wo.wo_rusage = ru; + if (f_flags & O_NONBLOCK) + wo.wo_flags |= WNOHANG; + ret = do_wait(&wo); + if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK)) + ret = -EAGAIN; put_pid(pid); return ret; diff --git a/kernel/fork.c b/kernel/fork.c index da8d360fb032..32083db7a2a2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -95,6 +95,7 @@ #include <linux/stackleak.h> #include <linux/kasan.h> #include <linux/scs.h> +#include <linux/io_uring.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -555,10 +556,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); + put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); + mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, @@ -589,7 +590,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt, tmp); + retval = copy_page_range(tmp, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -728,6 +729,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); @@ -1810,6 +1812,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) free_task(tsk); } +static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) +{ + /* Skip if kernel thread */ + if (!tsk->mm) + return; + + /* Skip if spawning a thread or using vfork */ + if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) + return; + + /* We need to synchronize with __set_oom_adj */ + mutex_lock(&oom_adj_mutex); + set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + /* Update the values in case they were changed after copy_signal */ + tsk->signal->oom_score_adj = current->signal->oom_score_adj; + tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; + mutex_unlock(&oom_adj_mutex); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1983,6 +2004,10 @@ static __latent_entropy struct task_struct *copy_process( p->vtime.state = VTIME_INACTIVE; #endif +#ifdef CONFIG_IO_URING + p->io_uring = NULL; +#endif + #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif @@ -2164,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process( /* * Ensure that the cgroup subsystem policies allow the new process to be - * forked. It should be noted the the new process's css_set can be changed + * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ @@ -2282,6 +2307,8 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + copy_oom_score_adj(clone_flags, p); + return p; bad_fork_cancel_cgroup: @@ -2385,14 +2412,14 @@ struct mm_struct *copy_init_mm(void) * * args->exit_signal is expected to be checked for sanity by the caller. */ -long _do_fork(struct kernel_clone_args *args) +pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; - long nr; + pid_t nr; /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument @@ -2478,7 +2505,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .stack_size = (unsigned long)arg, }; - return _do_fork(&args); + return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK @@ -2489,7 +2516,7 @@ SYSCALL_DEFINE0(fork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2505,7 +2532,7 @@ SYSCALL_DEFINE0(vfork) .exit_signal = SIGCHLD, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2543,7 +2570,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, .tls = tls, }; - return _do_fork(&args); + return kernel_clone(&args); } #endif @@ -2701,7 +2728,7 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) if (!clone3_args_valid(&kargs)) return -EINVAL; - return _do_fork(&kargs); + return kernel_clone(&kargs); } #endif @@ -2864,7 +2891,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* - * functions used by _do_fork() cannot be used here directly + * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. diff --git a/kernel/futex.c b/kernel/futex.c index 39681bf8b06c..f8614ef4ff31 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> +#include <linux/time_namespace.h> #include <asm/futex.h> @@ -916,7 +917,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * [10] Found | Found | task | !=taskTID | 0/1 | Invalid * * [1] Indicates that the kernel can acquire the futex atomically. We - * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. + * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. * * [2] Valid, if TID does not belong to a kernel thread. If no matching * thread is found then it indicates that the owner TID has died. @@ -3799,6 +3800,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } /* @@ -3991,6 +3994,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || diff --git a/kernel/groups.c b/kernel/groups.c index 6ee6691f6839..fe7e6385530e 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -178,7 +178,7 @@ bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); - return ns_capable(user_ns, CAP_SETGID) && + return ns_capable_setid(user_ns, CAP_SETGID) && userns_may_setgroups(user_ns); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5df903fccb60..c460e0496006 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1162,7 +1162,7 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; init_task_work(&on_exit_work, irq_thread_dtor); - task_work_add(current, &on_exit_work, false); + task_work_add(current, &on_exit_work, TWA_NONE); irq_thread_check_affinity(desc, action); diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index e960d7ce7bcc..773b6105c4ae 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -604,7 +604,7 @@ int irq_timings_alloc(int irq) /* * Some platforms can have the same private interrupt per cpu, - * so this function may be be called several times with the + * so this function may be called several times with the * same interrupt number. Just bail out in case the per cpu * stat structure is already allocated. */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index e661c61b3d6b..015ef903ce8c 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -19,7 +19,7 @@ #include <linux/cpu.h> #include <asm/sections.h> -/* mutex to protect coming/going of the the jump_label table */ +/* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 4fb15fa96734..fe9de067771c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -40,10 +40,10 @@ extern const u8 kallsyms_names[] __weak; * has one (eg: FRV). */ extern const unsigned int kallsyms_num_syms -__attribute__((weak, section(".rodata"))); +__section(".rodata") __attribute__((weak)); extern const unsigned long kallsyms_relative_base -__attribute__((weak, section(".rodata"))); +__section(".rodata") __attribute__((weak)); extern const char kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index f03562aaf2eb..1a6db2f797ac 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -32,7 +32,7 @@ * 1. different addresses but with the same encoded address race; * 2. and both map onto the same watchpoint slots; * - * Both these are assumed to be very unlikely. However, in case it still happens + * Both these are assumed to be very unlikely. However, in case it still * happens, the report logic will filter out the false positive (see report.c). */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) diff --git a/kernel/kexec.c b/kernel/kexec.c index f977786fe498..c82c6c06f051 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -205,7 +205,7 @@ static inline int kexec_load_check(unsigned long nr_segments, return -EPERM; /* Permit LSMs and IMA to fail the kexec */ - result = security_kernel_load_data(LOADING_KEXEC_IMAGE); + result = security_kernel_load_data(LOADING_KEXEC_IMAGE, false); if (result < 0) return result; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c19c0dad1ebe..8798a8183974 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -36,7 +36,7 @@ #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> -#include <linux/frame.h> +#include <linux/objtool.h> #include <asm/page.h> #include <asm/sections.h> @@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded); * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the - * the new kernel is placed in the control_code_buffer, whose size + * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index ca40bef75a61..e21f6b9234f7 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -24,6 +24,7 @@ #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/kernel.h> +#include <linux/kernel_read_file.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> #include "kexec_internal.h" @@ -219,13 +220,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, { int ret; void *ldata; - loff_t size; - ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, - &size, INT_MAX, READING_KEXEC_IMAGE); - if (ret) + ret = kernel_read_file_from_fd(kernel_fd, 0, &image->kernel_buf, + INT_MAX, NULL, READING_KEXEC_IMAGE); + if (ret < 0) return ret; - image->kernel_buf_len = size; + image->kernel_buf_len = ret; /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, @@ -241,12 +241,13 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, - &size, INT_MAX, + ret = kernel_read_file_from_fd(initrd_fd, 0, &image->initrd_buf, + INT_MAX, NULL, READING_KEXEC_INITRAMFS); - if (ret) + if (ret < 0) goto out; - image->initrd_buf_len = size; + image->initrd_buf_len = ret; + ret = 0; } if (cmdline_len) { @@ -520,7 +521,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) /* Returning 0 will take to next memory range */ /* Don't use memory that will be detected and handled by a driver. */ - if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED) + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) return 0; if (sz < kbuf->memsz) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 789002da7766..8a12a25fa40d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2614,7 +2614,7 @@ static int __init init_kprobes(void) init_test_probes(); return err; } -subsys_initcall(init_kprobes); +early_initcall(init_kprobes); #ifdef CONFIG_DEBUG_FS static void report_probe(struct seq_file *pi, struct kprobe *p, diff --git a/kernel/kthread.c b/kernel/kthread.c index 3edaa380dc7b..e29773c82b70 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it - * it to a given CPU and the associated NUMA node. + * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c index 7ee19476de9d..2565d039ade0 100644 --- a/kernel/livepatch/state.c +++ b/kernel/livepatch/state.c @@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state); * * The function can be called only during transition when a new * livepatch is being enabled or when such a transition is reverted. - * It is typically called only from from pre/post (un)patch + * It is typically called only from pre/post (un)patch * callbacks. * * Return: pointer to the latest struct klp_state from already diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9cfa5e89cff7..62d215b2e39f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -566,7 +566,7 @@ static struct lock_torture_ops rwsem_lock_ops = { #include <linux/percpu-rwsem.h> static struct percpu_rw_semaphore pcpu_rwsem; -void torture_percpu_rwsem_init(void) +static void torture_percpu_rwsem_init(void) { BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } diff --git a/kernel/module.c b/kernel/module.c index c075a18103fb..a4fa44a652a7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kernel.h> +#include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/elf.h> @@ -91,8 +92,9 @@ EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); /* Work queue for freeing init sections in success case */ -static struct work_struct init_free_wq; -static struct llist_head init_free_list; +static void do_free_init(struct work_struct *w); +static DECLARE_WORK(init_free_wq, do_free_init); +static LLIST_HEAD(init_free_list); #ifdef CONFIG_MODULES_TREE_LOOKUP @@ -2096,8 +2098,11 @@ static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, int i; for (i = 0; i < hdr->e_shnum; i++) { - if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) + if ((sechdrs[i].sh_flags & shf_wx) == shf_wx) { + pr_err("%s: section %s (index %d) has invalid WRITE|EXEC flags\n", + mod->name, secstrings + sechdrs[i].sh_name, i); return -ENOEXEC; + } } return 0; @@ -3013,7 +3018,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_load_data(LOADING_MODULE); + err = security_kernel_load_data(LOADING_MODULE, true); if (err) return err; @@ -3023,11 +3028,17 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return -ENOMEM; if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { - vfree(info->hdr); - return -EFAULT; + err = -EFAULT; + goto out; } - return 0; + err = security_kernel_post_load_data((char *)info->hdr, info->len, + LOADING_MODULE, "init_module"); +out: + if (err) + vfree(info->hdr); + + return err; } static void free_copy(struct load_info *info) @@ -3584,14 +3595,6 @@ static void do_free_init(struct work_struct *w) } } -static int __init modules_wq_init(void) -{ - INIT_WORK(&init_free_wq, do_free_init); - init_llist_head(&init_free_list); - return 0; -} -module_init(modules_wq_init); - /* * This is where the real work happens. * @@ -3834,8 +3837,10 @@ static int load_module(struct load_info *info, const char __user *uargs, char *after_dashes; err = elf_header_check(info); - if (err) + if (err) { + pr_err("Module has invalid ELF header\n"); goto free_copy; + } err = setup_load_info(info, flags); if (err) @@ -3843,6 +3848,7 @@ static int load_module(struct load_info *info, const char __user *uargs, if (blacklisted(info->name)) { err = -EPERM; + pr_err("Module %s is blacklisted\n", info->name); goto free_copy; } @@ -4043,8 +4049,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { struct load_info info = { }; - loff_t size; - void *hdr; + void *hdr = NULL; int err; err = may_init_module(); @@ -4057,12 +4062,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX, + err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL, READING_MODULE); - if (err) + if (err < 0) return err; info.hdr = hdr; - info.len = size; + info.len = err; return load_module(&info, uargs, flags); } diff --git a/kernel/panic.c b/kernel/panic.c index aef8872ba843..396142ee43fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -589,6 +589,11 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (args) vprintk(args->fmt, args->args); + print_modules(); + + if (regs) + show_regs(regs); + if (panic_on_warn) { /* * This thread may hit another WARN() in the panic path. @@ -600,12 +605,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - print_modules(); - - if (regs) - show_regs(regs); - else - dump_stack(); + dump_stack(); print_irqtrace_events(current); diff --git a/kernel/params.c b/kernel/params.c index 8e56f8b12d8f..164d79330849 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -233,14 +233,15 @@ char *parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); -STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); -STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); -STANDARD_PARAM_DEF(long, long, "%li", kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); -STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); +STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); int param_set_charp(const char *val, const struct kernel_param *kp) { @@ -529,7 +530,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[0]; + struct param_attribute attrs[]; }; #ifdef CONFIG_SYSFS diff --git a/kernel/pid.c b/kernel/pid.c index b2562a7ce525..a96bc4bf4f86 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,6 +43,7 @@ #include <linux/sched/task.h> #include <linux/idr.h> #include <net/sock.h> +#include <uapi/linux/pidfd.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -519,10 +520,30 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) +{ + struct fd f; + struct pid *pid; + + f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + + pid = pidfd_pid(f.file); + if (!IS_ERR(pid)) { + get_pid(pid); + *flags = f.file->f_flags; + } + + fdput(f); + return pid; +} + /** * pidfd_create() - Create a new pid file descriptor. * - * @pid: struct pid that the pidfd will reference + * @pid: struct pid that the pidfd will reference + * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * @@ -532,12 +553,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -static int pidfd_create(struct pid *pid) +static int pidfd_create(struct pid *pid, unsigned int flags) { int fd; fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); + flags | O_RDWR | O_CLOEXEC); if (fd < 0) put_pid(pid); @@ -565,7 +586,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags) + if (flags & ~PIDFD_NONBLOCK) return -EINVAL; if (pid <= 0) @@ -576,7 +597,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) return -ESRCH; if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p); + fd = pidfd_create(p, flags); else fd = -EINVAL; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..9de21803a8ae 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * - * The code relies on the the pid_ns->child_reaper ignoring + * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1dee70815f3c..2fc7d509a34f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -946,17 +946,6 @@ static int software_resume(void) /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); - - /* - * name_to_dev_t is ineffective to verify parition if resume_file is in - * integer format. (e.g. major:minor) - */ - if (isdigit(resume_file[0]) && resume_wait) { - int partno; - while (!get_gendisk(swsusp_resume_device, &partno)) - msleep(10); - } - if (!swsusp_resume_device) { /* * Some device discovery might still be in progress; we need diff --git a/kernel/power/process.c b/kernel/power/process.c index 4b6a54da7e65..45b054b7b5ec 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,7 +146,7 @@ int freeze_processes(void) BUG_ON(in_atomic()); /* - * Now that the whole userspace is frozen we need to disbale + * Now that the whole userspace is frozen we need to disable * the OOM killer to disallow any further interference with * killable tasks. There is no guarantee oom victims will * ever reach a point they go away we have to wait with a timeout. diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d25749bce7cf..46b1804c1ddf 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -735,7 +735,7 @@ zone_found: */ /* - * If the zone we wish to scan is the the current zone and the + * If the zone we wish to scan is the current zone and the * pfn falls into the current node then we do not need to walk * the tree. */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 01e2858b5fe3..c73f2e295167 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -226,6 +226,7 @@ struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; blk_status_t error; + struct blk_plug plug; }; static void hib_init_batch(struct hib_bio_batch *hb) @@ -233,6 +234,12 @@ static void hib_init_batch(struct hib_bio_batch *hb) atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); hb->error = BLK_STS_OK; + blk_start_plug(&hb->plug); +} + +static void hib_finish_batch(struct hib_bio_batch *hb) +{ + blk_finish_plug(&hb->plug); } static void hib_end_io(struct bio *bio) @@ -294,6 +301,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { + /* + * We are relying on the behavior of blk_plug that a thread with + * a plug will flush the plug list before sleeping. + */ wait_event(hb->wait, atomic_read(&hb->count) == 0); return blk_status_to_errno(hb->error); } @@ -335,26 +346,23 @@ static int swsusp_swap_check(void) { int res; - res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &hib_resume_bdev); + if (swsusp_resume_device) + res = swap_type_of(swsusp_resume_device, swsusp_resume_block); + else + res = find_first_swap(&swsusp_resume_device); if (res < 0) return res; - root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); - if (res) - return res; + + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, + NULL); + if (IS_ERR(hib_resume_bdev)) + return PTR_ERR(hib_resume_bdev); res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) blkdev_put(hib_resume_bdev, FMODE_WRITE); - /* - * Update the resume device to the one actually used, - * so the test_resume mode can use it in case it is - * invoked from hibernate() to test the snapshot. - */ - swsusp_resume_device = hib_resume_bdev->bd_dev; return res; } @@ -561,6 +569,7 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -854,6 +863,7 @@ out_finish: pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: + hib_finish_batch(&hb); if (crc) { if (crc->thr) kthread_stop(crc->thr); @@ -1084,6 +1094,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_io(&hb); + hib_finish_batch(&hb); stop = ktime_get(); if (!ret) ret = err2; @@ -1447,6 +1458,7 @@ out_finish: } swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: + hib_finish_batch(&hb); for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); if (crc) { diff --git a/kernel/power/user.c b/kernel/power/user.c index 047f598f89a5..740723bb3885 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -35,12 +35,12 @@ static struct snapshot_data { bool ready; bool platform_support; bool free_bitmaps; - struct inode *bd_inode; + dev_t dev; } snapshot_state; -int is_hibernate_resume_dev(const struct inode *bd_inode) +int is_hibernate_resume_dev(dev_t dev) { - return hibernation_available() && snapshot_state.bd_inode == bd_inode; + return hibernation_available() && snapshot_state.dev == dev; } static int snapshot_open(struct inode *inode, struct file *filp) @@ -69,8 +69,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ - data->swap = swsusp_resume_device ? - swap_type_of(swsusp_resume_device, 0, NULL) : -1; + data->swap = swap_type_of(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); @@ -95,7 +94,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->frozen = false; data->ready = false; data->platform_support = false; - data->bd_inode = NULL; + data->dev = 0; Unlock: unlock_system_sleep(); @@ -111,7 +110,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) swsusp_free(); data = filp->private_data; - data->bd_inode = NULL; + data->dev = 0; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); @@ -204,7 +203,6 @@ struct compat_resume_swap_area { static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { - struct block_device *bdev; sector_t offset; dev_t swdev; @@ -231,16 +229,10 @@ static int snapshot_set_swap_area(struct snapshot_data *data, * User space encodes device types as two-byte values, * so we need to recode them */ - if (!swdev) { - data->swap = -1; - return -EINVAL; - } - data->swap = swap_type_of(swdev, offset, &bdev); + data->swap = swap_type_of(swdev, offset); if (data->swap < 0) - return -ENODEV; - - data->bd_inode = bdev->bd_inode; - bdput(bdev); + return swdev ? -ENODEV : -EINVAL; + data->dev = swdev; return 0; } diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4d052fc6bcde..eee3dc9b60a9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,3 +2,4 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o +obj-$(CONFIG_PRINTK) += printk_ringbuffer.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 660f9a6bf73a..3a8fd491758c 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -14,9 +14,9 @@ extern raw_spinlock_t logbuf_lock; -__printf(5, 0) +__printf(4, 0) int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9b75f6bfc333..fe64a49344bf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/printk.h> +#include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" @@ -294,30 +295,22 @@ enum con_msg_format_flags { static int console_msg_format = MSG_FORMAT_DEFAULT; /* - * The printk log buffer consists of a chain of concatenated variable - * length records. Every record starts with a record header, containing - * the overall length of the record. + * The printk log buffer consists of a sequenced collection of records, each + * containing variable length message text. Every record also contains its + * own meta-data (@info). * - * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these entries are maintained when messages are - * stored. + * Every record meta-data carries the timestamp in microseconds, as well as + * the standard userspace syslog level and syslog facility. The usual kernel + * messages use LOG_KERN; userspace-injected messages always carry a matching + * syslog facility, by default LOG_USER. The origin of every message can be + * reliably determined that way. * - * If the heads indicate available messages, the length in the header - * tells the start next message. A length == 0 for the next message - * indicates a wrap-around to the beginning of the buffer. + * The human readable log message of a record is available in @text, the + * length of the message text in @text_len. The stored message is not + * terminated. * - * Every record carries the monotonic timestamp in microseconds, as well as - * the standard userspace syslog level and syslog facility. The usual - * kernel messages use LOG_KERN; userspace-injected messages always carry - * a matching syslog facility, by default LOG_USER. The origin of every - * message can be reliably determined that way. - * - * The human readable log message directly follows the message header. The - * length of the message text is stored in the header, the stored message - * is not terminated. - * - * Optionally, a message can carry a dictionary of properties (key/value pairs), - * to provide userspace with a machine-readable message context. + * Optionally, a record can carry a dictionary of properties (key/value + * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier @@ -327,25 +320,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * - * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value - * follows directly after a '=' character. Every property is terminated by - * a '\0' character. The last property is not terminated. - * - * Example of a message structure: - * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec - * 0008 34 00 record is 52 bytes long - * 000a 0b 00 text is 11 bytes long - * 000c 1f 00 dictionary is 23 bytes long - * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) - * 0010 69 74 27 73 20 61 20 6c "it's a l" - * 69 6e 65 "ine" - * 001b 44 45 56 49 43 "DEVIC" - * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" - * 52 49 56 45 52 3d 62 75 "RIVER=bu" - * 67 "g" - * 0032 00 00 00 padding to next message header - * - * The 'struct printk_log' buffer header must never be directly exported to + * Valid characters in property names are [a-zA-Z0-9.-_]. Property names + * and values are terminated by a '\0' character. + * + * Example of record values: + * record.text_buf = "it's a line" (unterminated) + * record.info.seq = 56 + * record.info.ts_nsec = 36863 + * record.info.text_len = 11 + * record.info.facility = 0 (LOG_KERN) + * record.info.flags = 0 + * record.info.level = 3 (LOG_ERR) + * record.info.caller_id = 299 (task 299) + * record.info.dev_info.subsystem = "pci" (terminated) + * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) + * + * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * @@ -365,23 +355,6 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -struct printk_log { - u64 ts_nsec; /* timestamp in nanoseconds */ - u16 len; /* length of entire record */ - u16 text_len; /* length of text buffer */ - u16 dict_len; /* length of dictionary buffer */ - u8 facility; /* syslog facility */ - u8 flags:5; /* internal record flags */ - u8 level:3; /* syslog level */ -#ifdef CONFIG_PRINTK_CALLER - u32 caller_id; /* thread id or processor id */ -#endif -} -#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -__packed __aligned(4) -#endif -; - /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken * within the scheduler's rq lock. It must be released before calling @@ -421,26 +394,16 @@ DEFINE_RAW_SPINLOCK(logbuf_lock); DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; -static u32 syslog_idx; static size_t syslog_partial; static bool syslog_time; -/* index and sequence number of the first record stored in the buffer */ -static u64 log_first_seq; -static u32 log_first_idx; - -/* index and sequence number of the next record to store in the buffer */ -static u64 log_next_seq; -static u32 log_next_idx; - /* the next printk record to write to the console */ static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; +static unsigned long console_dropped; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; -static u32 clear_idx; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 @@ -453,7 +416,7 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#define LOG_ALIGN __alignof__(struct printk_log) +#define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); @@ -461,6 +424,23 @@ static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* + * Define the average message size. This only affects the number of + * descriptors that will be available. Underestimating is better than + * overestimating (too many available descriptors is better than not enough). + */ +#define PRB_AVGBITS 5 /* 32 character average length */ + +#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS +#error CONFIG_LOG_BUF_SHIFT value too small. +#endif +_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, + PRB_AVGBITS, &__log_buf[0]); + +static struct printk_ringbuffer printk_rb_dynamic; + +static struct printk_ringbuffer *prb = &printk_rb_static; + +/* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. @@ -484,108 +464,6 @@ u32 log_buf_len_get(void) return log_buf_len; } -/* human readable text of the record */ -static char *log_text(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log); -} - -/* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct printk_log *msg) -{ - return (char *)msg + sizeof(struct printk_log) + msg->text_len; -} - -/* get record by index; idx must point to valid msg */ -static struct printk_log *log_from_idx(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer. - */ - if (!msg->len) - return (struct printk_log *)log_buf; - return msg; -} - -/* get next record; idx must point to valid msg */ -static u32 log_next(u32 idx) -{ - struct printk_log *msg = (struct printk_log *)(log_buf + idx); - - /* length == 0 indicates the end of the buffer; wrap */ - /* - * A length == 0 record is the end of buffer marker. Wrap around and - * read the message at the start of the buffer as *this* one, and - * return the one after that. - */ - if (!msg->len) { - msg = (struct printk_log *)log_buf; - return msg->len; - } - return idx + msg->len; -} - -/* - * Check whether there is enough free space for the given message. - * - * The same values of first_idx and next_idx mean that the buffer - * is either empty or full. - * - * If the buffer is empty, we must respect the position of the indexes. - * They cannot be reset to the beginning of the buffer. - */ -static int logbuf_has_space(u32 msg_size, bool empty) -{ - u32 free; - - if (log_next_idx > log_first_idx || empty) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - /* - * We need space also for an empty header that signalizes wrapping - * of the buffer. - */ - return free >= msg_size + sizeof(struct printk_log); -} - -static int log_make_free_space(u32 msg_size) -{ - while (log_first_seq < log_next_seq && - !logbuf_has_space(msg_size, false)) { - /* drop old messages until we have enough contiguous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } - - if (clear_seq < log_first_seq) { - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - - /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) - return 0; - - return -ENOMEM; -} - -/* compute the message size including the padding bytes */ -static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) -{ - u32 size; - - size = sizeof(struct printk_log) + text_len + dict_len; - *pad_len = (-size) & (LOG_ALIGN - 1); - size += *pad_len; - - return size; -} - /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available @@ -594,84 +472,69 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; -static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, - u16 *dict_len, u32 *pad_len) +static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) *text_len = max_text_len; - /* enable the warning message */ + + /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); - /* disable the "dict" completely */ - *dict_len = 0; - /* compute the size again, count also the warning message */ - return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); + if (*text_len >= *trunc_msg_len) + *text_len -= *trunc_msg_len; + else + *trunc_msg_len = 0; } /* insert record into the buffer, discard old ones, update heads */ static int log_store(u32 caller_id, int facility, int level, enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, + const struct dev_printk_info *dev_info, const char *text, u16 text_len) { - struct printk_log *msg; - u32 size, pad_len; + struct prb_reserved_entry e; + struct printk_record r; u16 trunc_msg_len = 0; - /* number of '\0' padding bytes to next message */ - size = msg_used_size(text_len, dict_len, &pad_len); + prb_rec_init_wr(&r, text_len); - if (log_make_free_space(size)) { + if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ - size = truncate_msg(&text_len, &trunc_msg_len, - &dict_len, &pad_len); + truncate_msg(&text_len, &trunc_msg_len); + prb_rec_init_wr(&r, text_len + trunc_msg_len); /* survive when the log buffer is too small for trunc_msg */ - if (log_make_free_space(size)) + if (!prb_reserve(&e, prb, &r)) return 0; } - if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { - /* - * This message + an additional empty header does not fit - * at the end of the buffer. Add an empty header with len == 0 - * to signify a wrap around. - */ - memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); - log_next_idx = 0; - } - /* fill message */ - msg = (struct printk_log *)(log_buf + log_next_idx); - memcpy(log_text(msg), text, text_len); - msg->text_len = text_len; - if (trunc_msg_len) { - memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); - msg->text_len += trunc_msg_len; - } - memcpy(log_dict(msg), dict, dict_len); - msg->dict_len = dict_len; - msg->facility = facility; - msg->level = level & 7; - msg->flags = flags & 0x1f; + memcpy(&r.text_buf[0], text, text_len); + if (trunc_msg_len) + memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); + r.info->text_len = text_len + trunc_msg_len; + r.info->facility = facility; + r.info->level = level & 7; + r.info->flags = flags & 0x1f; if (ts_nsec > 0) - msg->ts_nsec = ts_nsec; + r.info->ts_nsec = ts_nsec; else - msg->ts_nsec = local_clock(); -#ifdef CONFIG_PRINTK_CALLER - msg->caller_id = caller_id; -#endif - memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = size; + r.info->ts_nsec = local_clock(); + r.info->caller_id = caller_id; + if (dev_info) + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* insert message */ - log_next_idx += msg->len; - log_next_seq++; + if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + prb_commit(&e); + else + prb_final_commit(&e); - return msg->text_len; + return (text_len + trunc_msg_len); } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); @@ -723,13 +586,13 @@ static void append_char(char **pp, char *e, char c) *(*pp)++ = c; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq) +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) { - u64 ts_usec = msg->ts_nsec; + u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER - u32 id = msg->caller_id; + u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); @@ -740,13 +603,13 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-', caller); + (info->facility << 3) | info->level, info->seq, + ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) +static ssize_t msg_add_ext_text(char *buf, size_t size, + const char *text, size_t text_len, + unsigned char endc) { char *p = buf, *e = buf + size; size_t i; @@ -760,45 +623,56 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, else append_char(&p, e, c); } - append_char(&p, e, '\n'); + append_char(&p, e, endc); - if (dict_len) { - bool line = true; + return p - buf; +} - for (i = 0; i < dict_len; i++) { - unsigned char c = dict[i]; +static ssize_t msg_add_dict_text(char *buf, size_t size, + const char *key, const char *val) +{ + size_t val_len = strlen(val); + ssize_t len; - if (line) { - append_char(&p, e, ' '); - line = false; - } + if (!val_len) + return 0; - if (c == '\0') { - append_char(&p, e, '\n'); - line = true; - continue; - } + len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ + len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); + len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); - if (c < ' ' || c >= 127 || c == '\\') { - p += scnprintf(p, e - p, "\\x%02x", c); - continue; - } + return len; +} - append_char(&p, e, c); - } - append_char(&p, e, '\n'); - } +static ssize_t msg_print_ext_body(char *buf, size_t size, + char *text, size_t text_len, + struct dev_printk_info *dev_info) +{ + ssize_t len; - return p - buf; + len = msg_add_ext_text(buf, size, text, text_len, '\n'); + + if (!dev_info) + goto out; + + len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", + dev_info->subsystem); + len += msg_add_dict_text(buf + len, size - len, "DEVICE", + dev_info->device); +out: + return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; - u32 idx; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; + + struct printk_info info; + char text_buf[CONSOLE_EXT_LOG_MAX]; + struct printk_record record; }; static __printf(3, 4) __cold @@ -808,7 +682,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...) int r; va_start(args, fmt); - r = vprintk_emit(facility, level, NULL, 0, fmt, args); + r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; @@ -881,7 +755,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; - struct printk_log *msg; + struct printk_record *r = &user->record; size_t len; ssize_t ret; @@ -893,7 +767,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, return ret; logbuf_lock_irq(); - while (user->seq == log_next_seq) { + if (!prb_read_valid(prb, user->seq, r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; logbuf_unlock_irq(); @@ -902,30 +776,26 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, - user->seq != log_next_seq); + prb_read_valid(prb, user->seq, r)); if (ret) goto out; logbuf_lock_irq(); } - if (user->seq < log_first_seq) { + if (user->seq < prb_first_valid_seq(prb)) { /* our last seen message is gone, return error and reset */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); ret = -EPIPE; logbuf_unlock_irq(); goto out; } - msg = log_from_idx(user->idx); - len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq); + len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r->text_buf[0], r->info->text_len, + &r->info->dev_info); - user->idx = log_next(user->idx); - user->seq++; + user->seq = r->info->seq + 1; logbuf_unlock_irq(); if (len > count) { @@ -965,8 +835,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_SET: /* the first record */ - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); break; case SEEK_DATA: /* @@ -974,13 +843,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ - user->idx = clear_idx; user->seq = clear_seq; break; case SEEK_END: /* after the last record */ - user->idx = log_next_idx; - user->seq = log_next_seq; + user->seq = prb_next_seq(prb); break; default: ret = -EINVAL; @@ -1000,9 +867,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); logbuf_lock_irq(); - if (user->seq < log_next_seq) { + if (prb_read_valid(prb, user->seq, NULL)) { /* return error when data has vanished underneath us */ - if (user->seq < log_first_seq) + if (user->seq < prb_first_valid_seq(prb)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; @@ -1037,9 +904,11 @@ static int devkmsg_open(struct inode *inode, struct file *file) mutex_init(&user->lock); + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf)); + logbuf_lock_irq(); - user->idx = log_first_idx; - user->seq = log_first_seq; + user->seq = prb_first_valid_seq(prb); logbuf_unlock_irq(); file->private_data = user; @@ -1080,23 +949,58 @@ const struct file_operations kmsg_fops = { */ void log_buf_vmcoreinfo_setup(void) { - VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(log_first_idx); - VMCOREINFO_SYMBOL(clear_idx); - VMCOREINFO_SYMBOL(log_next_idx); + struct dev_printk_info *dev_info = NULL; + + VMCOREINFO_SYMBOL(prb); + VMCOREINFO_SYMBOL(printk_rb_static); + VMCOREINFO_SYMBOL(clear_seq); + /* - * Export struct printk_log size and field offsets. User space tools can + * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ - VMCOREINFO_STRUCT_SIZE(printk_log); - VMCOREINFO_OFFSET(printk_log, ts_nsec); - VMCOREINFO_OFFSET(printk_log, len); - VMCOREINFO_OFFSET(printk_log, text_len); - VMCOREINFO_OFFSET(printk_log, dict_len); -#ifdef CONFIG_PRINTK_CALLER - VMCOREINFO_OFFSET(printk_log, caller_id); -#endif + + VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); + VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); + VMCOREINFO_OFFSET(printk_ringbuffer, fail); + + VMCOREINFO_STRUCT_SIZE(prb_desc_ring); + VMCOREINFO_OFFSET(prb_desc_ring, count_bits); + VMCOREINFO_OFFSET(prb_desc_ring, descs); + VMCOREINFO_OFFSET(prb_desc_ring, infos); + VMCOREINFO_OFFSET(prb_desc_ring, head_id); + VMCOREINFO_OFFSET(prb_desc_ring, tail_id); + + VMCOREINFO_STRUCT_SIZE(prb_desc); + VMCOREINFO_OFFSET(prb_desc, state_var); + VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); + + VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); + VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); + VMCOREINFO_OFFSET(prb_data_blk_lpos, next); + + VMCOREINFO_STRUCT_SIZE(printk_info); + VMCOREINFO_OFFSET(printk_info, seq); + VMCOREINFO_OFFSET(printk_info, ts_nsec); + VMCOREINFO_OFFSET(printk_info, text_len); + VMCOREINFO_OFFSET(printk_info, caller_id); + VMCOREINFO_OFFSET(printk_info, dev_info); + + VMCOREINFO_STRUCT_SIZE(dev_printk_info); + VMCOREINFO_OFFSET(dev_printk_info, subsystem); + VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); + VMCOREINFO_OFFSET(dev_printk_info, device); + VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); + + VMCOREINFO_STRUCT_SIZE(prb_data_ring); + VMCOREINFO_OFFSET(prb_data_ring, size_bits); + VMCOREINFO_OFFSET(prb_data_ring, data); + VMCOREINFO_OFFSET(prb_data_ring, head_lpos); + VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); + + VMCOREINFO_SIZE(atomic_long_t); + VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); } #endif @@ -1174,11 +1078,46 @@ static void __init set_percpu_data_ready(void) __printk_percpu_data_ready = true; } +static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_reserved_entry e; + struct printk_record dest_r; + + prb_rec_init_wr(&dest_r, r->info->text_len); + + if (!prb_reserve(&e, rb, &dest_r)) + return 0; + + memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); + dest_r.info->text_len = r->info->text_len; + dest_r.info->facility = r->info->facility; + dest_r.info->level = r->info->level; + dest_r.info->flags = r->info->flags; + dest_r.info->ts_nsec = r->info->ts_nsec; + dest_r.info->caller_id = r->info->caller_id; + memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); + + prb_final_commit(&e); + + return prb_record_text_space(&e); +} + +static char setup_text_buf[LOG_LINE_MAX] __initdata; + void __init setup_log_buf(int early) { + struct printk_info *new_infos; + unsigned int new_descs_count; + struct prb_desc *new_descs; + struct printk_info info; + struct printk_record r; + size_t new_descs_size; + size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; + u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very @@ -1197,24 +1136,75 @@ void __init setup_log_buf(int early) if (!new_log_buf_len) return; + new_descs_count = new_log_buf_len >> PRB_AVGBITS; + if (new_descs_count == 0) { + pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); + return; + } + new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %lu bytes not available\n", - new_log_buf_len); + pr_err("log_buf_len: %lu text bytes not available\n", + new_log_buf_len); return; } + new_descs_size = new_descs_count * sizeof(struct prb_desc); + new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); + if (unlikely(!new_descs)) { + pr_err("log_buf_len: %zu desc bytes not available\n", + new_descs_size); + goto err_free_log_buf; + } + + new_infos_size = new_descs_count * sizeof(struct printk_info); + new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); + if (unlikely(!new_infos)) { + pr_err("log_buf_len: %zu info bytes not available\n", + new_infos_size); + goto err_free_descs; + } + + prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); + + prb_init(&printk_rb_dynamic, + new_log_buf, ilog2(new_log_buf_len), + new_descs, ilog2(new_descs_count), + new_infos); + logbuf_lock_irqsave(flags); + log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_next_idx; - memcpy(log_buf, __log_buf, __LOG_BUF_LEN); + + free = __LOG_BUF_LEN; + prb_for_each_record(0, &printk_rb_static, seq, &r) + free -= add_to_rb(&printk_rb_dynamic, &r); + + /* + * This is early enough that everything is still running on the + * boot CPU and interrupts are disabled. So no new messages will + * appear during the transition to the dynamic buffer. + */ + prb = &printk_rb_dynamic; + logbuf_unlock_irqrestore(flags); + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", + prb_next_seq(&printk_rb_static) - seq); + } + pr_info("log_buf_len: %u bytes\n", log_buf_len); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); + return; + +err_free_descs: + memblock_free(__pa(new_descs), new_descs_size); +err_free_log_buf: + memblock_free(__pa(new_log_buf), new_log_buf_len); } static bool __read_mostly ignore_loglevel; @@ -1321,18 +1311,18 @@ static size_t print_caller(u32 id, char *buf) #define print_caller(id, buf) 0 #endif -static size_t print_prefix(const struct printk_log *msg, bool syslog, - bool time, char *buf) +static size_t info_print_prefix(const struct printk_info *info, bool syslog, + bool time, char *buf) { size_t len = 0; if (syslog) - len = print_syslog((msg->facility << 3) | msg->level, buf); + len = print_syslog((info->facility << 3) | info->level, buf); if (time) - len += print_time(msg->ts_nsec, buf + len); + len += print_time(info->ts_nsec, buf + len); - len += print_caller(msg->caller_id, buf + len); + len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; @@ -1342,72 +1332,150 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, return len; } -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) +/* + * Prepare the record for printing. The text is shifted within the given + * buffer to avoid a need for another one. The following operations are + * done: + * + * - Add prefix for each line. + * - Add the trailing newline that has been removed in vprintk_store(). + * - Drop truncated lines that do not longer fit into the buffer. + * + * Return: The length of the updated/prepared text, including the added + * prefixes and the newline. The dropped line(s) are not counted. + */ +static size_t record_print_text(struct printk_record *r, bool syslog, + bool time) { - const char *text = log_text(msg); - size_t text_size = msg->text_len; - size_t len = 0; + size_t text_len = r->info->text_len; + size_t buf_size = r->text_buf_size; + char *text = r->text_buf; char prefix[PREFIX_MAX]; - const size_t prefix_len = print_prefix(msg, syslog, time, prefix); + bool truncated = false; + size_t prefix_len; + size_t line_len; + size_t len = 0; + char *next; + + /* + * If the message was truncated because the buffer was not large + * enough, treat the available text as if it were the full text. + */ + if (text_len > buf_size) + text_len = buf_size; - do { - const char *next = memchr(text, '\n', text_size); - size_t text_len; + prefix_len = info_print_prefix(r->info, syslog, time, prefix); + /* + * @text_len: bytes of unprocessed text + * @line_len: bytes of current line _without_ newline + * @text: pointer to beginning of current line + * @len: number of bytes prepared in r->text_buf + */ + for (;;) { + next = memchr(text, '\n', text_len); if (next) { - text_len = next - text; - next++; - text_size -= next - text; + line_len = next - text; } else { - text_len = text_size; + /* Drop truncated line(s). */ + if (truncated) + break; + line_len = text_len; } - if (buf) { - if (prefix_len + text_len + 1 >= size - len) + /* + * Truncate the text if there is not enough space to add the + * prefix and a trailing newline. + */ + if (len + prefix_len + text_len + 1 > buf_size) { + /* Drop even the current line if no space. */ + if (len + prefix_len + line_len + 1 > buf_size) break; - memcpy(buf + len, prefix, prefix_len); - len += prefix_len; - memcpy(buf + len, text, text_len); - len += text_len; - buf[len++] = '\n'; - } else { - /* SYSLOG_ACTION_* buffer size only calculation */ - len += prefix_len + text_len + 1; + text_len = buf_size - len - prefix_len - 1; + truncated = true; } - text = next; - } while (text); + memmove(text + prefix_len, text, text_len); + memcpy(text, prefix, prefix_len); + + len += prefix_len + line_len + 1; + + if (text_len == line_len) { + /* + * Add the trailing newline removed in + * vprintk_store(). + */ + text[prefix_len + line_len] = '\n'; + break; + } + + /* + * Advance beyond the added prefix and the related line with + * its newline. + */ + text += prefix_len + line_len + 1; + + /* + * The remaining text has only decreased by the line with its + * newline. + * + * Note that @text_len can become zero. It happens when @text + * ended with a newline (either due to truncation or the + * original string ending with "\n\n"). The loop is correctly + * repeated and (if not truncated) an empty line with a prefix + * will be prepared. + */ + text_len -= line_len + 1; + } return len; } +static size_t get_record_print_text_size(struct printk_info *info, + unsigned int line_count, + bool syslog, bool time) +{ + char prefix[PREFIX_MAX]; + size_t prefix_len; + + prefix_len = info_print_prefix(info, syslog, time, prefix); + + /* + * Each line will be preceded with a prefix. The intermediate + * newlines are already within the text, but a final trailing + * newline will be added. + */ + return ((prefix_len * line_count) + info->text_len + 1); +} + static int syslog_print(char __user *buf, int size) { + struct printk_info info; + struct printk_record r; char *text; - struct printk_log *msg; int len = 0; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + while (size > 0) { size_t n; size_t skip; logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - syslog_partial = 0; - } - if (syslog_seq == log_next_seq) { + if (!prb_read_valid(prb, syslog_seq, &r)) { logbuf_unlock_irq(); break; } + if (r.info->seq != syslog_seq) { + /* message is gone, move to next valid one */ + syslog_seq = r.info->seq; + syslog_partial = 0; + } /* * To keep reading/counting partial line consistent, @@ -1417,13 +1485,10 @@ static int syslog_print(char __user *buf, int size) syslog_time = printk_time; skip = syslog_partial; - msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, true, syslog_time, text, - LOG_LINE_MAX + PREFIX_MAX); + n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ - syslog_idx = log_next(syslog_idx); - syslog_seq++; + syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1454,11 +1519,12 @@ static int syslog_print(char __user *buf, int size) static int syslog_print_all(char __user *buf, int size, bool clear) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; char *text; int len = 0; - u64 next_seq; u64 seq; - u32 idx; bool time; text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); @@ -1471,38 +1537,28 @@ static int syslog_print_all(char __user *buf, int size, bool clear) * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ - seq = clear_seq; - idx = clear_idx; - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; - } + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) + len += get_record_print_text_size(&info, line_count, true, time); /* move first record forward until length fits into the buffer */ - seq = clear_seq; - idx = clear_idx; - while (len > size && seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - len -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { + if (len <= size) + break; + len -= get_record_print_text_size(&info, line_count, true, time); } - /* last message fitting into this dump */ - next_seq = log_next_seq; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); len = 0; - while (len >= 0 && seq < next_seq) { - struct printk_log *msg = log_from_idx(idx); - int textlen = msg_print_text(msg, true, time, text, - LOG_LINE_MAX + PREFIX_MAX); + prb_for_each_record(seq, prb, seq, &r) { + int textlen; - idx = log_next(idx); - seq++; + textlen = record_print_text(&r, true, time); + + if (len + textlen > size) { + seq--; + break; + } logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) @@ -1511,17 +1567,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) len += textlen; logbuf_lock_irq(); - if (seq < log_first_seq) { - /* messages are gone, move to next one */ - seq = log_first_seq; - idx = log_first_idx; - } + if (len < 0) + break; } - if (clear) { - clear_seq = log_next_seq; - clear_idx = log_next_idx; - } + if (clear) + clear_seq = seq; logbuf_unlock_irq(); kfree(text); @@ -1531,8 +1582,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { logbuf_lock_irq(); - clear_seq = log_next_seq; - clear_idx = log_next_idx; + clear_seq = prb_next_seq(prb); logbuf_unlock_irq(); } @@ -1559,7 +1609,7 @@ int do_syslog(int type, char __user *buf, int len, int source) if (!access_ok(buf, len)) return -EFAULT; error = wait_event_interruptible(log_wait, - syslog_seq != log_next_seq); + prb_read_valid(prb, syslog_seq, NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1567,7 +1617,7 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; - /* FALL THRU */ + fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) @@ -1608,10 +1658,9 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: logbuf_lock_irq(); - if (syslog_seq < log_first_seq) { + if (syslog_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; + syslog_seq = prb_first_valid_seq(prb); syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1620,20 +1669,18 @@ int do_syslog(int type, char __user *buf, int len, int source) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_seq - syslog_seq; + error = prb_next_seq(prb) - syslog_seq; } else { - u64 seq = syslog_seq; - u32 idx = syslog_idx; bool time = syslog_partial ? syslog_time : printk_time; - - while (seq < log_next_seq) { - struct printk_log *msg = log_from_idx(idx); - - error += msg_print_text(msg, true, time, NULL, - 0); + struct printk_info info; + unsigned int line_count; + u64 seq; + + prb_for_each_info(syslog_seq, prb, seq, &info, + &line_count) { + error += get_record_print_text_size(&info, line_count, + true, time); time = printk_time; - idx = log_next(idx); - seq++; } error -= syslog_partial; } @@ -1804,10 +1851,22 @@ static int console_trylock_spinning(void) static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) { + static char dropped_text[64]; + size_t dropped_len = 0; struct console *con; trace_console_rcuidle(text, len); + if (!console_drivers) + return; + + if (console_dropped) { + dropped_len = snprintf(dropped_text, sizeof(dropped_text), + "** %lu printk messages dropped **\n", + console_dropped); + console_dropped = 0; + } + for_each_console(con) { if (exclusive_console && con != exclusive_console) continue; @@ -1820,8 +1879,11 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, continue; if (con->flags & CON_EXTENDED) con->write(con, ext_text, ext_len); - else + else { + if (dropped_len) + con->write(con, dropped_text, dropped_len); con->write(con, text, len); + } } } @@ -1845,97 +1907,38 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -/* - * Continuation lines are buffered, and not committed to the record buffer - * until the line is complete, or a race forces it. The line fragments - * though, are printed immediately to the consoles to ensure everything has - * reached the console in case of a kernel crash. - */ -static struct cont { - char buf[LOG_LINE_MAX]; - size_t len; /* length == 0 means unused buffer */ - u32 caller_id; /* printk_caller_id() of first print */ - u64 ts_nsec; /* time of first print */ - u8 level; /* log level of first message */ - u8 facility; /* log facility of first message */ - enum log_flags flags; /* prefix, newline flags */ -} cont; - -static void cont_flush(void) -{ - if (cont.len == 0) - return; - - log_store(cont.caller_id, cont.facility, cont.level, cont.flags, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.len = 0; -} - -static bool cont_add(u32 caller_id, int facility, int level, - enum log_flags flags, const char *text, size_t len) -{ - /* If the line gets too long, split it up in separate records. */ - if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); - return false; - } - - if (!cont.len) { - cont.facility = facility; - cont.level = level; - cont.caller_id = caller_id; - cont.ts_nsec = local_clock(); - cont.flags = flags; - } - - memcpy(cont.buf + cont.len, text, len); - cont.len += len; - - // The original flags come from the first line, - // but later continuations can add a newline. - if (flags & LOG_NEWLINE) { - cont.flags |= LOG_NEWLINE; - cont_flush(); - } - - return true; -} - -static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +static size_t log_output(int facility, int level, enum log_flags lflags, + const struct dev_printk_info *dev_info, + char *text, size_t text_len) { const u32 caller_id = printk_caller_id(); - /* - * If an earlier line was buffered, and we're a continuation - * write from the same context, try to add it to the buffer. - */ - if (cont.len) { - if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) - return text_len; - } - /* Otherwise, make sure it's flushed */ - cont_flush(); - } - - /* Skip empty continuation lines that couldn't be added - they just flush */ - if (!text_len && (lflags & LOG_CONT)) - return 0; - - /* If it doesn't end in a newline, try to buffer the current line */ - if (!(lflags & LOG_NEWLINE)) { - if (cont_add(caller_id, facility, level, lflags, text, text_len)) + if (lflags & LOG_CONT) { + struct prb_reserved_entry e; + struct printk_record r; + + prb_rec_init_wr(&r, text_len); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { + memcpy(&r.text_buf[r.info->text_len], text, text_len); + r.info->text_len += text_len; + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); + } else { + prb_commit(&e); + } return text_len; + } } /* Store it in the record log */ return log_store(caller_id, facility, level, lflags, 0, - dict, dictlen, text, text_len); + dev_info, text, text_len); } /* Must be called under logbuf_lock. */ int vprintk_store(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { static char textbuf[LOG_LINE_MAX]; @@ -1977,21 +1980,19 @@ int vprintk_store(int facility, int level, if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; - if (dict) + if (dev_info) lflags |= LOG_NEWLINE; - return log_output(facility, level, lflags, - dict, dictlen, text, text_len); + return log_output(facility, level, lflags, dev_info, text, text_len); } asmlinkage int vprintk_emit(int facility, int level, - const char *dict, size_t dictlen, + const struct dev_printk_info *dev_info, const char *fmt, va_list args) { int printed_len; - bool in_sched = false, pending_output; + bool in_sched = false; unsigned long flags; - u64 curr_log_seq; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -2007,13 +2008,11 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); - curr_log_seq = log_next_seq; - printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); - pending_output = (curr_log_seq != log_next_seq); + printed_len = vprintk_store(facility, level, dev_info, fmt, args); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ - if (!in_sched && pending_output) { + if (!in_sched) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to @@ -2030,8 +2029,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - if (pending_output) - wake_up_klogd(); + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2044,7 +2042,7 @@ EXPORT_SYMBOL(vprintk); int vprintk_default(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); @@ -2088,30 +2086,31 @@ EXPORT_SYMBOL(printk); #define PREFIX_MAX 0 #define printk_time false +#define prb_read_valid(rb, seq, r) false +#define prb_first_valid_seq(rb) 0 + static u64 syslog_seq; -static u32 syslog_idx; static u64 console_seq; -static u32 console_idx; static u64 exclusive_console_stop_seq; -static u64 log_first_seq; -static u32 log_first_idx; -static u64 log_next_seq; -static char *log_text(const struct printk_log *msg) { return NULL; } -static char *log_dict(const struct printk_log *msg) { return NULL; } -static struct printk_log *log_from_idx(u32 idx) { return NULL; } -static u32 log_next(u32 idx) { return 0; } -static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, - u64 seq) { return 0; } +static unsigned long console_dropped; + +static size_t record_print_text(const struct printk_record *r, + bool syslog, bool time) +{ + return 0; +} +static ssize_t info_print_ext_header(char *buf, size_t size, + struct printk_info *info) +{ + return 0; +} static ssize_t msg_print_ext_body(char *buf, size_t size, - char *dict, size_t dict_len, - char *text, size_t text_len) { return 0; } + char *text, size_t text_len, + struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, bool syslog, - bool time, char *buf, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } #endif /* CONFIG_PRINTK */ @@ -2398,12 +2397,16 @@ void console_unlock(void) static char text[LOG_LINE_MAX + PREFIX_MAX]; unsigned long flags; bool do_cond_resched, retry; + struct printk_info info; + struct printk_record r; if (console_suspended) { up_console_sem(); return; } + prb_rec_init_rd(&r, &info, text, sizeof(text)); + /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may @@ -2416,7 +2419,7 @@ void console_unlock(void) * * console_trylock() is not able to detect the preemptive * context reliably. Therefore the value must be stored before - * and cleared after the the "again" goto label. + * and cleared after the "again" goto label. */ do_cond_resched = console_may_schedule; again: @@ -2434,35 +2437,26 @@ again: } for (;;) { - struct printk_log *msg; size_t ext_len = 0; size_t len; printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); - if (console_seq < log_first_seq) { - len = snprintf(text, sizeof(text), - "** %llu printk messages dropped **\n", - log_first_seq - console_seq); - - /* messages are gone, move to first one */ - console_seq = log_first_seq; - console_idx = log_first_idx; - } else { - len = 0; - } skip: - if (console_seq == log_next_seq) + if (!prb_read_valid(prb, console_seq, &r)) break; - msg = log_from_idx(console_idx); - if (suppress_message_printing(msg->level)) { + if (console_seq != r.info->seq) { + console_dropped += r.info->seq - console_seq; + console_seq = r.info->seq; + } + + if (suppress_message_printing(r.info->level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and * record that has level above the console loglevel. */ - console_idx = log_next(console_idx); console_seq++; goto skip; } @@ -2473,19 +2467,23 @@ skip: exclusive_console = NULL; } - len += msg_print_text(msg, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time, text + len, sizeof(text) - len); + /* + * Handle extended console text first because later + * record_print_text() will modify the record buffer in-place. + */ if (nr_ext_console_drivers) { - ext_len = msg_print_ext_header(ext_text, + ext_len = info_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq); + r.info); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, - log_dict(msg), msg->dict_len, - log_text(msg), msg->text_len); + &r.text_buf[0], + r.info->text_len, + &r.info->dev_info); } - console_idx = log_next(console_idx); + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); console_seq++; raw_spin_unlock(&logbuf_lock); @@ -2525,7 +2523,7 @@ skip: * flush, no worries. */ raw_spin_lock(&logbuf_lock); - retry = console_seq != log_next_seq; + retry = prb_read_valid(prb, console_seq, NULL); raw_spin_unlock(&logbuf_lock); printk_safe_exit_irqrestore(flags); @@ -2594,8 +2592,7 @@ void console_flush_on_panic(enum con_flush_mode mode) unsigned long flags; logbuf_lock_irqsave(flags); - console_seq = log_first_seq; - console_idx = log_first_idx; + console_seq = prb_first_valid_seq(prb); logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -2838,7 +2835,6 @@ void register_console(struct console *newcon) exclusive_console = newcon; exclusive_console_stop_seq = console_seq; console_seq = syslog_seq; - console_idx = syslog_idx; logbuf_unlock_irqrestore(flags); } console_unlock(); @@ -3062,7 +3058,7 @@ int vprintk_deferred(const char *fmt, va_list args) { int r; - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); defer_console_output(); return r; @@ -3227,9 +3223,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) logbuf_lock_irqsave(flags); dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); logbuf_unlock_irqrestore(flags); /* invoke dumper which will iterate over records */ @@ -3263,28 +3257,33 @@ void kmsg_dump(enum kmsg_dump_reason reason) bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, char *line, size_t size, size_t *len) { - struct printk_log *msg; + struct printk_info info; + unsigned int line_count; + struct printk_record r; size_t l = 0; bool ret = false; + prb_rec_init_rd(&r, &info, line, size); + if (!dumper->active) goto out; - if (dumper->cur_seq < log_first_seq) { - /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; - } - - /* last entry */ - if (dumper->cur_seq >= log_next_seq) - goto out; + /* Read text or count text lines? */ + if (line) { + if (!prb_read_valid(prb, dumper->cur_seq, &r)) + goto out; + l = record_print_text(&r, syslog, printk_time); + } else { + if (!prb_read_valid_info(prb, dumper->cur_seq, + &info, &line_count)) { + goto out; + } + l = get_record_print_text_size(&info, line_count, syslog, + printk_time); - msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, syslog, printk_time, line, size); + } - dumper->cur_idx = log_next(dumper->cur_idx); - dumper->cur_seq++; + dumper->cur_seq = r.info->seq + 1; ret = true; out: if (len) @@ -3332,7 +3331,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * @len: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer - * with as many of the the *youngest* kmsg records that fit into it. + * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * @@ -3345,23 +3344,25 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, char *buf, size_t size, size_t *len) { + struct printk_info info; + unsigned int line_count; + struct printk_record r; unsigned long flags; u64 seq; - u32 idx; u64 next_seq; - u32 next_idx; size_t l = 0; bool ret = false; bool time = printk_time; - if (!dumper->active) + prb_rec_init_rd(&r, &info, buf, size); + + if (!dumper->active || !buf || !size) goto out; logbuf_lock_irqsave(flags); - if (dumper->cur_seq < log_first_seq) { + if (dumper->cur_seq < prb_first_valid_seq(prb)) { /* messages are gone, move to first available one */ - dumper->cur_seq = log_first_seq; - dumper->cur_idx = log_first_idx; + dumper->cur_seq = prb_first_valid_seq(prb); } /* last entry */ @@ -3372,41 +3373,41 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l += msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (prb_read_valid_info(prb, seq, &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l += get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; - idx = dumper->cur_idx; - while (l >= size && seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); - - l -= msg_print_text(msg, true, time, NULL, 0); - idx = log_next(idx); - seq++; + while (l >= size && prb_read_valid_info(prb, seq, + &info, &line_count)) { + if (r.info->seq >= dumper->next_seq) + break; + l -= get_record_print_text_size(&info, line_count, true, time); + seq = r.info->seq + 1; } /* last message in next interation */ next_seq = seq; - next_idx = idx; + /* actually read text into the buffer now */ l = 0; - while (seq < dumper->next_seq) { - struct printk_log *msg = log_from_idx(idx); + while (prb_read_valid(prb, seq, &r)) { + if (r.info->seq >= dumper->next_seq) + break; + + l += record_print_text(&r, syslog, time); + + /* adjust record to store to remaining buffer space */ + prb_rec_init_rd(&r, &info, buf + l, size - l); - l += msg_print_text(msg, syslog, time, buf + l, size - l); - idx = log_next(idx); - seq++; + seq = r.info->seq + 1; } dumper->next_seq = next_seq; - dumper->next_idx = next_idx; ret = true; logbuf_unlock_irqrestore(flags); out: @@ -3429,9 +3430,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) { dumper->cur_seq = clear_seq; - dumper->cur_idx = clear_idx; - dumper->next_seq = log_next_seq; - dumper->next_idx = log_next_idx; + dumper->next_seq = prb_next_seq(prb); } /** diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c new file mode 100644 index 000000000000..6b1525685277 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.c @@ -0,0 +1,2086 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/kernel.h> +#include <linux/irqflags.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/bug.h> +#include "printk_ringbuffer.h" + +/** + * DOC: printk_ringbuffer overview + * + * Data Structure + * -------------- + * The printk_ringbuffer is made up of 3 internal ringbuffers: + * + * desc_ring + * A ring of descriptors and their meta data (such as sequence number, + * timestamp, loglevel, etc.) as well as internal state information about + * the record and logical positions specifying where in the other + * ringbuffer the text strings are located. + * + * text_data_ring + * A ring of data blocks. A data block consists of an unsigned long + * integer (ID) that maps to a desc_ring index followed by the text + * string of the record. + * + * The internal state information of a descriptor is the key element to allow + * readers and writers to locklessly synchronize access to the data. + * + * Implementation + * -------------- + * + * Descriptor Ring + * ~~~~~~~~~~~~~~~ + * The descriptor ring is an array of descriptors. A descriptor contains + * essential meta data to track the data of a printk record using + * blk_lpos structs pointing to associated text data blocks (see + * "Data Rings" below). Each descriptor is assigned an ID that maps + * directly to index values of the descriptor array and has a state. The ID + * and the state are bitwise combined into a single descriptor field named + * @state_var, allowing ID and state to be synchronously and atomically + * updated. + * + * Descriptors have four states: + * + * reserved + * A writer is modifying the record. + * + * committed + * The record and all its data are written. A writer can reopen the + * descriptor (transitioning it back to reserved), but in the committed + * state the data is consistent. + * + * finalized + * The record and all its data are complete and available for reading. A + * writer cannot reopen the descriptor. + * + * reusable + * The record exists, but its text and/or meta data may no longer be + * available. + * + * Querying the @state_var of a record requires providing the ID of the + * descriptor to query. This can yield a possible fifth (pseudo) state: + * + * miss + * The descriptor being queried has an unexpected ID. + * + * The descriptor ring has a @tail_id that contains the ID of the oldest + * descriptor and @head_id that contains the ID of the newest descriptor. + * + * When a new descriptor should be created (and the ring is full), the tail + * descriptor is invalidated by first transitioning to the reusable state and + * then invalidating all tail data blocks up to and including the data blocks + * associated with the tail descriptor (for the text ring). Then + * @tail_id is advanced, followed by advancing @head_id. And finally the + * @state_var of the new descriptor is initialized to the new ID and reserved + * state. + * + * The @tail_id can only be advanced if the new @tail_id would be in the + * committed or reusable queried state. This makes it possible that a valid + * sequence number of the tail is always available. + * + * Descriptor Finalization + * ~~~~~~~~~~~~~~~~~~~~~~~ + * When a writer calls the commit function prb_commit(), record data is + * fully stored and is consistent within the ringbuffer. However, a writer can + * reopen that record, claiming exclusive access (as with prb_reserve()), and + * modify that record. When finished, the writer must again commit the record. + * + * In order for a record to be made available to readers (and also become + * recyclable for writers), it must be finalized. A finalized record cannot be + * reopened and can never become "unfinalized". Record finalization can occur + * in three different scenarios: + * + * 1) A writer can simultaneously commit and finalize its record by calling + * prb_final_commit() instead of prb_commit(). + * + * 2) When a new record is reserved and the previous record has been + * committed via prb_commit(), that previous record is automatically + * finalized. + * + * 3) When a record is committed via prb_commit() and a newer record + * already exists, the record being committed is automatically finalized. + * + * Data Ring + * ~~~~~~~~~ + * The text data ring is a byte array composed of data blocks. Data blocks are + * referenced by blk_lpos structs that point to the logical position of the + * beginning of a data block and the beginning of the next adjacent data + * block. Logical positions are mapped directly to index values of the byte + * array ringbuffer. + * + * Each data block consists of an ID followed by the writer data. The ID is + * the identifier of a descriptor that is associated with the data block. A + * given data block is considered valid if all of the following conditions + * are met: + * + * 1) The descriptor associated with the data block is in the committed + * or finalized queried state. + * + * 2) The blk_lpos struct within the descriptor associated with the data + * block references back to the same data block. + * + * 3) The data block is within the head/tail logical position range. + * + * If the writer data of a data block would extend beyond the end of the + * byte array, only the ID of the data block is stored at the logical + * position and the full data block (ID and writer data) is stored at the + * beginning of the byte array. The referencing blk_lpos will point to the + * ID before the wrap and the next data block will be at the logical + * position adjacent the full data block after the wrap. + * + * Data rings have a @tail_lpos that points to the beginning of the oldest + * data block and a @head_lpos that points to the logical position of the + * next (not yet existing) data block. + * + * When a new data block should be created (and the ring is full), tail data + * blocks will first be invalidated by putting their associated descriptors + * into the reusable state and then pushing the @tail_lpos forward beyond + * them. Then the @head_lpos is pushed forward and is associated with a new + * descriptor. If a data block is not valid, the @tail_lpos cannot be + * advanced beyond it. + * + * Info Array + * ~~~~~~~~~~ + * The general meta data of printk records are stored in printk_info structs, + * stored in an array with the same number of elements as the descriptor ring. + * Each info corresponds to the descriptor of the same index in the + * descriptor ring. Info validity is confirmed by evaluating the corresponding + * descriptor before and after loading the info. + * + * Usage + * ----- + * Here are some simple examples demonstrating writers and readers. For the + * examples a global ringbuffer (test_rb) is available (which is not the + * actual ringbuffer used by printk):: + * + * DEFINE_PRINTKRB(test_rb, 15, 5); + * + * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of + * 1 MiB (2 ^ (15 + 5)) for text data. + * + * Sample writer code:: + * + * const char *textstr = "message text"; + * struct prb_reserved_entry e; + * struct printk_record r; + * + * // specify how much to allocate + * prb_rec_init_wr(&r, strlen(textstr) + 1); + * + * if (prb_reserve(&e, &test_rb, &r)) { + * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); + * + * r.info->text_len = strlen(textstr); + * r.info->ts_nsec = local_clock(); + * r.info->caller_id = printk_caller_id(); + * + * // commit and finalize the record + * prb_final_commit(&e); + * } + * + * Note that additional writer functions are available to extend a record + * after it has been committed but not yet finalized. This can be done as + * long as no new records have been reserved and the caller is the same. + * + * Sample writer code (record extending):: + * + * // alternate rest of previous example + * + * r.info->text_len = strlen(textstr); + * r.info->ts_nsec = local_clock(); + * r.info->caller_id = printk_caller_id(); + * + * // commit the record (but do not finalize yet) + * prb_commit(&e); + * } + * + * ... + * + * // specify additional 5 bytes text space to extend + * prb_rec_init_wr(&r, 5); + * + * // try to extend, but only if it does not exceed 32 bytes + * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) { + * snprintf(&r.text_buf[r.info->text_len], + * r.text_buf_size - r.info->text_len, "hello"); + * + * r.info->text_len += 5; + * + * // commit and finalize the record + * prb_final_commit(&e); + * } + * + * Sample reader code:: + * + * struct printk_info info; + * struct printk_record r; + * char text_buf[32]; + * u64 seq; + * + * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); + * + * prb_for_each_record(0, &test_rb, &seq, &r) { + * if (info.seq != seq) + * pr_warn("lost %llu records\n", info.seq - seq); + * + * if (info.text_len > r.text_buf_size) { + * pr_warn("record %llu text truncated\n", info.seq); + * text_buf[r.text_buf_size - 1] = 0; + * } + * + * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, + * &text_buf[0]); + * } + * + * Note that additional less convenient reader functions are available to + * allow complex record access. + * + * ABA Issues + * ~~~~~~~~~~ + * To help avoid ABA issues, descriptors are referenced by IDs (array index + * values combined with tagged bits counting array wraps) and data blocks are + * referenced by logical positions (array index values combined with tagged + * bits counting array wraps). However, on 32-bit systems the number of + * tagged bits is relatively small such that an ABA incident is (at least + * theoretically) possible. For example, if 4 million maximally sized (1KiB) + * printk messages were to occur in NMI context on a 32-bit system, the + * interrupted context would not be able to recognize that the 32-bit integer + * completely wrapped and thus represents a different data block than the one + * the interrupted context expects. + * + * To help combat this possibility, additional state checking is performed + * (such as using cmpxchg() even though set() would suffice). These extra + * checks are commented as such and will hopefully catch any ABA issue that + * a 32-bit system might experience. + * + * Memory Barriers + * ~~~~~~~~~~~~~~~ + * Multiple memory barriers are used. To simplify proving correctness and + * generating litmus tests, lines of code related to memory barriers + * (loads, stores, and the associated memory barriers) are labeled:: + * + * LMM(function:letter) + * + * Comments reference the labels using only the "function:letter" part. + * + * The memory barrier pairs and their ordering are: + * + * desc_reserve:D / desc_reserve:B + * push descriptor tail (id), then push descriptor head (id) + * + * desc_reserve:D / data_push_tail:B + * push data tail (lpos), then set new descriptor reserved (state) + * + * desc_reserve:D / desc_push_tail:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:D / prb_first_seq:C + * push descriptor tail (id), then set new descriptor reserved (state) + * + * desc_reserve:F / desc_read:D + * set new descriptor id and reserved (state), then allow writer changes + * + * data_alloc:A (or data_realloc:A) / desc_read:D + * set old descriptor reusable (state), then modify new data block area + * + * data_alloc:A (or data_realloc:A) / data_push_tail:B + * push data tail (lpos), then modify new data block area + * + * _prb_commit:B / desc_read:B + * store writer changes, then set new descriptor committed (state) + * + * desc_reopen_last:A / _prb_commit:B + * set descriptor reserved (state), then read descriptor data + * + * _prb_commit:B / desc_reserve:D + * set new descriptor committed (state), then check descriptor head (id) + * + * data_push_tail:D / data_push_tail:A + * set descriptor reusable (state), then push data tail (lpos) + * + * desc_push_tail:B / desc_reserve:D + * set descriptor reusable (state), then push descriptor tail (id) + */ + +#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) +#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) + +#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) +#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) + +/* Determine the data array index from a logical position. */ +#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) + +/* Determine the desc array index from an ID or sequence number. */ +#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) + +/* Determine how many times the data array has wrapped. */ +#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) + +/* Determine if a logical position refers to a data-less block. */ +#define LPOS_DATALESS(lpos) ((lpos) & 1UL) +#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ + LPOS_DATALESS((blk)->next)) + +/* Get the logical position at index 0 of the current wrap. */ +#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ +((lpos) & ~DATA_SIZE_MASK(data_ring)) + +/* Get the ID for the same index of the previous wrap as the given ID. */ +#define DESC_ID_PREV_WRAP(desc_ring, id) \ +DESC_ID((id) - DESCS_COUNT(desc_ring)) + +/* + * A data block: mapped directly to the beginning of the data block area + * specified as a logical position within the data ring. + * + * @id: the ID of the associated descriptor + * @data: the writer data + * + * Note that the size of a data block is only known by its associated + * descriptor. + */ +struct prb_data_block { + unsigned long id; + char data[]; +}; + +/* + * Return the descriptor associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; +} + +/* + * Return the printk_info associated with @n. @n can be either a + * descriptor ID or a sequence number. + */ +static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) +{ + return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; +} + +static struct prb_data_block *to_block(struct prb_data_ring *data_ring, + unsigned long begin_lpos) +{ + return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; +} + +/* + * Increase the data size to account for data block meta data plus any + * padding so that the adjacent data block is aligned on the ID size. + */ +static unsigned int to_blk_size(unsigned int size) +{ + struct prb_data_block *db = NULL; + + size += sizeof(*db); + size = ALIGN(size, sizeof(db->id)); + return size; +} + +/* + * Sanity checker for reserve size. The ringbuffer code assumes that a data + * block does not exceed the maximum possible size that could fit within the + * ringbuffer. This function provides that basic size check so that the + * assumption is safe. + */ +static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) +{ + struct prb_data_block *db = NULL; + + if (size == 0) + return true; + + /* + * Ensure the alignment padded size could possibly fit in the data + * array. The largest possible data block must still leave room for + * at least the ID of the next block. + */ + size = to_blk_size(size); + if (size > DATA_SIZE(data_ring) - sizeof(db->id)) + return false; + + return true; +} + +/* Query the state of a descriptor. */ +static enum desc_state get_desc_state(unsigned long id, + unsigned long state_val) +{ + if (id != DESC_ID(state_val)) + return desc_miss; + + return DESC_STATE(state_val); +} + +/* + * Get a copy of a specified descriptor and return its queried state. If the + * descriptor is in an inconsistent state (miss or reserved), the caller can + * only expect the descriptor's @state_var field to be valid. + * + * The sequence number and caller_id can be optionally retrieved. Like all + * non-state_var data, they are only valid if the descriptor is in a + * consistent state. + */ +static enum desc_state desc_read(struct prb_desc_ring *desc_ring, + unsigned long id, struct prb_desc *desc_out, + u64 *seq_out, u32 *caller_id_out) +{ + struct printk_info *info = to_info(desc_ring, id); + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + enum desc_state d_state; + unsigned long state_val; + + /* Check the descriptor state. */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ + d_state = get_desc_state(id, state_val); + if (d_state == desc_miss || d_state == desc_reserved) { + /* + * The descriptor is in an inconsistent state. Set at least + * @state_var so that the caller can see the details of + * the inconsistent state. + */ + goto out; + } + + /* + * Guarantee the state is loaded before copying the descriptor + * content. This avoids copying obsolete descriptor content that might + * not apply to the descriptor state. This pairs with _prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_read:A reads from _prb_commit:B, then desc_read:C reads + * from _prb_commit:A. + * + * Relies on: + * + * WMB from _prb_commit:A to _prb_commit:B + * matching + * RMB from desc_read:A to desc_read:C + */ + smp_rmb(); /* LMM(desc_read:B) */ + + /* + * Copy the descriptor data. The data is not valid until the + * state has been re-checked. A memcpy() for all of @desc + * cannot be used because of the atomic_t @state_var field. + */ + memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, + sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ + if (seq_out) + *seq_out = info->seq; /* also part of desc_read:C */ + if (caller_id_out) + *caller_id_out = info->caller_id; /* also part of desc_read:C */ + + /* + * 1. Guarantee the descriptor content is loaded before re-checking + * the state. This avoids reading an obsolete descriptor state + * that may not apply to the copied content. This pairs with + * desc_reserve:F. + * + * Memory barrier involvement: + * + * If desc_read:C reads from desc_reserve:G, then desc_read:E + * reads from desc_reserve:F. + * + * Relies on: + * + * WMB from desc_reserve:F to desc_reserve:G + * matching + * RMB from desc_read:C to desc_read:E + * + * 2. Guarantee the record data is loaded before re-checking the + * state. This avoids reading an obsolete descriptor state that may + * not apply to the copied data. This pairs with data_alloc:A and + * data_realloc:A. + * + * Memory barrier involvement: + * + * If copy_data:A reads from data_alloc:B, then desc_read:E + * reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_alloc:B + * matching + * RMB from desc_read:C to desc_read:E + * + * Note: desc_make_reusable:A and data_alloc:B can be different + * CPUs. However, the data_alloc:B CPU (which performs the + * full memory barrier) must have previously seen + * desc_make_reusable:A. + */ + smp_rmb(); /* LMM(desc_read:D) */ + + /* + * The data has been copied. Return the current descriptor state, + * which may have changed since the load above. + */ + state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ + d_state = get_desc_state(id, state_val); +out: + atomic_long_set(&desc_out->state_var, state_val); + return d_state; +} + +/* + * Take a specified descriptor out of the finalized state by attempting + * the transition from finalized to reusable. Either this context or some + * other context will have been successful. + */ +static void desc_make_reusable(struct prb_desc_ring *desc_ring, + unsigned long id) +{ + unsigned long val_finalized = DESC_SV(id, desc_finalized); + unsigned long val_reusable = DESC_SV(id, desc_reusable); + struct prb_desc *desc = to_desc(desc_ring, id); + atomic_long_t *state_var = &desc->state_var; + + atomic_long_cmpxchg_relaxed(state_var, val_finalized, + val_reusable); /* LMM(desc_make_reusable:A) */ +} + +/* + * Given the text data ring, put the associated descriptor of each + * data block from @lpos_begin until @lpos_end into the reusable state. + * + * If there is any problem making the associated descriptor reusable, either + * the descriptor has not yet been finalized or another writer context has + * already pushed the tail lpos past the problematic data block. Regardless, + * on error the caller can re-load the tail lpos to determine the situation. + */ +static bool data_make_reusable(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos_begin, + unsigned long lpos_end, + unsigned long *lpos_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct prb_data_block *blk; + enum desc_state d_state; + struct prb_desc desc; + struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; + unsigned long id; + + /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ + while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { + blk = to_block(data_ring, lpos_begin); + + /* + * Load the block ID from the data block. This is a data race + * against a writer that may have newly reserved this data + * area. If the loaded value matches a valid descriptor ID, + * the blk_lpos of that descriptor will be checked to make + * sure it points back to this data block. If the check fails, + * the data area has been recycled by another writer. + */ + id = blk->id; /* LMM(data_make_reusable:A) */ + + d_state = desc_read(desc_ring, id, &desc, + NULL, NULL); /* LMM(data_make_reusable:B) */ + + switch (d_state) { + case desc_miss: + case desc_reserved: + case desc_committed: + return false; + case desc_finalized: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + desc_make_reusable(desc_ring, id); + break; + case desc_reusable: + /* + * This data block is invalid if the descriptor + * does not point back to it. + */ + if (blk_lpos->begin != lpos_begin) + return false; + break; + } + + /* Advance @lpos_begin to the next data block. */ + lpos_begin = blk_lpos->next; + } + + *lpos_out = lpos_begin; + return true; +} + +/* + * Advance the data ring tail to at least @lpos. This function puts + * descriptors into the reusable state if the tail is pushed beyond + * their associated data block. + */ +static bool data_push_tail(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, + unsigned long lpos) +{ + unsigned long tail_lpos_new; + unsigned long tail_lpos; + unsigned long next_lpos; + + /* If @lpos is from a data-less block, there is nothing to do. */ + if (LPOS_DATALESS(lpos)) + return true; + + /* + * Any descriptor states that have transitioned to reusable due to the + * data tail being pushed to this loaded value will be visible to this + * CPU. This pairs with data_push_tail:D. + * + * Memory barrier involvement: + * + * If data_push_tail:A reads from data_push_tail:D, then this CPU can + * see desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to data_push_tail:D + * matches + * READFROM from data_push_tail:D to data_push_tail:A + * thus + * READFROM from desc_make_reusable:A to this CPU + */ + tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ + + /* + * Loop until the tail lpos is at or beyond @lpos. This condition + * may already be satisfied, resulting in no full memory barrier + * from data_push_tail:D being performed. However, since this CPU + * sees the new tail lpos, any descriptor states that transitioned to + * the reusable state must already be visible. + */ + while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { + /* + * Make all descriptors reusable that are associated with + * data blocks before @lpos. + */ + if (!data_make_reusable(rb, data_ring, tail_lpos, lpos, + &next_lpos)) { + /* + * 1. Guarantee the block ID loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled data area causing the tail lpos to + * have been previously pushed. This pairs with + * data_alloc:A and data_realloc:A. + * + * Memory barrier involvement: + * + * If data_make_reusable:A reads from data_alloc:B, + * then data_push_tail:C reads from + * data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to data_alloc:B + * matching + * RMB from data_make_reusable:A to + * data_push_tail:C + * + * Note: data_push_tail:D and data_alloc:B can be + * different CPUs. However, the data_alloc:B + * CPU (which performs the full memory + * barrier) must have previously seen + * data_push_tail:D. + * + * 2. Guarantee the descriptor state loaded in + * data_make_reusable() is performed before + * reloading the tail lpos. The failed + * data_make_reusable() may be due to a newly + * recycled descriptor causing the tail lpos to + * have been previously pushed. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If data_make_reusable:B reads from + * desc_reserve:F, then data_push_tail:C reads + * from data_push_tail:D. + * + * Relies on: + * + * MB from data_push_tail:D to desc_reserve:F + * matching + * RMB from data_make_reusable:B to + * data_push_tail:C + * + * Note: data_push_tail:D and desc_reserve:F can + * be different CPUs. However, the + * desc_reserve:F CPU (which performs the + * full memory barrier) must have previously + * seen data_push_tail:D. + */ + smp_rmb(); /* LMM(data_push_tail:B) */ + + tail_lpos_new = atomic_long_read(&data_ring->tail_lpos + ); /* LMM(data_push_tail:C) */ + if (tail_lpos_new == tail_lpos) + return false; + + /* Another CPU pushed the tail. Try again. */ + tail_lpos = tail_lpos_new; + continue; + } + + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail lpos. A full + * memory barrier is needed since other CPUs may have made + * the descriptor states reusable. This pairs with + * data_push_tail:A. + */ + if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, + next_lpos)) { /* LMM(data_push_tail:D) */ + break; + } + } + + return true; +} + +/* + * Advance the desc ring tail. This function advances the tail by one + * descriptor, thus invalidating the oldest descriptor. Before advancing + * the tail, the tail descriptor is made reusable and all data blocks up to + * and including the descriptor's data block are invalidated (i.e. the data + * ring tail is pushed past the data block of the descriptor being made + * reusable). + */ +static bool desc_push_tail(struct printk_ringbuffer *rb, + unsigned long tail_id) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + + d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); + + switch (d_state) { + case desc_miss: + /* + * If the ID is exactly 1 wrap behind the expected, it is + * in the process of being reserved by another writer and + * must be considered reserved. + */ + if (DESC_ID(atomic_long_read(&desc.state_var)) == + DESC_ID_PREV_WRAP(desc_ring, tail_id)) { + return false; + } + + /* + * The ID has changed. Another writer must have pushed the + * tail and recycled the descriptor already. Success is + * returned because the caller is only interested in the + * specified tail being pushed, which it was. + */ + return true; + case desc_reserved: + case desc_committed: + return false; + case desc_finalized: + desc_make_reusable(desc_ring, tail_id); + break; + case desc_reusable: + break; + } + + /* + * Data blocks must be invalidated before their associated + * descriptor can be made available for recycling. Invalidating + * them later is not possible because there is no way to trust + * data blocks once their associated descriptor is gone. + */ + + if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) + return false; + + /* + * Check the next descriptor after @tail_id before pushing the tail + * to it because the tail must always be in a finalized or reusable + * state. The implementation of prb_first_seq() relies on this. + * + * A successful read implies that the next descriptor is less than or + * equal to @head_id so there is no risk of pushing the tail past the + * head. + */ + d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, + NULL, NULL); /* LMM(desc_push_tail:A) */ + + if (d_state == desc_finalized || d_state == desc_reusable) { + /* + * Guarantee any descriptor states that have transitioned to + * reusable are stored before pushing the tail ID. This allows + * verifying the recycled descriptor state. A full memory + * barrier is needed since other CPUs may have made the + * descriptor states reusable. This pairs with desc_reserve:D. + */ + atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, + DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ + } else { + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail ID in the + * case that the descriptor has been recycled. This pairs + * with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_push_tail:A reads from desc_reserve:F, then + * desc_push_tail:D reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB from desc_push_tail:A to desc_push_tail:D + * + * Note: desc_push_tail:B and desc_reserve:F can be different + * CPUs. However, the desc_reserve:F CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_push_tail:C) */ + + /* + * Re-check the tail ID. The descriptor following @tail_id is + * not in an allowed tail state. But if the tail has since + * been moved by another CPU, then it does not matter. + */ + if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ + return false; + } + + return true; +} + +/* Reserve a new descriptor, invalidating the oldest if necessary. */ +static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + unsigned long prev_state_val; + unsigned long id_prev_wrap; + struct prb_desc *desc; + unsigned long head_id; + unsigned long id; + + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ + + do { + desc = to_desc(desc_ring, head_id); + + id = DESC_ID(head_id + 1); + id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); + + /* + * Guarantee the head ID is read before reading the tail ID. + * Since the tail ID is updated before the head ID, this + * guarantees that @id_prev_wrap is never ahead of the tail + * ID. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If desc_reserve:A reads from desc_reserve:D, then + * desc_reserve:C reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:D + * matching + * RMB from desc_reserve:A to desc_reserve:C + * + * Note: desc_push_tail:B and desc_reserve:D can be different + * CPUs. However, the desc_reserve:D CPU (which performs + * the full memory barrier) must have previously seen + * desc_push_tail:B. + */ + smp_rmb(); /* LMM(desc_reserve:B) */ + + if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id + )) { /* LMM(desc_reserve:C) */ + /* + * Make space for the new descriptor by + * advancing the tail. + */ + if (!desc_push_tail(rb, id_prev_wrap)) + return false; + } + + /* + * 1. Guarantee the tail ID is read before validating the + * recycled descriptor state. A read memory barrier is + * sufficient for this. This pairs with desc_push_tail:B. + * + * Memory barrier involvement: + * + * If desc_reserve:C reads from desc_push_tail:B, then + * desc_reserve:E reads from desc_make_reusable:A. + * + * Relies on: + * + * MB from desc_make_reusable:A to desc_push_tail:B + * matching + * RMB from desc_reserve:C to desc_reserve:E + * + * Note: desc_make_reusable:A and desc_push_tail:B can be + * different CPUs. However, the desc_push_tail:B CPU + * (which performs the full memory barrier) must have + * previously seen desc_make_reusable:A. + * + * 2. Guarantee the tail ID is stored before storing the head + * ID. This pairs with desc_reserve:B. + * + * 3. Guarantee any data ring tail changes are stored before + * recycling the descriptor. Data ring tail changes can + * happen via desc_push_tail()->data_push_tail(). A full + * memory barrier is needed since another CPU may have + * pushed the data ring tails. This pairs with + * data_push_tail:B. + * + * 4. Guarantee a new tail ID is stored before recycling the + * descriptor. A full memory barrier is needed since + * another CPU may have pushed the tail ID. This pairs + * with desc_push_tail:C and this also pairs with + * prb_first_seq:C. + * + * 5. Guarantee the head ID is stored before trying to + * finalize the previous descriptor. This pairs with + * _prb_commit:B. + */ + } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, + id)); /* LMM(desc_reserve:D) */ + + desc = to_desc(desc_ring, id); + + /* + * If the descriptor has been recycled, verify the old state val. + * See "ABA Issues" about why this verification is performed. + */ + prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ + if (prev_state_val && + get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { + WARN_ON_ONCE(1); + return false; + } + + /* + * Assign the descriptor a new ID and set its state to reserved. + * See "ABA Issues" about why cmpxchg() instead of set() is used. + * + * Guarantee the new descriptor ID and state is stored before making + * any other changes. A write memory barrier is sufficient for this. + * This pairs with desc_read:D. + */ + if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, + DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ + WARN_ON_ONCE(1); + return false; + } + + /* Now data in @desc can be modified: LMM(desc_reserve:G) */ + + *id_out = id; + return true; +} + +/* Determine the end of a data block. */ +static unsigned long get_next_lpos(struct prb_data_ring *data_ring, + unsigned long lpos, unsigned int size) +{ + unsigned long begin_lpos; + unsigned long next_lpos; + + begin_lpos = lpos; + next_lpos = lpos + size; + + /* First check if the data block does not wrap. */ + if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) + return next_lpos; + + /* Wrapping data blocks store their data at the beginning. */ + return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); +} + +/* + * Allocate a new data block, invalidating the oldest data block(s) + * if necessary. This function also associates the data block with + * a specified descriptor. + */ +static char *data_alloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long begin_lpos; + unsigned long next_lpos; + + if (size == 0) { + /* Specify a data-less block. */ + blk_lpos->begin = NO_LPOS; + blk_lpos->next = NO_LPOS; + return NULL; + } + + size = to_blk_size(size); + + begin_lpos = atomic_long_read(&data_ring->head_lpos); + + do { + next_lpos = get_next_lpos(data_ring, begin_lpos, size); + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { + /* Failed to allocate, specify a data-less block. */ + blk_lpos->begin = FAILED_LPOS; + blk_lpos->next = FAILED_LPOS; + return NULL; + } + + /* + * 1. Guarantee any descriptor states that have transitioned + * to reusable are stored before modifying the newly + * allocated data area. A full memory barrier is needed + * since other CPUs may have made the descriptor states + * reusable. See data_push_tail:A about why the reusable + * states are visible. This pairs with desc_read:D. + * + * 2. Guarantee any updated tail lpos is stored before + * modifying the newly allocated data area. Another CPU may + * be in data_make_reusable() and is reading a block ID + * from this area. data_make_reusable() can handle reading + * a garbage block ID value, but then it must be able to + * load a new tail lpos. A full memory barrier is needed + * since other CPUs may have updated the tail lpos. This + * pairs with data_push_tail:B. + */ + } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, + next_lpos)); /* LMM(data_alloc:A) */ + + blk = to_block(data_ring, begin_lpos); + blk->id = id; /* LMM(data_alloc:B) */ + + if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + } + + blk_lpos->begin = begin_lpos; + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + +/* + * Try to resize an existing data block associated with the descriptor + * specified by @id. If the resized data block should become wrapped, it + * copies the old data to the new data block. If @size yields a data block + * with the same or less size, the data block is left as is. + * + * Fail if this is not the last allocated data block or if there is not + * enough space or it is not possible make enough space. + * + * Return a pointer to the beginning of the entire data buffer or NULL on + * failure. + */ +static char *data_realloc(struct printk_ringbuffer *rb, + struct prb_data_ring *data_ring, unsigned int size, + struct prb_data_blk_lpos *blk_lpos, unsigned long id) +{ + struct prb_data_block *blk; + unsigned long head_lpos; + unsigned long next_lpos; + bool wrapped; + + /* Reallocation only works if @blk_lpos is the newest data block. */ + head_lpos = atomic_long_read(&data_ring->head_lpos); + if (head_lpos != blk_lpos->next) + return NULL; + + /* Keep track if @blk_lpos was a wrapping data block. */ + wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); + + size = to_blk_size(size); + + next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); + + /* If the data block does not increase, there is nothing to do. */ + if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { + if (wrapped) + blk = to_block(data_ring, 0); + else + blk = to_block(data_ring, blk_lpos->begin); + return &blk->data[0]; + } + + if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) + return NULL; + + /* The memory barrier involvement is the same as data_alloc:A. */ + if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, + next_lpos)) { /* LMM(data_realloc:A) */ + return NULL; + } + + blk = to_block(data_ring, blk_lpos->begin); + + if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { + struct prb_data_block *old_blk = blk; + + /* Wrapping data blocks store their data at the beginning. */ + blk = to_block(data_ring, 0); + + /* + * Store the ID on the wrapped block for consistency. + * The printk_ringbuffer does not actually use it. + */ + blk->id = id; + + if (!wrapped) { + /* + * Since the allocated space is now in the newly + * created wrapping data block, copy the content + * from the old data block. + */ + memcpy(&blk->data[0], &old_blk->data[0], + (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); + } + } + + blk_lpos->next = next_lpos; + + return &blk->data[0]; +} + +/* Return the number of bytes used by a data block. */ +static unsigned int space_used(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos) +{ + /* Data-less blocks take no space. */ + if (BLK_DATALESS(blk_lpos)) + return 0; + + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { + /* Data block does not wrap. */ + return (DATA_INDEX(data_ring, blk_lpos->next) - + DATA_INDEX(data_ring, blk_lpos->begin)); + } + + /* + * For wrapping data blocks, the trailing (wasted) space is + * also counted. + */ + return (DATA_INDEX(data_ring, blk_lpos->next) + + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); +} + +/* + * Given @blk_lpos, return a pointer to the writer data from the data block + * and calculate the size of the data part. A NULL pointer is returned if + * @blk_lpos specifies values that could never be legal. + * + * This function (used by readers) performs strict validation on the lpos + * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static const char *get_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, + unsigned int *data_size) +{ + struct prb_data_block *db; + + /* Data-less data block description. */ + if (BLK_DATALESS(blk_lpos)) { + if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) { + *data_size = 0; + return ""; + } + return NULL; + } + + /* Regular data block: @begin less than @next and in same wrap. */ + if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && + blk_lpos->begin < blk_lpos->next) { + db = to_block(data_ring, blk_lpos->begin); + *data_size = blk_lpos->next - blk_lpos->begin; + + /* Wrapping data block: @begin is one wrap behind @next. */ + } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == + DATA_WRAPS(data_ring, blk_lpos->next)) { + db = to_block(data_ring, 0); + *data_size = DATA_INDEX(data_ring, blk_lpos->next); + + /* Illegal block description. */ + } else { + WARN_ON_ONCE(1); + return NULL; + } + + /* A valid data block will always be aligned to the ID size. */ + if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || + WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { + return NULL; + } + + /* A valid data block will always have at least an ID. */ + if (WARN_ON_ONCE(*data_size < sizeof(db->id))) + return NULL; + + /* Subtract block ID space from size to reflect data size. */ + *data_size -= sizeof(db->id); + + return &db->data[0]; +} + +/* + * Attempt to transition the newest descriptor from committed back to reserved + * so that the record can be modified by a writer again. This is only possible + * if the descriptor is not yet finalized and the provided @caller_id matches. + */ +static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, + u32 caller_id, unsigned long *id_out) +{ + unsigned long prev_state_val; + enum desc_state d_state; + struct prb_desc desc; + struct prb_desc *d; + unsigned long id; + u32 cid; + + id = atomic_long_read(&desc_ring->head_id); + + /* + * To reduce unnecessarily reopening, first check if the descriptor + * state and caller ID are correct. + */ + d_state = desc_read(desc_ring, id, &desc, NULL, &cid); + if (d_state != desc_committed || cid != caller_id) + return NULL; + + d = to_desc(desc_ring, id); + + prev_state_val = DESC_SV(id, desc_committed); + + /* + * Guarantee the reserved state is stored before reading any + * record data. A full memory barrier is needed because @state_var + * modification is followed by reading. This pairs with _prb_commit:B. + * + * Memory barrier involvement: + * + * If desc_reopen_last:A reads from _prb_commit:B, then + * prb_reserve_in_last:A reads from _prb_commit:A. + * + * Relies on: + * + * WMB from _prb_commit:A to _prb_commit:B + * matching + * MB If desc_reopen_last:A to prb_reserve_in_last:A + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ + return NULL; + } + + *id_out = id; + return d; +} + +/** + * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer + * used by the newest record. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to re-reserve and extend data in. + * @r: The record structure to allocate buffers for. + * @caller_id: The caller ID of the caller (reserving writer). + * @max_size: Fail if the extended size would be greater than this. + * + * This is the public function available to writers to re-reserve and extend + * data. + * + * The writer specifies the text size to extend (not the new total size) by + * setting the @text_buf_size field of @r. To ensure proper initialization + * of @r, prb_rec_init_wr() should be used. + * + * This function will fail if @caller_id does not match the caller ID of the + * newest record. In that case the caller must reserve new data using + * prb_reserve(). + * + * Context: Any context. Disables local interrupts on success. + * Return: true if text data could be extended, otherwise false. + * + * On success: + * + * - @r->text_buf points to the beginning of the entire text buffer. + * + * - @r->text_buf_size is set to the new total size of the buffer. + * + * - @r->info is not touched so that @r->info->text_len could be used + * to append the text. + * + * - prb_record_text_space() can be used on @e to query the new + * actually used space. + * + * Important: All @r->info fields will already be set with the current values + * for the record. I.e. @r->info->text_len will be less than + * @text_buf_size. Writers can use @r->info->text_len to know + * where concatenation begins and writers should update + * @r->info->text_len after concatenating. + */ +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id, unsigned int max_size) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; + unsigned int data_size; + struct prb_desc *d; + unsigned long id; + + local_irq_save(e->irqflags); + + /* Transition the newest descriptor back to the reserved state. */ + d = desc_reopen_last(desc_ring, caller_id, &id); + if (!d) { + local_irq_restore(e->irqflags); + goto fail_reopen; + } + + /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ + + info = to_info(desc_ring, id); + + /* + * Set the @e fields here so that prb_commit() can be used if + * anything fails from now on. + */ + e->rb = rb; + e->id = id; + + /* + * desc_reopen_last() checked the caller_id, but there was no + * exclusive access at that point. The descriptor may have + * changed since then. + */ + if (caller_id != info->caller_id) + goto fail; + + if (BLK_DATALESS(&d->text_blk_lpos)) { + if (WARN_ON_ONCE(info->text_len != 0)) { + pr_warn_once("wrong text_len value (%hu, expecting 0)\n", + info->text_len); + info->text_len = 0; + } + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + if (r->text_buf_size > max_size) + goto fail; + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } else { + if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) + goto fail; + + /* + * Increase the buffer size to include the original size. If + * the meta data (@text_len) is not sane, use the full data + * block size. + */ + if (WARN_ON_ONCE(info->text_len > data_size)) { + pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", + info->text_len, data_size); + info->text_len = data_size; + } + r->text_buf_size += info->text_len; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + if (r->text_buf_size > max_size) + goto fail; + + r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + } + if (r->text_buf_size && !r->text_buf) + goto fail; + + r->info = info; + + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ +fail_reopen: + /* Make it clear to the caller that the re-reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/* + * Attempt to finalize a specified descriptor. If this fails, the descriptor + * is either already final or it will finalize itself when the writer commits. + */ +static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id) +{ + unsigned long prev_state_val = DESC_SV(id, desc_committed); + struct prb_desc *d = to_desc(desc_ring, id); + + atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val, + DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */ +} + +/** + * prb_reserve() - Reserve space in the ringbuffer. + * + * @e: The entry structure to setup. + * @rb: The ringbuffer to reserve data in. + * @r: The record structure to allocate buffers for. + * + * This is the public function available to writers to reserve data. + * + * The writer specifies the text size to reserve by setting the + * @text_buf_size field of @r. To ensure proper initialization of @r, + * prb_rec_init_wr() should be used. + * + * Context: Any context. Disables local interrupts on success. + * Return: true if at least text data could be allocated, otherwise false. + * + * On success, the fields @info and @text_buf of @r will be set by this + * function and should be filled in by the writer before committing. Also + * on success, prb_record_text_space() can be used on @e to query the actual + * space used for the text data block. + * + * Important: @info->text_len needs to be set correctly by the writer in + * order for data to be readable and/or extended. Its value + * is initialized to 0. + */ +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info; + struct prb_desc *d; + unsigned long id; + u64 seq; + + if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) + goto fail; + + /* + * Descriptors in the reserved state act as blockers to all further + * reservations once the desc_ring has fully wrapped. Disable + * interrupts during the reserve/commit window in order to minimize + * the likelihood of this happening. + */ + local_irq_save(e->irqflags); + + if (!desc_reserve(rb, &id)) { + /* Descriptor reservation failures are tracked. */ + atomic_long_inc(&rb->fail); + local_irq_restore(e->irqflags); + goto fail; + } + + d = to_desc(desc_ring, id); + info = to_info(desc_ring, id); + + /* + * All @info fields (except @seq) are cleared and must be filled in + * by the writer. Save @seq before clearing because it is used to + * determine the new sequence number. + */ + seq = info->seq; + memset(info, 0, sizeof(*info)); + + /* + * Set the @e fields here so that prb_commit() can be used if + * text data allocation fails. + */ + e->rb = rb; + e->id = id; + + /* + * Initialize the sequence number if it has "never been set". + * Otherwise just increment it by a full wrap. + * + * @seq is considered "never been set" if it has a value of 0, + * _except_ for @infos[0], which was specially setup by the ringbuffer + * initializer and therefore is always considered as set. + * + * See the "Bootstrap" comment block in printk_ringbuffer.h for + * details about how the initializer bootstraps the descriptors. + */ + if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) + info->seq = DESC_INDEX(desc_ring, id); + else + info->seq = seq + DESCS_COUNT(desc_ring); + + /* + * New data is about to be reserved. Once that happens, previous + * descriptors are no longer able to be extended. Finalize the + * previous descriptor now so that it can be made available to + * readers. (For seq==0 there is no previous descriptor.) + */ + if (info->seq > 0) + desc_make_final(desc_ring, DESC_ID(id - 1)); + + r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + &d->text_blk_lpos, id); + /* If text data allocation fails, a data-less record is committed. */ + if (r->text_buf_size && !r->text_buf) { + prb_commit(e); + /* prb_commit() re-enabled interrupts. */ + goto fail; + } + + r->info = info; + + /* Record full text space used by record. */ + e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); + + return true; +fail: + /* Make it clear to the caller that the reserve failed. */ + memset(r, 0, sizeof(*r)); + return false; +} + +/* Commit the data (possibly finalizing it) and restore interrupts. */ +static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + struct prb_desc *d = to_desc(desc_ring, e->id); + unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); + + /* Now the writer has finished all writing: LMM(_prb_commit:A) */ + + /* + * Set the descriptor as committed. See "ABA Issues" about why + * cmpxchg() instead of set() is used. + * + * 1 Guarantee all record data is stored before the descriptor state + * is stored as committed. A write memory barrier is sufficient + * for this. This pairs with desc_read:B and desc_reopen_last:A. + * + * 2. Guarantee the descriptor state is stored as committed before + * re-checking the head ID in order to possibly finalize this + * descriptor. This pairs with desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_commit:A reads from desc_reserve:D, then + * desc_make_final:A reads from _prb_commit:B. + * + * Relies on: + * + * MB _prb_commit:B to prb_commit:A + * matching + * MB desc_reserve:D to desc_make_final:A + */ + if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, + DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ + WARN_ON_ONCE(1); + } + + /* Restore interrupts, the reserve/commit window is finished. */ + local_irq_restore(e->irqflags); +} + +/** + * prb_commit() - Commit (previously reserved) data to the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit data. + * + * Note that the data is not yet available to readers until it is finalized. + * Finalizing happens automatically when space for the next record is + * reserved. + * + * See prb_final_commit() for a version of this function that finalizes + * immediately. + * + * Context: Any context. Enables local interrupts. + */ +void prb_commit(struct prb_reserved_entry *e) +{ + struct prb_desc_ring *desc_ring = &e->rb->desc_ring; + unsigned long head_id; + + _prb_commit(e, desc_committed); + + /* + * If this descriptor is no longer the head (i.e. a new record has + * been allocated), extending the data for this record is no longer + * allowed and therefore it must be finalized. + */ + head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ + if (head_id != e->id) + desc_make_final(desc_ring, e->id); +} + +/** + * prb_final_commit() - Commit and finalize (previously reserved) data to + * the ringbuffer. + * + * @e: The entry containing the reserved data information. + * + * This is the public function available to writers to commit+finalize data. + * + * By finalizing, the data is made immediately available to readers. + * + * This function should only be used if there are no intentions of extending + * this data using prb_reserve_in_last(). + * + * Context: Any context. Enables local interrupts. + */ +void prb_final_commit(struct prb_reserved_entry *e) +{ + _prb_commit(e, desc_finalized); +} + +/* + * Count the number of lines in provided text. All text has at least 1 line + * (even if @text_size is 0). Each '\n' processed is counted as an additional + * line. + */ +static unsigned int count_lines(const char *text, unsigned int text_size) +{ + unsigned int next_size = text_size; + unsigned int line_count = 1; + const char *next = text; + + while (next_size) { + next = memchr(next, '\n', next_size); + if (!next) + break; + line_count++; + next++; + next_size = text_size - (next - text); + } + + return line_count; +} + +/* + * Given @blk_lpos, copy an expected @len of data into the provided buffer. + * If @line_count is provided, count the number of lines in the data. + * + * This function (used by readers) performs strict validation on the data + * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is + * triggered if an internal error is detected. + */ +static bool copy_data(struct prb_data_ring *data_ring, + struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, + unsigned int buf_size, unsigned int *line_count) +{ + unsigned int data_size; + const char *data; + + /* Caller might not want any data. */ + if ((!buf || !buf_size) && !line_count) + return true; + + data = get_data(data_ring, blk_lpos, &data_size); + if (!data) + return false; + + /* + * Actual cannot be less than expected. It can be more than expected + * because of the trailing alignment padding. + * + * Note that invalid @len values can occur because the caller loads + * the value during an allowed data race. + */ + if (data_size < (unsigned int)len) + return false; + + /* Caller interested in the line count? */ + if (line_count) + *line_count = count_lines(data, data_size); + + /* Caller interested in the data content? */ + if (!buf || !buf_size) + return true; + + data_size = min_t(u16, buf_size, len); + + memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ + return true; +} + +/* + * This is an extended version of desc_read(). It gets a copy of a specified + * descriptor. However, it also verifies that the record is finalized and has + * the sequence number @seq. On success, 0 is returned. + * + * Error return values: + * -EINVAL: A finalized record with sequence number @seq does not exist. + * -ENOENT: A finalized record with sequence number @seq exists, but its data + * is not available. This is a valid record, so readers should + * continue with the next record. + */ +static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, + unsigned long id, u64 seq, + struct prb_desc *desc_out) +{ + struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; + enum desc_state d_state; + u64 s; + + d_state = desc_read(desc_ring, id, desc_out, &s, NULL); + + /* + * An unexpected @id (desc_miss) or @seq mismatch means the record + * does not exist. A descriptor in the reserved or committed state + * means the record does not yet exist for the reader. + */ + if (d_state == desc_miss || + d_state == desc_reserved || + d_state == desc_committed || + s != seq) { + return -EINVAL; + } + + /* + * A descriptor in the reusable state may no longer have its data + * available; report it as existing but with lost data. Or the record + * may actually be a record with lost data. + */ + if (d_state == desc_reusable || + (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { + return -ENOENT; + } + + return 0; +} + +/* + * Copy the ringbuffer data from the record with @seq to the provided + * @r buffer. On success, 0 is returned. + * + * See desc_read_finalized_seq() for error return values. + */ +static int prb_read(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r, unsigned int *line_count) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + struct printk_info *info = to_info(desc_ring, seq); + struct prb_desc *rdesc = to_desc(desc_ring, seq); + atomic_long_t *state_var = &rdesc->state_var; + struct prb_desc desc; + unsigned long id; + int err; + + /* Extract the ID, used to specify the descriptor to read. */ + id = DESC_ID(atomic_long_read(state_var)); + + /* Get a local copy of the correct descriptor (if available). */ + err = desc_read_finalized_seq(desc_ring, id, seq, &desc); + + /* + * If @r is NULL, the caller is only interested in the availability + * of the record. + */ + if (err || !r) + return err; + + /* If requested, copy meta data. */ + if (r->info) + memcpy(r->info, info, sizeof(*(r->info))); + + /* Copy text data. If it fails, this is a data-less record. */ + if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, + r->text_buf, r->text_buf_size, line_count)) { + return -ENOENT; + } + + /* Ensure the record is still finalized and has the same @seq. */ + return desc_read_finalized_seq(desc_ring, id, seq, &desc); +} + +/* Get the sequence number of the tail descriptor. */ +static u64 prb_first_seq(struct printk_ringbuffer *rb) +{ + struct prb_desc_ring *desc_ring = &rb->desc_ring; + enum desc_state d_state; + struct prb_desc desc; + unsigned long id; + u64 seq; + + for (;;) { + id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ + + d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ + + /* + * This loop will not be infinite because the tail is + * _always_ in the finalized or reusable state. + */ + if (d_state == desc_finalized || d_state == desc_reusable) + break; + + /* + * Guarantee the last state load from desc_read() is before + * reloading @tail_id in order to see a new tail in the case + * that the descriptor has been recycled. This pairs with + * desc_reserve:D. + * + * Memory barrier involvement: + * + * If prb_first_seq:B reads from desc_reserve:F, then + * prb_first_seq:A reads from desc_push_tail:B. + * + * Relies on: + * + * MB from desc_push_tail:B to desc_reserve:F + * matching + * RMB prb_first_seq:B to prb_first_seq:A + */ + smp_rmb(); /* LMM(prb_first_seq:C) */ + } + + return seq; +} + +/* + * Non-blocking read of a record. Updates @seq to the last finalized record + * (which may have no data available). + * + * See the description of prb_read_valid() and prb_read_valid_info() + * for details. + */ +static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, + struct printk_record *r, unsigned int *line_count) +{ + u64 tail_seq; + int err; + + while ((err = prb_read(rb, *seq, r, line_count))) { + tail_seq = prb_first_seq(rb); + + if (*seq < tail_seq) { + /* + * Behind the tail. Catch up and try again. This + * can happen for -ENOENT and -EINVAL cases. + */ + *seq = tail_seq; + + } else if (err == -ENOENT) { + /* Record exists, but no data available. Skip. */ + (*seq)++; + + } else { + /* Non-existent/non-finalized record. Must stop. */ + return false; + } + } + + return true; +} + +/** + * prb_read_valid() - Non-blocking read of a requested record or (if gone) + * the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @r: A record data buffer to store the read record to. + * + * This is the public function available to readers to read a record. + * + * The reader provides the @info and @text_buf buffers of @r to be + * filled in. Any of the buffer pointers can be set to NULL if the reader + * is not interested in that data. To ensure proper initialization of @r, + * prb_rec_init_rd() should be used. + * + * Context: Any context. + * Return: true if a record was read, otherwise false. + * + * On success, the reader must check r->info.seq to see which record was + * actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r) +{ + return _prb_read_valid(rb, &seq, r, NULL); +} + +/** + * prb_read_valid_info() - Non-blocking read of meta data for a requested + * record or (if gone) the next available record. + * + * @rb: The ringbuffer to read from. + * @seq: The sequence number of the record to read. + * @info: A buffer to store the read record meta data to. + * @line_count: A buffer to store the number of lines in the record text. + * + * This is the public function available to readers to read only the + * meta data of a record. + * + * The reader provides the @info, @line_count buffers to be filled in. + * Either of the buffer pointers can be set to NULL if the reader is not + * interested in that data. + * + * Context: Any context. + * Return: true if a record's meta data was read, otherwise false. + * + * On success, the reader must check info->seq to see which record meta data + * was actually read. This allows the reader to detect dropped records. + * + * Failure means @seq refers to a not yet written record. + */ +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count) +{ + struct printk_record r; + + prb_rec_init_rd(&r, info, NULL, 0); + + return _prb_read_valid(rb, &seq, &r, line_count); +} + +/** + * prb_first_valid_seq() - Get the sequence number of the oldest available + * record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the + * first/oldest valid sequence number is. + * + * This provides readers a starting point to begin iterating the ringbuffer. + * + * Context: Any context. + * Return: The sequence number of the first/oldest record or, if the + * ringbuffer is empty, 0 is returned. + */ +u64 prb_first_valid_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + if (!_prb_read_valid(rb, &seq, NULL, NULL)) + return 0; + + return seq; +} + +/** + * prb_next_seq() - Get the sequence number after the last available record. + * + * @rb: The ringbuffer to get the sequence number from. + * + * This is the public function available to readers to see what the next + * newest sequence number available to readers will be. + * + * This provides readers a sequence number to jump to if all currently + * available records should be skipped. + * + * Context: Any context. + * Return: The sequence number of the next newest (not yet available) record + * for readers. + */ +u64 prb_next_seq(struct printk_ringbuffer *rb) +{ + u64 seq = 0; + + /* Search forward from the oldest descriptor. */ + while (_prb_read_valid(rb, &seq, NULL, NULL)) + seq++; + + return seq; +} + +/** + * prb_init() - Initialize a ringbuffer to use provided external buffers. + * + * @rb: The ringbuffer to initialize. + * @text_buf: The data buffer for text data. + * @textbits: The size of @text_buf as a power-of-2 value. + * @descs: The descriptor buffer for ringbuffer records. + * @descbits: The count of @descs items as a power-of-2 value. + * @infos: The printk_info buffer for ringbuffer records. + * + * This is the public function available to writers to setup a ringbuffer + * during runtime using provided buffers. + * + * This must match the initialization of DEFINE_PRINTKRB(). + * + * Context: Any context. + */ +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int textbits, + struct prb_desc *descs, unsigned int descbits, + struct printk_info *infos) +{ + memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); + memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); + + rb->desc_ring.count_bits = descbits; + rb->desc_ring.descs = descs; + rb->desc_ring.infos = infos; + atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); + atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); + + rb->text_data_ring.size_bits = textbits; + rb->text_data_ring.data = text_buf; + atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); + atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); + + atomic_long_set(&rb->fail, 0); + + atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; + descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; + + infos[0].seq = -(u64)_DESCS_COUNT(descbits); + infos[_DESCS_COUNT(descbits) - 1].seq = 0; +} + +/** + * prb_record_text_space() - Query the full actual used ringbuffer space for + * the text data of a reserved entry. + * + * @e: The successfully reserved entry to query. + * + * This is the public function available to writers to see how much actual + * space is used in the ringbuffer to store the text data of the specified + * entry. + * + * This function is only valid if @e has been successfully reserved using + * prb_reserve(). + * + * Context: Any context. + * Return: The size in bytes used by the text data of the associated record. + */ +unsigned int prb_record_text_space(struct prb_reserved_entry *e) +{ + return e->text_space; +} diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h new file mode 100644 index 000000000000..5dc9d022db07 --- /dev/null +++ b/kernel/printk/printk_ringbuffer.h @@ -0,0 +1,382 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_PRINTK_RINGBUFFER_H +#define _KERNEL_PRINTK_RINGBUFFER_H + +#include <linux/atomic.h> +#include <linux/dev_printk.h> + +/* + * Meta information about each stored message. + * + * All fields are set by the printk code except for @seq, which is + * set by the ringbuffer code. + */ +struct printk_info { + u64 seq; /* sequence number */ + u64 ts_nsec; /* timestamp in nanoseconds */ + u16 text_len; /* length of text message */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ + u32 caller_id; /* thread id or processor id */ + + struct dev_printk_info dev_info; +}; + +/* + * A structure providing the buffers, used by writers and readers. + * + * Writers: + * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling + * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to + * buffers reserved for that writer. + * + * Readers: + * Using prb_rec_init_rd(), a reader sets all fields before calling + * prb_read_valid(). Note that the reader provides the @info and @text_buf, + * buffers. On success, the struct pointed to by @info will be filled and + * the char array pointed to by @text_buf will be filled with text data. + */ +struct printk_record { + struct printk_info *info; + char *text_buf; + unsigned int text_buf_size; +}; + +/* Specifies the logical position and span of a data block. */ +struct prb_data_blk_lpos { + unsigned long begin; + unsigned long next; +}; + +/* + * A descriptor: the complete meta-data for a record. + * + * @state_var: A bitwise combination of descriptor ID and descriptor state. + */ +struct prb_desc { + atomic_long_t state_var; + struct prb_data_blk_lpos text_blk_lpos; +}; + +/* A ringbuffer of "ID + data" elements. */ +struct prb_data_ring { + unsigned int size_bits; + char *data; + atomic_long_t head_lpos; + atomic_long_t tail_lpos; +}; + +/* A ringbuffer of "struct prb_desc" elements. */ +struct prb_desc_ring { + unsigned int count_bits; + struct prb_desc *descs; + struct printk_info *infos; + atomic_long_t head_id; + atomic_long_t tail_id; +}; + +/* + * The high level structure representing the printk ringbuffer. + * + * @fail: Count of failed prb_reserve() calls where not even a data-less + * record was created. + */ +struct printk_ringbuffer { + struct prb_desc_ring desc_ring; + struct prb_data_ring text_data_ring; + atomic_long_t fail; +}; + +/* + * Used by writers as a reserve/commit handle. + * + * @rb: Ringbuffer where the entry is reserved. + * @irqflags: Saved irq flags to restore on entry commit. + * @id: ID of the reserved descriptor. + * @text_space: Total occupied buffer space in the text data ring, including + * ID, alignment padding, and wrapping data blocks. + * + * This structure is an opaque handle for writers. Its contents are only + * to be used by the ringbuffer implementation. + */ +struct prb_reserved_entry { + struct printk_ringbuffer *rb; + unsigned long irqflags; + unsigned long id; + unsigned int text_space; +}; + +/* The possible responses of a descriptor state-query. */ +enum desc_state { + desc_miss = -1, /* ID mismatch (pseudo state) */ + desc_reserved = 0x0, /* reserved, in use by writer */ + desc_committed = 0x1, /* committed by writer, could get reopened */ + desc_finalized = 0x2, /* committed, no further modification allowed */ + desc_reusable = 0x3, /* free, not yet used by any writer */ +}; + +#define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) +#define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) +#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) +#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) +#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) +#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id) +#define DESC_ID_MASK (~DESC_FLAGS_MASK) +#define DESC_ID(sv) ((sv) & DESC_ID_MASK) +#define FAILED_LPOS 0x1 +#define NO_LPOS 0x3 + +#define FAILED_BLK_LPOS \ +{ \ + .begin = FAILED_LPOS, \ + .next = FAILED_LPOS, \ +} + +/* + * Descriptor Bootstrap + * + * The descriptor array is minimally initialized to allow immediate usage + * by readers and writers. The requirements that the descriptor array + * initialization must satisfy: + * + * Req1 + * The tail must point to an existing (committed or reusable) descriptor. + * This is required by the implementation of prb_first_seq(). + * + * Req2 + * Readers must see that the ringbuffer is initially empty. + * + * Req3 + * The first record reserved by a writer is assigned sequence number 0. + * + * To satisfy Req1, the tail initially points to a descriptor that is + * minimally initialized (having no data block, i.e. data-less with the + * data block's lpos @begin and @next values set to FAILED_LPOS). + * + * To satisfy Req2, the initial tail descriptor is initialized to the + * reusable state. Readers recognize reusable descriptors as existing + * records, but skip over them. + * + * To satisfy Req3, the last descriptor in the array is used as the initial + * head (and tail) descriptor. This allows the first record reserved by a + * writer (head + 1) to be the first descriptor in the array. (Only the first + * descriptor in the array could have a valid sequence number of 0.) + * + * The first time a descriptor is reserved, it is assigned a sequence number + * with the value of the array index. A "first time reserved" descriptor can + * be recognized because it has a sequence number of 0 but does not have an + * index of 0. (Only the first descriptor in the array could have a valid + * sequence number of 0.) After the first reservation, all future reservations + * (recycling) simply involve incrementing the sequence number by the array + * count. + * + * Hack #1 + * Only the first descriptor in the array is allowed to have the sequence + * number 0. In this case it is not possible to recognize if it is being + * reserved the first time (set to index value) or has been reserved + * previously (increment by the array count). This is handled by _always_ + * incrementing the sequence number by the array count when reserving the + * first descriptor in the array. In order to satisfy Req3, the sequence + * number of the first descriptor in the array is initialized to minus + * the array count. Then, upon the first reservation, it is incremented + * to 0, thus satisfying Req3. + * + * Hack #2 + * prb_first_seq() can be called at any time by readers to retrieve the + * sequence number of the tail descriptor. However, due to Req2 and Req3, + * initially there are no records to report the sequence number of + * (sequence numbers are u64 and there is nothing less than 0). To handle + * this, the sequence number of the initial tail descriptor is initialized + * to 0. Technically this is incorrect, because there is no record with + * sequence number 0 (yet) and the tail descriptor is not the first + * descriptor in the array. But it allows prb_read_valid() to correctly + * report the existence of a record for _any_ given sequence number at all + * times. Bootstrapping is complete when the tail is pushed the first + * time, thus finally pointing to the first descriptor reserved by a + * writer, which has the assigned sequence number 0. + */ + +/* + * Initiating Logical Value Overflows + * + * Both logical position (lpos) and ID values can be mapped to array indexes + * but may experience overflows during the lifetime of the system. To ensure + * that printk_ringbuffer can handle the overflows for these types, initial + * values are chosen that map to the correct initial array indexes, but will + * result in overflows soon. + * + * BLK0_LPOS + * The initial @head_lpos and @tail_lpos for data rings. It is at index + * 0 and the lpos value is such that it will overflow on the first wrap. + * + * DESC0_ID + * The initial @head_id and @tail_id for the desc ring. It is at the last + * index of the descriptor array (see Req3 above) and the ID value is such + * that it will overflow on the second wrap. + */ +#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits))) +#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1)) +#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable) + +/* + * Define a ringbuffer with an external text data buffer. The same as + * DEFINE_PRINTKRB() but requires specifying an external buffer for the + * text data. + * + * Note: The specified external buffer must be of the size: + * 2 ^ (descbits + avgtextbits) + */ +#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \ +static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + /* reusable */ \ + .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \ + /* no associated data block */ \ + .text_blk_lpos = FAILED_BLK_LPOS, \ + }, \ +}; \ +static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \ + /* this will be the first record reserved by a writer */ \ + [0] = { \ + /* will be incremented to 0 on the first reservation */ \ + .seq = -(u64)_DESCS_COUNT(descbits), \ + }, \ + /* the initial head and tail */ \ + [_DESCS_COUNT(descbits) - 1] = { \ + /* reports the first seq value during the bootstrap phase */ \ + .seq = 0, \ + }, \ +}; \ +static struct printk_ringbuffer name = { \ + .desc_ring = { \ + .count_bits = descbits, \ + .descs = &_##name##_descs[0], \ + .infos = &_##name##_infos[0], \ + .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \ + }, \ + .text_data_ring = { \ + .size_bits = (avgtextbits) + (descbits), \ + .data = text_buf, \ + .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \ + }, \ + .fail = ATOMIC_LONG_INIT(0), \ +} + +/** + * DEFINE_PRINTKRB() - Define a ringbuffer. + * + * @name: The name of the ringbuffer variable. + * @descbits: The number of descriptors as a power-of-2 value. + * @avgtextbits: The average text data size per record as a power-of-2 value. + * + * This is a macro for defining a ringbuffer and all internal structures + * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a + * variant where the text data buffer can be specified externally. + */ +#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \ +static char _##name##_text[1U << ((avgtextbits) + (descbits))] \ + __aligned(__alignof__(unsigned long)); \ +_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0]) + +/* Writer Interface */ + +/** + * prb_rec_init_wd() - Initialize a buffer for writing records. + * + * @r: The record to initialize. + * @text_buf_size: The needed text buffer size. + */ +static inline void prb_rec_init_wr(struct printk_record *r, + unsigned int text_buf_size) +{ + r->info = NULL; + r->text_buf = NULL; + r->text_buf_size = text_buf_size; +} + +bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r); +bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, + struct printk_record *r, u32 caller_id, unsigned int max_size); +void prb_commit(struct prb_reserved_entry *e); +void prb_final_commit(struct prb_reserved_entry *e); + +void prb_init(struct printk_ringbuffer *rb, + char *text_buf, unsigned int text_buf_size, + struct prb_desc *descs, unsigned int descs_count_bits, + struct printk_info *infos); +unsigned int prb_record_text_space(struct prb_reserved_entry *e); + +/* Reader Interface */ + +/** + * prb_rec_init_rd() - Initialize a buffer for reading records. + * + * @r: The record to initialize. + * @info: A buffer to store record meta-data. + * @text_buf: A buffer to store text data. + * @text_buf_size: The size of @text_buf. + * + * Initialize all the fields that a reader is interested in. All arguments + * (except @r) are optional. Only record data for arguments that are + * non-NULL or non-zero will be read. + */ +static inline void prb_rec_init_rd(struct printk_record *r, + struct printk_info *info, + char *text_buf, unsigned int text_buf_size) +{ + r->info = info; + r->text_buf = text_buf; + r->text_buf_size = text_buf_size; +} + +/** + * prb_for_each_record() - Iterate over the records of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @r: A printk_record to store the record on each iteration. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_record(from, rb, s, r) \ +for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1) + +/** + * prb_for_each_info() - Iterate over the meta data of a ringbuffer. + * + * @from: The sequence number to begin with. + * @rb: The ringbuffer to iterate over. + * @s: A u64 to store the sequence number on each iteration. + * @i: A printk_info to store the record meta data on each iteration. + * @lc: An unsigned int to store the text line count of each record. + * + * This is a macro for conveniently iterating over a ringbuffer. + * Note that @s may not be the sequence number of the record on each + * iteration. For the sequence number, @r->info->seq should be checked. + * + * Context: Any context. + */ +#define prb_for_each_info(from, rb, s, i, lc) \ +for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1) + +bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, + struct printk_record *r); +bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, + struct printk_info *info, unsigned int *line_count); + +u64 prb_first_valid_seq(struct printk_ringbuffer *rb); +u64 prb_next_seq(struct printk_ringbuffer *rb); + +#endif /* _KERNEL_PRINTK_RINGBUFFER_H */ diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 50aeae770434..a0e6f746de6c 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -22,7 +22,7 @@ * is later flushed into the main ring buffer via IRQ work. * * The alternative implementation is chosen transparently - * by examinig current printk() context mask stored in @printk_context + * by examining current printk() context mask stored in @printk_context * per-CPU variable. * * The implementation allows to flush the strings also from another CPU. @@ -375,7 +375,7 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) raw_spin_trylock(&logbuf_lock)) { int len; - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); raw_spin_unlock(&logbuf_lock); defer_console_output(); return len; diff --git a/kernel/range.c b/kernel/range.c index d84de6766472..56435f96da73 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -2,8 +2,9 @@ /* * Range add and subtract */ -#include <linux/kernel.h> #include <linux/init.h> +#include <linux/minmax.h> +#include <linux/printk.h> #include <linux/sort.h> #include <linux/string.h> #include <linux/range.h> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0ebe15a84985..b71e21f73c40 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -135,10 +135,12 @@ config RCU_FANOUT config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT + range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 3 if RCU_STRICT_GRACE_PERIOD depends on TREE_RCU && RCU_EXPERT - default 16 + default 16 if !RCU_STRICT_GRACE_PERIOD + default 2 if RCU_STRICT_GRACE_PERIOD help This option controls the leaf-level fanout of hierarchical implementations of RCU, and allows trading off cache misses diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..1942c1f1bb65 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -23,7 +23,7 @@ config TORTURE_TEST tristate default n -config RCU_PERF_TEST +config RCU_SCALE_TEST tristate "performance tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST @@ -114,4 +114,19 @@ config RCU_EQS_DEBUG Say N here if you need ultimate kernel/user switch latencies Say Y if you are unsure +config RCU_STRICT_GRACE_PERIOD + bool "Provide debug RCU implementation with short grace periods" + depends on DEBUG_KERNEL && RCU_EXPERT + default n + select PREEMPT_COUNT if PREEMPT=n + help + Select this option to build an RCU variant that is strict about + grace periods, making them as short as it can. This limits + scalability, destroys real-time response, degrades battery + lifetime and kills performance. Don't try this on large + machines, as in systems with more than about 10 or 20 CPUs. + But in conjunction with tools like KASAN, it can be helpful + when looking for certain types of RCU usage bugs, for example, + too-short RCU read-side critical sections. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 95f5117ef8da..0cfb009a99b9 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -11,7 +11,7 @@ obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9a0f66133b4b..2d2a6b6b9dfb 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -475,8 +475,16 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * Also advance to the oldest segment of callbacks whose * ->gp_seq[] completion is at or after that passed in via "seq", * skipping any empty segments. + * + * Note that segment "i" (and any lower-numbered segments + * containing older callbacks) will be unaffected, and their + * grace-period numbers remain unchanged. For example, if i == + * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched. + * Instead, the CBs in NEXT_TAIL will be merged with those in + * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL + * would be updated. NEXT_TAIL would then be empty. */ - if (++i >= RCU_NEXT_TAIL) + if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; /* diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuscale.c index 21448d3374e2..2819b95479af 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuscale.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update module-based performance-test facility + * Read-Copy Update module-based scalability-test facility * * Copyright (C) IBM Corporation, 2015 * @@ -44,13 +44,13 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); -#define PERF_FLAG "-perf:" -#define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) -#define VERBOSE_PERFOUT_STRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) -#define VERBOSE_PERFOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +#define SCALE_FLAG "-scale:" +#define SCALEOUT_STRING(s) \ + pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) +#define VERBOSE_SCALEOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) +#define VERBOSE_SCALEOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0) /* * The intended use cases for the nreaders and nwriters module parameters @@ -61,25 +61,25 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); * nr_cpus for a mixed reader/writer test. * * 2. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nreaders to zero. This will set nwriters to the + * rcuscale.nreaders to zero. This will set nwriters to the * value specified by nr_cpus for an update-only test. * * 3. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nwriters to zero. This will set nreaders to the + * rcuscale.nwriters to zero. This will set nreaders to the * value specified by nr_cpus for a read-only test. * * Various other use cases may of course be specified. * * Note that this test's readers are intended only as a test load for - * the writers. The reader performance statistics will be overly + * the writers. The reader scalability statistics will be overly * pessimistic due to the per-critical-section interrupt disabling, * test-end checks, and the pair of calls through pointers. */ #ifdef MODULE -# define RCUPERF_SHUTDOWN 0 +# define RCUSCALE_SHUTDOWN 0 #else -# define RCUPERF_SHUTDOWN 1 +# define RCUSCALE_SHUTDOWN 1 #endif torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); @@ -88,16 +88,16 @@ torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, RCUPERF_SHUTDOWN, - "Shutdown at end of performance tests."); +torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); -torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); static int nrealreaders; static int nrealwriters; @@ -107,12 +107,12 @@ static struct task_struct *shutdown_task; static u64 **writer_durations; static int *writer_n_durations; -static atomic_t n_rcu_perf_reader_started; -static atomic_t n_rcu_perf_writer_started; -static atomic_t n_rcu_perf_writer_finished; +static atomic_t n_rcu_scale_reader_started; +static atomic_t n_rcu_scale_writer_started; +static atomic_t n_rcu_scale_writer_finished; static wait_queue_head_t shutdown_wq; -static u64 t_rcu_perf_writer_started; -static u64 t_rcu_perf_writer_finished; +static u64 t_rcu_scale_writer_started; +static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; static DEFINE_PER_CPU(atomic_t, n_async_inflight); @@ -124,7 +124,7 @@ static DEFINE_PER_CPU(atomic_t, n_async_inflight); * Operations vector for selecting different types of tests. */ -struct rcu_perf_ops { +struct rcu_scale_ops { int ptype; void (*init)(void); void (*cleanup)(void); @@ -140,19 +140,19 @@ struct rcu_perf_ops { const char *name; }; -static struct rcu_perf_ops *cur_ops; +static struct rcu_scale_ops *cur_ops; /* - * Definitions for rcu perf testing. + * Definitions for rcu scalability testing. */ -static int rcu_perf_read_lock(void) __acquires(RCU) +static int rcu_scale_read_lock(void) __acquires(RCU) { rcu_read_lock(); return 0; } -static void rcu_perf_read_unlock(int idx) __releases(RCU) +static void rcu_scale_read_unlock(int idx) __releases(RCU) { rcu_read_unlock(); } @@ -162,15 +162,15 @@ static unsigned long __maybe_unused rcu_no_completed(void) return 0; } -static void rcu_sync_perf_init(void) +static void rcu_sync_scale_init(void) { } -static struct rcu_perf_ops rcu_ops = { +static struct rcu_scale_ops rcu_ops = { .ptype = RCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_perf_read_lock, - .readunlock = rcu_perf_read_unlock, + .init = rcu_sync_scale_init, + .readlock = rcu_scale_read_lock, + .readunlock = rcu_scale_read_unlock, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .exp_completed = rcu_exp_batches_completed, @@ -182,23 +182,23 @@ static struct rcu_perf_ops rcu_ops = { }; /* - * Definitions for srcu perf testing. + * Definitions for srcu scalability testing. */ -DEFINE_STATIC_SRCU(srcu_ctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; +DEFINE_STATIC_SRCU(srcu_ctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale; -static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) +static int srcu_scale_read_lock(void) __acquires(srcu_ctlp) { return srcu_read_lock(srcu_ctlp); } -static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) +static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp) { srcu_read_unlock(srcu_ctlp, idx); } -static unsigned long srcu_perf_completed(void) +static unsigned long srcu_scale_completed(void) { return srcu_batches_completed(srcu_ctlp); } @@ -213,78 +213,78 @@ static void srcu_rcu_barrier(void) srcu_barrier(srcu_ctlp); } -static void srcu_perf_synchronize(void) +static void srcu_scale_synchronize(void) { synchronize_srcu(srcu_ctlp); } -static void srcu_perf_synchronize_expedited(void) +static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); } -static struct rcu_perf_ops srcu_ops = { +static struct rcu_scale_ops srcu_ops = { .ptype = SRCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, + .init = rcu_sync_scale_init, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, + .exp_completed = srcu_scale_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, .name = "srcu" }; static struct srcu_struct srcud; -static void srcu_sync_perf_init(void) +static void srcu_sync_scale_init(void) { srcu_ctlp = &srcud; init_srcu_struct(srcu_ctlp); } -static void srcu_sync_perf_cleanup(void) +static void srcu_sync_scale_cleanup(void) { cleanup_srcu_struct(srcu_ctlp); } -static struct rcu_perf_ops srcud_ops = { +static struct rcu_scale_ops srcud_ops = { .ptype = SRCU_FLAVOR, - .init = srcu_sync_perf_init, - .cleanup = srcu_sync_perf_cleanup, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, + .init = srcu_sync_scale_init, + .cleanup = srcu_sync_scale_cleanup, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, + .exp_completed = srcu_scale_completed, .async = srcu_call_rcu, .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, .name = "srcud" }; /* - * Definitions for RCU-tasks perf testing. + * Definitions for RCU-tasks scalability testing. */ -static int tasks_perf_read_lock(void) +static int tasks_scale_read_lock(void) { return 0; } -static void tasks_perf_read_unlock(int idx) +static void tasks_scale_read_unlock(int idx) { } -static struct rcu_perf_ops tasks_ops = { +static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = tasks_perf_read_lock, - .readunlock = tasks_perf_read_unlock, + .init = rcu_sync_scale_init, + .readlock = tasks_scale_read_lock, + .readunlock = tasks_scale_read_unlock, .get_gp_seq = rcu_no_completed, .gp_diff = rcu_seq_diff, .async = call_rcu_tasks, @@ -294,7 +294,7 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) +static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) return new - old; @@ -302,60 +302,60 @@ static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) } /* - * If performance tests complete, wait for shutdown to commence. + * If scalability tests complete, wait for shutdown to commence. */ -static void rcu_perf_wait_shutdown(void) +static void rcu_scale_wait_shutdown(void) { cond_resched_tasks_rcu_qs(); - if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) + if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters) return; while (!torture_must_stop()) schedule_timeout_uninterruptible(1); } /* - * RCU perf reader kthread. Repeatedly does empty RCU read-side critical - * section, minimizing update-side interference. However, the point of - * this test is not to evaluate reader performance, but instead to serve - * as a test load for update-side performance testing. + * RCU scalability reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. However, the + * point of this test is not to evaluate reader scalability, but instead + * to serve as a test load for update-side scalability testing. */ static int -rcu_perf_reader(void *arg) +rcu_scale_reader(void *arg) { unsigned long flags; int idx; long me = (long)arg; - VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); + VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); - atomic_inc(&n_rcu_perf_reader_started); + atomic_inc(&n_rcu_scale_reader_started); do { local_irq_save(flags); idx = cur_ops->readlock(); cur_ops->readunlock(idx); local_irq_restore(flags); - rcu_perf_wait_shutdown(); + rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - torture_kthread_stopping("rcu_perf_reader"); + torture_kthread_stopping("rcu_scale_reader"); return 0; } /* - * Callback function for asynchronous grace periods from rcu_perf_writer(). + * Callback function for asynchronous grace periods from rcu_scale_writer(). */ -static void rcu_perf_async_cb(struct rcu_head *rhp) +static void rcu_scale_async_cb(struct rcu_head *rhp) { atomic_dec(this_cpu_ptr(&n_async_inflight)); kfree(rhp); } /* - * RCU perf writer kthread. Repeatedly does a grace period. + * RCU scale writer kthread. Repeatedly does a grace period. */ static int -rcu_perf_writer(void *arg) +rcu_scale_writer(void *arg) { int i = 0; int i_max; @@ -366,7 +366,7 @@ rcu_perf_writer(void *arg) u64 *wdp; u64 *wdpp = writer_durations[me]; - VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); + VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sched_set_fifo_low(current); @@ -383,8 +383,8 @@ rcu_perf_writer(void *arg) schedule_timeout_uninterruptible(1); t = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { - t_rcu_perf_writer_started = t; + if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) { + t_rcu_scale_writer_started = t; if (gp_exp) { b_rcu_gp_test_started = cur_ops->exp_completed() / 2; @@ -404,7 +404,7 @@ retry: rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_perf_async_cb); + cur_ops->async(rhp, rcu_scale_async_cb); rhp = NULL; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); @@ -421,19 +421,19 @@ retry: *wdp = t - *wdp; i_max = i; if (!started && - atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) + atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) started = true; if (!done && i >= MIN_MEAS) { done = true; sched_set_normal(current, 0); - pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", - perf_type, PERF_FLAG, me, MIN_MEAS); - if (atomic_inc_return(&n_rcu_perf_writer_finished) >= + pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", + scale_type, SCALE_FLAG, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_scale_writer_finished) >= nrealwriters) { schedule_timeout_interruptible(10); rcu_ftrace_dump(DUMP_ALL); - PERFOUT_STRING("Test complete"); - t_rcu_perf_writer_finished = t; + SCALEOUT_STRING("Test complete"); + t_rcu_scale_writer_finished = t; if (gp_exp) { b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; @@ -448,30 +448,30 @@ retry: } } if (done && !alldone && - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; if (started && !alldone && i < MAX_MEAS - 1) i++; - rcu_perf_wait_shutdown(); + rcu_scale_wait_shutdown(); } while (!torture_must_stop()); if (gp_async) { cur_ops->gp_barrier(); } writer_n_durations[me] = i_max; - torture_kthread_stopping("rcu_perf_writer"); + torture_kthread_stopping("rcu_scale_writer"); return 0; } static void -rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) +rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { - pr_alert("%s" PERF_FLAG + pr_alert("%s" SCALE_FLAG "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); + scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); } static void -rcu_perf_cleanup(void) +rcu_scale_cleanup(void) { int i; int j; @@ -484,11 +484,11 @@ rcu_perf_cleanup(void) * during the mid-boot phase, so have to wait till the end. */ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); if (rcu_gp_is_normal() && gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); if (gp_exp && gp_async) - VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); + VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); if (torture_cleanup_begin()) return; @@ -499,30 +499,30 @@ rcu_perf_cleanup(void) if (reader_tasks) { for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_perf_reader, + torture_stop_kthread(rcu_scale_reader, reader_tasks[i]); kfree(reader_tasks); } if (writer_tasks) { for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_perf_writer, + torture_stop_kthread(rcu_scale_writer, writer_tasks[i]); if (!writer_n_durations) continue; j = writer_n_durations[i]; pr_alert("%s%s writer %d gps: %d\n", - perf_type, PERF_FLAG, i, j); + scale_type, SCALE_FLAG, i, j); ngps += j; } pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - perf_type, PERF_FLAG, - t_rcu_perf_writer_started, t_rcu_perf_writer_finished, - t_rcu_perf_writer_finished - - t_rcu_perf_writer_started, + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, ngps, - rcuperf_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); for (i = 0; i < nrealwriters; i++) { if (!writer_durations) break; @@ -534,7 +534,7 @@ rcu_perf_cleanup(void) for (j = 0; j <= writer_n_durations[i]; j++) { wdp = &wdpp[j]; pr_alert("%s%s %4d writer-duration: %5d %llu\n", - perf_type, PERF_FLAG, + scale_type, SCALE_FLAG, i, j, *wdp); if (j % 100 == 0) schedule_timeout_uninterruptible(1); @@ -573,22 +573,22 @@ static int compute_real(int n) } /* - * RCU perf shutdown kthread. Just waits to be awakened, then shuts + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts * down system. */ static int -rcu_perf_shutdown(void *arg) +rcu_scale_shutdown(void *arg) { wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); smp_mb(); /* Wake before output. */ - rcu_perf_cleanup(); + rcu_scale_cleanup(); kernel_power_off(); return -EINVAL; } /* - * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number + * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -598,8 +598,8 @@ torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num alloc static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; -static atomic_t n_kfree_perf_thread_started; -static atomic_t n_kfree_perf_thread_ended; +static atomic_t n_kfree_scale_thread_started; +static atomic_t n_kfree_scale_thread_ended; struct kfree_obj { char kfree_obj[8]; @@ -607,7 +607,7 @@ struct kfree_obj { }; static int -kfree_perf_thread(void *arg) +kfree_scale_thread(void *arg) { int i, loop = 0; long me = (long)arg; @@ -615,13 +615,13 @@ kfree_perf_thread(void *arg) u64 start_time, end_time; long long mem_begin, mem_during = 0; - VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); + VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); start_time = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { + if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) { if (gp_exp) b_rcu_gp_test_started = cur_ops->exp_completed() / 2; else @@ -646,7 +646,7 @@ kfree_perf_thread(void *arg) cond_resched(); } while (!torture_must_stop() && ++loop < kfree_loops); - if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { + if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) { end_time = ktime_get_mono_fast_ns(); if (gp_exp) @@ -656,7 +656,7 @@ kfree_perf_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); if (shutdown) { @@ -665,12 +665,12 @@ kfree_perf_thread(void *arg) } } - torture_kthread_stopping("kfree_perf_thread"); + torture_kthread_stopping("kfree_scale_thread"); return 0; } static void -kfree_perf_cleanup(void) +kfree_scale_cleanup(void) { int i; @@ -679,7 +679,7 @@ kfree_perf_cleanup(void) if (kfree_reader_tasks) { for (i = 0; i < kfree_nrealthreads; i++) - torture_stop_kthread(kfree_perf_thread, + torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); } @@ -691,20 +691,20 @@ kfree_perf_cleanup(void) * shutdown kthread. Just waits to be awakened, then shuts down system. */ static int -kfree_perf_shutdown(void *arg) +kfree_scale_shutdown(void *arg) { wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ - kfree_perf_cleanup(); + kfree_scale_cleanup(); kernel_power_off(); return -EINVAL; } static int __init -kfree_perf_init(void) +kfree_scale_init(void) { long i; int firsterr = 0; @@ -713,7 +713,7 @@ kfree_perf_init(void) /* Start up the kthreads. */ if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, + firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -730,13 +730,13 @@ kfree_perf_init(void) } for (i = 0; i < kfree_nrealthreads; i++) { - firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, + firsterr = torture_create_kthread(kfree_scale_thread, (void *)i, kfree_reader_tasks[i]); if (firsterr) goto unwind; } - while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) + while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads) schedule_timeout_uninterruptible(1); torture_init_end(); @@ -744,35 +744,35 @@ kfree_perf_init(void) unwind: torture_init_end(); - kfree_perf_cleanup(); + kfree_scale_cleanup(); return firsterr; } static int __init -rcu_perf_init(void) +rcu_scale_init(void) { long i; int firsterr = 0; - static struct rcu_perf_ops *perf_ops[] = { + static struct rcu_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, }; - if (!torture_init_begin(perf_type, verbose)) + if (!torture_init_begin(scale_type, verbose)) return -EBUSY; - /* Process args and tell the world that the perf'er is on the job. */ - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) + /* Process args and announce that the scalability'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) break; } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -781,20 +781,20 @@ rcu_perf_init(void) cur_ops->init(); if (kfree_rcu_test) - return kfree_perf_init(); + return kfree_scale_init(); nrealwriters = compute_real(nwriters); nrealreaders = compute_real(nreaders); - atomic_set(&n_rcu_perf_reader_started, 0); - atomic_set(&n_rcu_perf_writer_started, 0); - atomic_set(&n_rcu_perf_writer_finished, 0); - rcu_perf_print_module_parms(cur_ops, "Start of test"); + atomic_set(&n_rcu_scale_reader_started, 0); + atomic_set(&n_rcu_scale_writer_started, 0); + atomic_set(&n_rcu_scale_writer_finished, 0); + rcu_scale_print_module_parms(cur_ops, "Start of test"); /* Start up the kthreads. */ if (shutdown) { init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, + firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, shutdown_task); if (firsterr) goto unwind; @@ -803,17 +803,17 @@ rcu_perf_init(void) reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, + firsterr = torture_create_kthread(rcu_scale_reader, (void *)i, reader_tasks[i]); if (firsterr) goto unwind; } - while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) + while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), GFP_KERNEL); @@ -823,7 +823,7 @@ rcu_perf_init(void) kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -835,7 +835,7 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, + firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (firsterr) goto unwind; @@ -845,9 +845,9 @@ rcu_perf_init(void) unwind: torture_init_end(); - rcu_perf_cleanup(); + rcu_scale_cleanup(); return firsterr; } -module_init(rcu_perf_init); -module_exit(rcu_perf_cleanup); +module_init(rcu_scale_init); +module_exit(rcu_scale_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f453bf8d2f1e..916ea4f66e4b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,19 +52,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ #define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) @@ -100,6 +87,7 @@ torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); @@ -185,6 +173,7 @@ static long n_barrier_successes; /* did rcu_barrier test succeed? */ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; +static unsigned long start_gp_seq; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -1413,6 +1402,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) preempt_enable(); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + // This next splat is expected behavior if leakpointer, especially + // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. + WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { @@ -1808,6 +1800,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; +static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; static bool rcu_fwd_emergency_stop; @@ -2074,8 +2067,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = rcu_fwds; + struct rcu_fwd *rfp; + mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; + if (!rfp) { + mutex_unlock(&rcu_fwd_mutex); + return NOTIFY_OK; + } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(rfp); @@ -2093,6 +2092,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); + mutex_unlock(&rcu_fwd_mutex); return NOTIFY_OK; } @@ -2114,13 +2114,11 @@ static int rcu_torture_fwd_prog(void *args) do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); - register_oom_notifier(&rcutorture_oom_nb); if (!IS_ENABLED(CONFIG_TINY_RCU) || rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); if (rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_cr(rfp); - unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); @@ -2160,9 +2158,26 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + mutex_lock(&rcu_fwd_mutex); + rcu_fwds = rfp; + mutex_unlock(&rcu_fwd_mutex); + register_oom_notifier(&rcutorture_oom_nb); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } +static void rcu_torture_fwd_prog_cleanup(void) +{ + struct rcu_fwd *rfp; + + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rfp = rcu_fwds; + mutex_lock(&rcu_fwd_mutex); + rcu_fwds = NULL; + mutex_unlock(&rcu_fwd_mutex); + unregister_oom_notifier(&rcutorture_oom_nb); + kfree(rfp); +} + /* Callback function for RCU barrier testing. */ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -2460,7 +2475,7 @@ rcu_torture_cleanup(void) show_rcu_gp_kthreads(); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rcu_torture_fwd_prog_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); @@ -2482,8 +2497,9 @@ rcu_torture_cleanup(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); - pr_alert("%s: End-test grace-period state: g%lu f%#x\n", - cur_ops->name, gp_seq, flags); + pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", + cur_ops->name, (long)gp_seq, flags, + rcutorture_seq_diff(gp_seq, start_gp_seq)); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); if (rcu_torture_can_boost()) @@ -2607,6 +2623,8 @@ rcu_torture_init(void) long i; int cpu; int firsterr = 0; + int flags = 0; + unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, @@ -2649,6 +2667,11 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + start_gp_seq = gp_seq; + pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", + cur_ops->name, (long)gp_seq, flags); /* Set up the freelist. */ diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d9291f883b54..952595c678b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -546,9 +546,11 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); + if (!errexit) { + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + } for (exp = 0; exp < nruns; exp++) { u64 avg; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c100acf332ed..c13348ee80a5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,19 +29,6 @@ #include "rcu.h" #include "rcu_segcblist.h" -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 05d3e1375e4c..d5d9f2d03e8a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -28,6 +28,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). + * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping. + * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. * @n_gps: Number of grace periods completed since boot. @@ -48,6 +50,8 @@ struct rcu_tasks { struct wait_queue_head cbs_wq; raw_spinlock_t cbs_lock; int gp_state; + int gp_sleep; + int init_fract; unsigned long gp_jiffies; unsigned long gp_start; unsigned long n_gps; @@ -81,7 +85,7 @@ static struct rcu_tasks rt_name = \ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Avoid IPIing CPUs early in the grace period. */ -#define RCU_TASK_IPI_DELAY (HZ / 2) +#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; module_param(rcu_task_ipi_delay, int, 0644); @@ -231,7 +235,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) cond_resched(); } /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_idle(HZ/10); + schedule_timeout_idle(rtp->gp_sleep); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } @@ -329,8 +333,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) */ lastreport = jiffies; - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */ - fract = 10; + // Start off with initial wait and slowly back off to 1 HZ wait. + fract = rtp->init_fract; + if (fract > HZ) + fract = HZ; for (;;) { bool firstreport; @@ -553,6 +559,8 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + rcu_tasks.gp_sleep = HZ / 10; + rcu_tasks.init_fract = 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; @@ -685,6 +693,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); static int __init rcu_spawn_tasks_rude_kthread(void) { + rcu_tasks_rude.gp_sleep = HZ / 10; rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -745,9 +754,9 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); // The number of detections of task quiescent state relying on // heavyweight readers executing explicit memory barriers. -unsigned long n_heavy_reader_attempts; -unsigned long n_heavy_reader_updates; -unsigned long n_heavy_reader_ofl_updates; +static unsigned long n_heavy_reader_attempts; +static unsigned long n_heavy_reader_updates; +static unsigned long n_heavy_reader_ofl_updates; void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, @@ -821,6 +830,12 @@ static void trc_read_check_handler(void *t_in) WRITE_ONCE(t->trc_reader_checked, true); goto reset_ipi; } + // If we are racing with an rcu_read_unlock_trace(), try again later. + if (unlikely(t->trc_reader_nesting < 0)) { + if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) + wake_up(&trc_wait); + goto reset_ipi; + } WRITE_ONCE(t->trc_reader_checked, true); // Get here if the task is in a read-side critical section. Set @@ -911,7 +926,8 @@ static void trc_wait_for_one_reader(struct task_struct *t, // If currently running, send an IPI, either way, add to list. trc_add_holdout(t, bhp); - if (task_curr(t) && time_after(jiffies, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { + if (task_curr(t) && + time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) { // The task is currently running, so try IPIing it. cpu = task_cpu(t); @@ -1072,15 +1088,17 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) if (ret) break; // Count reached zero. // Stall warning time, so make a list of the offenders. + rcu_read_lock(); for_each_process_thread(g, t) if (READ_ONCE(t->trc_reader_special.b.need_qs)) trc_add_holdout(t, &holdouts); + rcu_read_unlock(); firstreport = true; - list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) - if (READ_ONCE(t->trc_reader_special.b.need_qs)) { + list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { + if (READ_ONCE(t->trc_reader_special.b.need_qs)) show_stalled_task_trace(t, &firstreport); - trc_del_holdout(t); - } + trc_del_holdout(t); // Release task_struct reference. + } if (firstreport) pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); show_stalled_ipi_trace(); @@ -1163,6 +1181,17 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); static int __init rcu_spawn_tasks_trace_kthread(void) { + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { + rcu_tasks_trace.gp_sleep = HZ / 10; + rcu_tasks_trace.init_fract = 10; + } else { + rcu_tasks_trace.gp_sleep = HZ / 200; + if (rcu_tasks_trace.gp_sleep <= 0) + rcu_tasks_trace.gp_sleep = 1; + rcu_tasks_trace.init_fract = HZ / 5; + if (rcu_tasks_trace.init_fract <= 0) + rcu_tasks_trace.init_fract = 1; + } rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f78ee759af9c..06895ef85d69 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,19 +70,6 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Data structures. */ /* @@ -178,6 +165,12 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached @@ -468,24 +461,25 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ -#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ +#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) + // Maximum callbacks per rcu_do_batch ... +#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood. static long blimit = DEFAULT_RCU_BLIMIT; -#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit. static long qhimark = DEFAULT_RCU_QHIMARK; -#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit. static long qlowmark = DEFAULT_RCU_QLOMARK; #define DEFAULT_RCU_QOVLD_MULT 2 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) -static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ -static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ +static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS. +static long qovld_calc = -1; // No pre-initialization lock acquisitions! module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); module_param(qovld, long, 0444); -static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; static int rcu_divisor = 7; @@ -1092,11 +1086,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } -noinstr bool __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * @@ -1229,13 +1218,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* If waiting too long on an offline CPU, complain. */ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rcu_state.gp_start + HZ)) { + /* + * Complain if a CPU that is considered to be offline from RCU's + * perspective has not yet reported a quiescent state. After all, + * the offline CPU should have reported a quiescent state during + * the CPU-offline process, or, failing that, by rcu_gp_init() + * if it ran concurrently with either the CPU going offline or the + * last task on a leaf rcu_node structure exiting its RCU read-side + * critical section while all CPUs corresponding to that structure + * are offline. This added warning detects bugs in any of these + * code paths. + * + * The rcu_node structure's ->lock is held here, which excludes + * the relevant portions the CPU-hotplug code, the grace-period + * initialization code, and the rcu_read_unlock() code paths. + * + * For more detail, please refer to the "Hotplug CPU" section + * of RCU's Requirements documentation. + */ + if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { bool onl; struct rcu_node *rnp1; - WARN_ON(1); /* Offline CPUs are supposed to report QS! */ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, (long)rnp->gp_seq, (long)rnp->completedqs); @@ -1498,9 +1502,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB")); else - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + return ret; } @@ -1576,6 +1581,19 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, } /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a + * quiescent state. This is intended to be invoked when the CPU notices + * a new grace period. + */ +static void rcu_strict_gp_check_qs(void) +{ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +/* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. @@ -1645,6 +1663,7 @@ static void note_gp_changes(struct rcu_data *rdp) } needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_strict_gp_check_qs(); if (needwake) rcu_gp_kthread_wake(); } @@ -1683,6 +1702,15 @@ static void rcu_gp_torture_wait(void) } /* + * Handler for on_each_cpu() to invoke the target CPU's RCU core + * processing. + */ +static void rcu_strict_gp_boundary(void *unused) +{ + invoke_rcu_core(); +} + +/* * Initialize a new grace period. Return false if no grace period required. */ static bool rcu_gp_init(void) @@ -1720,10 +1748,13 @@ static bool rcu_gp_init(void) raw_spin_unlock_irq_rcu_node(rnp); /* - * Apply per-leaf buffered online and offline operations to the - * rcu_node tree. Note that this new grace period need not wait - * for subsequent online CPUs, and that quiescent-state forcing - * will handle subsequent offline CPUs. + * Apply per-leaf buffered online and offline operations to + * the rcu_node tree. Note that this new grace period need not + * wait for subsequent online CPUs, and that RCU hooks in the CPU + * offlining path, when combined with checks in this function, + * will handle CPUs that are currently going offline or that will + * go offline later. Please also refer to "Hotplug CPU" section + * of RCU's Requirements documentation. */ rcu_state.gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rnp) { @@ -1810,6 +1841,10 @@ static bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); } + // If strict, make all CPUs aware of new grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + return true; } @@ -1898,7 +1933,7 @@ static void rcu_gp_fqs_loop(void) break; /* If time for quiescent-state forcing, do it. */ if (!time_after(rcu_state.jiffies_force_qs, jiffies) || - (gf & RCU_GP_FLAG_FQS)) { + (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); @@ -2026,6 +2061,10 @@ static void rcu_gp_cleanup(void) rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); + + // If strict, make all CPUs aware of the end of the old grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); } /* @@ -2204,7 +2243,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) +rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -2213,6 +2252,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || @@ -2229,8 +2269,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - if (rdp->cpu == smp_processor_id()) - rdp->core_needs_qs = false; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2279,7 +2318,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); } /* @@ -2376,6 +2415,7 @@ int rcutree_dead_cpu(unsigned int cpu) */ static void rcu_do_batch(struct rcu_data *rdp) { + int div; unsigned long flags; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -2404,9 +2444,15 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); - bl = max(rdp->blimit, pending >> rcu_divisor); - if (unlikely(bl > 100)) - tlimit = local_clock() + rcu_resched_ns; + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); + if (unlikely(bl > 100)) { + long rrn = READ_ONCE(rcu_resched_ns); + + rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; + tlimit = local_clock() + rrn; + } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); @@ -2547,8 +2593,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || - rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they * are all zero. But we might need to @@ -2616,6 +2661,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +// Workqueue handler for an RCU reader for kernels enforcing struct RCU +// grace periods. +static void strict_work_handler(struct work_struct *work) +{ + rcu_read_lock(); + rcu_read_unlock(); +} + /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { @@ -2660,6 +2713,10 @@ static __latent_entropy void rcu_core(void) /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); + + // If strict GPs, schedule an RCU reader in a clean environment. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } static void rcu_core_si(struct softirq_action *h) @@ -3022,6 +3079,12 @@ struct kfree_rcu_cpu_work { * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started + * @bkvcache: + * A simple cache list that contains objects for reuse purpose. + * In order to save some per-cpu space the list is singular. + * Even though it is lockless an access has to be protected by the + * per-cpu lock. + * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from @@ -3037,14 +3100,6 @@ struct kfree_rcu_cpu { bool monitor_todo; bool initialized; int count; - - /* - * A simple cache list that contains objects for - * reuse purpose. In order to save some per-cpu - * space the list is singular. Even though it is - * lockless an access has to be protected by the - * per-cpu lock. - */ struct llist_head bkvcache; int nr_bkv_objs; }; @@ -3445,7 +3500,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count = 0; /* Snapshot count of all CPUs */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); @@ -3460,7 +3515,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int cpu, freed = 0; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -3493,7 +3548,7 @@ void __init kfree_rcu_scheduler_running(void) int cpu; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); @@ -3857,6 +3912,7 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); + INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; @@ -3975,8 +4031,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -static DEFINE_PER_CPU(int, rcu_cpu_started); - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3996,12 +4050,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; bool newcpu; - if (per_cpu(rcu_cpu_started, cpu)) + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu_started) return; + rdp->cpu_started = true; - per_cpu(rcu_cpu_started, cpu) = 1; - - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4061,7 +4114,7 @@ void rcu_report_dead(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); - per_cpu(rcu_cpu_started, cpu) = 0; + rdp->cpu_started = false; } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..e4f66b8f7c47 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -156,6 +156,7 @@ struct rcu_data { bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ + bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ @@ -164,6 +165,7 @@ struct rcu_data { /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ bool defer_qs_iw_pending; /* Scheduler attention pending? */ + struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1888c0eb1216..8760b6ead770 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -732,11 +732,9 @@ static void rcu_exp_need_qs(void) /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { - struct rcu_data *rdp; - struct rcu_node *rnp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; - rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..fd8a52e9a887 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -36,6 +36,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -374,6 +376,8 @@ void __rcu_read_lock(void) rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread) + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -455,8 +459,14 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) return; } t->rcu_read_unlock_special.s = 0; - if (special.b.need_qs) - rcu_qs(); + if (special.b.need_qs) { + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_report_qs_rdp(rdp); + udelay(rcu_unlock_delay); + } else { + rcu_qs(); + } + } /* * Respond to a request by an expedited grace period for a @@ -769,6 +779,24 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #else /* #ifdef CONFIG_PREEMPT_RCU */ /* + * If strict grace periods are enabled, and if the calling + * __rcu_read_unlock() marks the beginning of a quiescent state, immediately + * report that quiescent state and, if requested, spin for a bit. + */ +void rcu_read_unlock_strict(void) +{ + struct rcu_data *rdp; + + if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + return; + rdp = this_cpu_ptr(&rcu_data); + rcu_report_qs_rdp(rdp); + udelay(rcu_unlock_delay); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); + +/* * Tell them what RCU they are running. */ static void __init rcu_bootup_announce(void) @@ -1926,6 +1954,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * nearest grace period (if any) to wait for next. The CB kthreads * and the global grace-period kthread are awakened if needed. */ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); @@ -2411,13 +2440,12 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) return; waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wastimer = timer_pending(&rdp->nocb_timer); + wastimer = timer_pending(&rdp->nocb_bypass_timer); wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && - !waslocked && !wastimer && !wassleep) + if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ - pr_info(" !!! %c%c%c%c %c\n", + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", "lL"[waslocked], "dD"[!!rdp->nocb_defer_wakeup], "tT"[wastimer], diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b5d3b4794db4..0fde39b8daab 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -158,7 +158,7 @@ static void rcu_stall_kick_kthreads(void) { unsigned long j; - if (!rcu_kick_kthreads) + if (!READ_ONCE(rcu_kick_kthreads)) return; j = READ_ONCE(rcu_state.jiffies_kick_kthreads); if (time_after(jiffies, j) && rcu_state.gp_kthread && @@ -580,7 +580,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); @@ -623,7 +623,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && @@ -632,7 +632,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e0f4bcb558f..39334d2d2b37 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -53,19 +53,6 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); diff --git a/kernel/relay.c b/kernel/relay.c index fb4e0c530c08..b08d936d5fa7 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1002,7 +1002,7 @@ static int relay_file_read_avail(struct rchan_buf *buf) size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; size_t produced = buf->subbufs_produced; - size_t consumed = buf->subbufs_consumed; + size_t consumed; relay_file_read_consume(buf, 0, 0); diff --git a/kernel/resource.c b/kernel/resource.c index 841737bbda9e..3ae2f56cc79d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -382,10 +382,13 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p) { /* copy data */ - res->start = max(start, p->start); - res->end = min(end, p->end); - res->flags = p->flags; - res->desc = p->desc; + *res = (struct resource) { + .start = max(start, p->start), + .end = min(end, p->end), + .flags = p->flags, + .desc = p->desc, + .parent = p->parent, + }; } read_unlock(&resource_lock); @@ -1237,7 +1240,6 @@ EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE /** * release_mem_region_adjustable - release a previously reserved memory region - * @parent: parent resource descriptor * @start: resource start address * @size: resource region size * @@ -1255,21 +1257,28 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ -int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { + struct resource *parent = &iomem_resource; + struct resource *new_res = NULL; + bool alloc_nofail = false; struct resource **p; struct resource *res; - struct resource *new_res; resource_size_t end; - int ret = -EINVAL; end = start + size - 1; - if ((start < parent->start) || (end > parent->end)) - return ret; + if (WARN_ON_ONCE((start < parent->start) || (end > parent->end))) + return; - /* The alloc_resource() result gets checked later */ - new_res = alloc_resource(GFP_KERNEL); + /* + * We free up quite a lot of memory on memory hotunplug (esp., memap), + * just before releasing the region. This is highly unlikely to + * fail - let's play save and make it never fail as the caller cannot + * perform any error handling (e.g., trying to re-add memory will fail + * similarly). + */ +retry: + new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0)); p = &parent->child; write_lock(&resource_lock); @@ -1295,7 +1304,6 @@ int release_mem_region_adjustable(struct resource *parent, * so if we are dealing with them, let us just back off here. */ if (!(res->flags & IORESOURCE_SYSRAM)) { - ret = 0; break; } @@ -1312,20 +1320,23 @@ int release_mem_region_adjustable(struct resource *parent, /* free the whole entry */ *p = res->sibling; free_resource(res); - ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ - ret = __adjust_resource(res, end + 1, - res->end - end); + WARN_ON_ONCE(__adjust_resource(res, end + 1, + res->end - end)); } else if (res->start != start && res->end == end) { /* adjust the end */ - ret = __adjust_resource(res, res->start, - start - res->start); + WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start)); } else { - /* split into two entries */ + /* split into two entries - we need a new resource */ if (!new_res) { - ret = -ENOMEM; - break; + new_res = alloc_resource(GFP_ATOMIC); + if (!new_res) { + alloc_nofail = true; + write_unlock(&resource_lock); + goto retry; + } } new_res->name = res->name; new_res->start = end + 1; @@ -1336,9 +1347,8 @@ int release_mem_region_adjustable(struct resource *parent, new_res->sibling = res->sibling; new_res->child = NULL; - ret = __adjust_resource(res, res->start, - start - res->start); - if (ret) + if (WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start))) break; res->sibling = new_res; new_res = NULL; @@ -1349,10 +1359,69 @@ int release_mem_region_adjustable(struct resource *parent, write_unlock(&resource_lock); free_resource(new_res); - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ +#ifdef CONFIG_MEMORY_HOTPLUG +static bool system_ram_resources_mergeable(struct resource *r1, + struct resource *r2) +{ + /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */ + return r1->flags == r2->flags && r1->end + 1 == r2->start && + r1->name == r2->name && r1->desc == r2->desc && + !r1->child && !r2->child; +} + +/* + * merge_system_ram_resource - mark the System RAM resource mergeable and try to + * merge it with adjacent, mergeable resources + * @res: resource descriptor + * + * This interface is intended for memory hotplug, whereby lots of contiguous + * system ram resources are added (e.g., via add_memory*()) by a driver, and + * the actual resource boundaries are not of interest (e.g., it might be + * relevant for DIMMs). Only resources that are marked mergeable, that have the + * same parent, and that don't have any children are considered. All mergeable + * resources must be immutable during the request. + * + * Note: + * - The caller has to make sure that no pointers to resources that are + * marked mergeable are used anymore after this call - the resource might + * be freed and the pointer might be stale! + * - release_mem_region_adjustable() will split on demand on memory hotunplug + */ +void merge_system_ram_resource(struct resource *res) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *cur; + + if (WARN_ON_ONCE((res->flags & flags) != flags)) + return; + + write_lock(&resource_lock); + res->flags |= IORESOURCE_SYSRAM_MERGEABLE; + + /* Try to merge with next item in the list. */ + cur = res->sibling; + if (cur && system_ram_resources_mergeable(res, cur)) { + res->end = cur->end; + res->sibling = cur->sibling; + free_resource(cur); + } + + /* Try to merge with previous item in the list. */ + cur = res->parent->child; + while (cur && cur->sibling != res) + cur = cur->sibling; + if (cur && system_ram_resources_mergeable(cur, res)) { + cur->end = res->end; + cur->sibling = res->sibling; + free_resource(res); + } + write_unlock(&resource_lock); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + /* * Managed region resource */ diff --git a/kernel/scftorture.c b/kernel/scftorture.c new file mode 100644 index 000000000000..554a521ee235 --- /dev/null +++ b/kernel/scftorture.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Torture test for smp_call_function() and friends. +// +// Copyright (C) Facebook, 2020. +// +// Author: Paul E. McKenney <paulmck@kernel.org> + +#define pr_fmt(fmt) fmt + +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/torture.h> +#include <linux/types.h> + +#define SCFTORT_STRING "scftorture" +#define SCFTORT_FLAG SCFTORT_STRING ": " + +#define SCFTORTOUT(s, x...) \ + pr_alert(SCFTORT_FLAG s, ## x) + +#define VERBOSE_SCFTORTOUT(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0) + +#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_SCF_TORTURE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +torture_param(int, longwait, 0, "Include ridiculously long waits? (seconds)"); +torture_param(int, nthreads, -1, "# threads, defaults to -1 for all CPUs."); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s."); +torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); +torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); +torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); +torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); + +char *torture_type = ""; + +#ifdef MODULE +# define SCFTORT_SHUTDOWN 0 +#else +# define SCFTORT_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test."); + +struct scf_statistics { + struct task_struct *task; + int cpu; + long long n_single; + long long n_single_ofl; + long long n_single_wait; + long long n_single_wait_ofl; + long long n_many; + long long n_many_wait; + long long n_all; + long long n_all_wait; +}; + +static struct scf_statistics *scf_stats_p; +static struct task_struct *scf_torture_stats_task; +static DEFINE_PER_CPU(long long, scf_invoked_count); + +// Data for random primitive selection +#define SCF_PRIM_SINGLE 0 +#define SCF_PRIM_MANY 1 +#define SCF_PRIM_ALL 2 +#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each. + +static char *scf_prim_name[] = { + "smp_call_function_single", + "smp_call_function_many", + "smp_call_function", +}; + +struct scf_selector { + unsigned long scfs_weight; + int scfs_prim; + bool scfs_wait; +}; +static struct scf_selector scf_sel_array[SCF_NPRIMS]; +static int scf_sel_array_len; +static unsigned long scf_sel_totweight; + +// Communicate between caller and handler. +struct scf_check { + bool scfc_in; + bool scfc_out; + int scfc_cpu; // -1 for not _single(). + bool scfc_wait; +}; + +// Use to wait for all threads to start. +static atomic_t n_started; +static atomic_t n_errs; +static atomic_t n_mb_in_errs; +static atomic_t n_mb_out_errs; +static atomic_t n_alloc_errs; +static bool scfdone; +static char *bangstr = ""; + +static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); + +// Print torture statistics. Caller must ensure serialization. +static void scf_torture_stats_print(void) +{ + int cpu; + int i; + long long invoked_count = 0; + bool isdone = READ_ONCE(scfdone); + struct scf_statistics scfs = {}; + + for_each_possible_cpu(cpu) + invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); + for (i = 0; i < nthreads; i++) { + scfs.n_single += scf_stats_p[i].n_single; + scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_wait += scf_stats_p[i].n_single_wait; + scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; + scfs.n_many += scf_stats_p[i].n_many; + scfs.n_many_wait += scf_stats_p[i].n_many_wait; + scfs.n_all += scf_stats_p[i].n_all; + scfs.n_all_wait += scf_stats_p[i].n_all_wait; + } + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || + atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) + bangstr = "!!! "; + pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, + scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); + torture_onoff_stats(); + pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), + atomic_read(&n_mb_in_errs), atomic_read(&n_mb_out_errs), + atomic_read(&n_alloc_errs)); +} + +// Periodically prints torture statistics, if periodic statistics printing +// was specified via the stat_interval module parameter. +static int +scf_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("scf_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + scf_torture_stats_print(); + torture_shutdown_absorb("scf_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("scf_torture_stats"); + return 0; +} + +// Add a primitive to the scf_sel_array[]. +static void scf_sel_add(unsigned long weight, int prim, bool wait) +{ + struct scf_selector *scfsp = &scf_sel_array[scf_sel_array_len]; + + // If no weight, if array would overflow, if computing three-place + // percentages would overflow, or if the scf_prim_name[] array would + // overflow, don't bother. In the last three two cases, complain. + if (!weight || + WARN_ON_ONCE(scf_sel_array_len >= ARRAY_SIZE(scf_sel_array)) || + WARN_ON_ONCE(0 - 100000 * weight <= 100000 * scf_sel_totweight) || + WARN_ON_ONCE(prim >= ARRAY_SIZE(scf_prim_name))) + return; + scf_sel_totweight += weight; + scfsp->scfs_weight = scf_sel_totweight; + scfsp->scfs_prim = prim; + scfsp->scfs_wait = wait; + scf_sel_array_len++; +} + +// Dump out weighting percentages for scf_prim_name[] array. +static void scf_sel_dump(void) +{ + int i; + unsigned long oldw = 0; + struct scf_selector *scfsp; + unsigned long w; + + for (i = 0; i < scf_sel_array_len; i++) { + scfsp = &scf_sel_array[i]; + w = (scfsp->scfs_weight - oldw) * 100000 / scf_sel_totweight; + pr_info("%s: %3lu.%03lu %s(%s)\n", __func__, w / 1000, w % 1000, + scf_prim_name[scfsp->scfs_prim], + scfsp->scfs_wait ? "wait" : "nowait"); + oldw = scfsp->scfs_weight; + } +} + +// Randomly pick a primitive and wait/nowait, based on weightings. +static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) +{ + int i; + unsigned long w = torture_random(trsp) % (scf_sel_totweight + 1); + + for (i = 0; i < scf_sel_array_len; i++) + if (scf_sel_array[i].scfs_weight >= w) + return &scf_sel_array[i]; + WARN_ON_ONCE(1); + return &scf_sel_array[0]; +} + +// Update statistics and occasionally burn up mass quantities of CPU time, +// if told to do so via scftorture.longwait. Otherwise, occasionally burn +// a little bit. +static void scf_handler(void *scfc_in) +{ + int i; + int j; + unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + struct scf_check *scfcp = scfc_in; + + if (likely(scfcp)) { + WRITE_ONCE(scfcp->scfc_out, false); // For multiple receivers. + if (WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); + } + this_cpu_inc(scf_invoked_count); + if (longwait <= 0) { + if (!(r & 0xffc0)) + udelay(r & 0x3f); + goto out; + } + if (r & 0xfff) + goto out; + r = (r >> 12); + if (longwait <= 0) { + udelay((r & 0xff) + 1); + goto out; + } + r = r % longwait + 1; + for (i = 0; i < r; i++) { + for (j = 0; j < 1000; j++) { + udelay(1000); + cpu_relax(); + } + } +out: + if (unlikely(!scfcp)) + return; + if (scfcp->scfc_wait) + WRITE_ONCE(scfcp->scfc_out, true); + else + kfree(scfcp); +} + +// As above, but check for correct CPU. +static void scf_handler_1(void *scfc_in) +{ + struct scf_check *scfcp = scfc_in; + + if (likely(scfcp) && WARN_ONCE(smp_processor_id() != scfcp->scfc_cpu, "%s: Wanted CPU %d got CPU %d\n", __func__, scfcp->scfc_cpu, smp_processor_id())) { + atomic_inc(&n_errs); + } + scf_handler(scfcp); +} + +// Randomly do an smp_call_function*() invocation. +static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) +{ + uintptr_t cpu; + int ret = 0; + struct scf_check *scfcp = NULL; + struct scf_selector *scfsp = scf_sel_rand(trsp); + + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); + if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { + scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); + if (WARN_ON_ONCE(!scfcp)) { + atomic_inc(&n_alloc_errs); + } else { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = scfsp->scfs_wait; + scfcp->scfc_out = false; + } + } + switch (scfsp->scfs_prim) { + case SCF_PRIM_SINGLE: + cpu = torture_random(trsp) % nr_cpu_ids; + if (scfsp->scfs_wait) + scfp->n_single_wait++; + else + scfp->n_single++; + if (scfcp) { + scfcp->scfc_cpu = cpu; + barrier(); // Prevent race-reduction compiler optimizations. + scfcp->scfc_in = true; + } + ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); + if (ret) { + if (scfsp->scfs_wait) + scfp->n_single_wait_ofl++; + else + scfp->n_single_ofl++; + kfree(scfcp); + scfcp = NULL; + } + break; + case SCF_PRIM_MANY: + if (scfsp->scfs_wait) + scfp->n_many_wait++; + else + scfp->n_many++; + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. + scfcp->scfc_in = true; + } + smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); + break; + case SCF_PRIM_ALL: + if (scfsp->scfs_wait) + scfp->n_all_wait++; + else + scfp->n_all++; + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. + scfcp->scfc_in = true; + } + smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); + break; + default: + WARN_ON_ONCE(1); + if (scfcp) + scfcp->scfc_out = true; + } + if (scfcp && scfsp->scfs_wait) { + if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) && + !scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + barrier(); // Prevent race-reduction compiler optimizations. + } + if (use_cpus_read_lock) + cpus_read_unlock(); + else + preempt_enable(); + if (!(torture_random(trsp) & 0xfff)) + schedule_timeout_uninterruptible(1); +} + +// SCF test kthread. Repeatedly does calls to members of the +// smp_call_function() family of functions. +static int scftorture_invoker(void *arg) +{ + int cpu; + DEFINE_TORTURE_RANDOM(rand); + struct scf_statistics *scfp = (struct scf_statistics *)arg; + bool was_offline = false; + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); + cpu = scfp->cpu % nr_cpu_ids; + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + set_user_nice(current, MAX_NICE); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != scfp->cpu); + + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) { + if (torture_must_stop()) { + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended before starting", scfp->cpu); + goto end; + } + schedule_timeout_uninterruptible(1); + } + + VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu); + + do { + scftorture_invoke_one(scfp, &rand); + while (cpu_is_offline(cpu) && !torture_must_stop()) { + schedule_timeout_interruptible(HZ / 5); + was_offline = true; + } + if (was_offline) { + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + was_offline = false; + } + cond_resched(); + } while (!torture_must_stop()); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); +end: + torture_kthread_stopping("scftorture_invoker"); + return 0; +} + +static void +scftorture_print_module_parms(const char *tag) +{ + pr_alert(SCFTORT_FLAG + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); +} + +static void scf_cleanup_handler(void *unused) +{ +} + +static void scf_torture_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + WRITE_ONCE(scfdone, true); + if (nthreads) + for (i = 0; i < nthreads; i++) + torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); + else + goto end; + smp_call_function(scf_cleanup_handler, NULL, 0); + torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); + scf_torture_stats_print(); // -After- the stats thread is stopped! + kfree(scf_stats_p); // -After- the last stats print has completed! + scf_stats_p = NULL; + + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs)) + scftorture_print_module_parms("End of test: FAILURE"); + else if (torture_onoff_failures()) + scftorture_print_module_parms("End of test: LOCK_HOTPLUG"); + else + scftorture_print_module_parms("End of test: SUCCESS"); + +end: + torture_cleanup_end(); +} + +static int __init scf_torture_init(void) +{ + long i; + int firsterr = 0; + unsigned long weight_single1 = weight_single; + unsigned long weight_single_wait1 = weight_single_wait; + unsigned long weight_many1 = weight_many; + unsigned long weight_many_wait1 = weight_many_wait; + unsigned long weight_all1 = weight_all; + unsigned long weight_all_wait1 = weight_all_wait; + + if (!torture_init_begin(SCFTORT_STRING, verbose)) + return -EBUSY; + + scftorture_print_module_parms("Start of test"); + + if (weight_single == -1 && weight_single_wait == -1 && + weight_many == -1 && weight_many_wait == -1 && + weight_all == -1 && weight_all_wait == -1) { + weight_single1 = 2 * nr_cpu_ids; + weight_single_wait1 = 2 * nr_cpu_ids; + weight_many1 = 2; + weight_many_wait1 = 2; + weight_all1 = 1; + weight_all_wait1 = 1; + } else { + if (weight_single == -1) + weight_single1 = 0; + if (weight_single_wait == -1) + weight_single_wait1 = 0; + if (weight_many == -1) + weight_many1 = 0; + if (weight_many_wait == -1) + weight_many_wait1 = 0; + if (weight_all == -1) + weight_all1 = 0; + if (weight_all_wait == -1) + weight_all_wait1 = 0; + } + if (weight_single1 == 0 && weight_single_wait1 == 0 && + weight_many1 == 0 && weight_many_wait1 == 0 && + weight_all1 == 0 && weight_all_wait1 == 0) { + VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); + firsterr = -EINVAL; + goto unwind; + } + scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); + scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); + scf_sel_add(weight_many1, SCF_PRIM_MANY, false); + scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true); + scf_sel_add(weight_all1, SCF_PRIM_ALL, false); + scf_sel_add(weight_all_wait1, SCF_PRIM_ALL, true); + scf_sel_dump(); + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup); + if (firsterr) + goto unwind; + } + + // Worker tasks invoking smp_call_function(). + if (nthreads < 0) + nthreads = num_online_cpus(); + scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL); + if (!scf_stats_p) { + VERBOSE_SCFTORTOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads); + + atomic_set(&n_started, nthreads); + for (i = 0; i < nthreads; i++) { + scf_stats_p[i].cpu = i; + firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i], + scf_stats_p[i].task); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task); + if (firsterr) + goto unwind; + } + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + scf_torture_cleanup(); + return firsterr; +} + +module_init(scf_torture_init); +module_exit(scf_torture_cleanup); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8160ab5263f8..d2003a7d5ab5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -44,7 +44,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e39008242cf4..c03a5775d019 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -102,7 +102,8 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq) + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) return false; sg_policy->next_freq = next_freq; @@ -114,22 +115,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - struct cpufreq_policy *policy = sg_policy->policy; - int cpu; - - if (!sugov_update_next_freq(sg_policy, time, next_freq)) - return; - - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; - - policy->cur = next_freq; - - if (trace_cpu_frequency_enabled()) { - for_each_cpu(cpu, policy->cpus) - trace_cpu_frequency(next_freq, cpu); - } + if (sugov_update_next_freq(sg_policy, time, next_freq)) + cpufreq_driver_fast_switch(sg_policy->policy, next_freq); } static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, @@ -175,7 +162,8 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = map_util_freq(util, freq, max); - if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) return sg_policy->next_freq; sg_policy->need_freq_update = false; @@ -455,6 +443,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned long util, max; unsigned int next_f; bool busy; + unsigned int cached_freq = sg_policy->cached_raw_freq; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -478,8 +467,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; - /* Reset cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = 0; + /* Restore cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = cached_freq; } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6d93f4518734..f232305dcefe 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2504,7 +2504,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } const struct sched_class dl_sched_class - __attribute__((section("__dl_sched_class"))) = { + __section("__dl_sched_class") = { .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa4c6227cd6d..290f9e38378c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2928,7 +2928,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) curr->node_stamp += period; if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); + task_work_add(curr, work, TWA_RESUME); } } @@ -11159,7 +11159,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * All the scheduling class methods: */ const struct sched_class fair_sched_class - __attribute__((section("__fair_sched_class"))) = { + __section("__fair_sched_class") = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f324dc36fc43..24d0ee26377d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -458,7 +458,7 @@ static void update_curr_idle(struct rq *rq) * Simple, special scheduling class for the per-CPU idle tasks: */ const struct sched_class idle_sched_class - __attribute__((section("__idle_sched_class"))) = { + __section("__idle_sched_class") = { /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f215eea6a966..49ec096a8aa1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2430,7 +2430,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) } const struct sched_class rt_sched_class - __attribute__((section("__rt_sched_class"))) = { + __section("__rt_sched_class") = { .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 28709f6b0975..df80bfcea92e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1471,7 +1471,7 @@ struct sched_group_capacity { int id; #endif - unsigned long cpumask[0]; /* Balance mask */ + unsigned long cpumask[]; /* Balance mask */ }; struct sched_group { @@ -1629,7 +1629,7 @@ enum { #undef SCHED_FEAT -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * To support run-time toggling of sched features, all the translation units @@ -1637,6 +1637,7 @@ enum { */ extern const_debug unsigned int sysctl_sched_features; +#ifdef CONFIG_JUMP_LABEL #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1649,7 +1650,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */ +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -1665,7 +1672,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */ +#endif /* SCHED_DEBUG */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 394bc8126a1e..ceb5b6b12561 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -110,7 +110,7 @@ static void update_curr_stop(struct rq *rq) * Simple, special scheduling class for the per-CPU stop tasks: */ const struct sched_class stop_sched_class - __attribute__((section("__stop_sched_class"))) = { + __section("__stop_sched_class") = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 676d4af62103..8ad7a293255a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -196,6 +196,10 @@ struct seccomp_filter { */ static void populate_seccomp_data(struct seccomp_data *sd) { + /* + * Instead of using current_pt_reg(), we're already doing the work + * to safely fetch "current", so just use "task" everywhere below. + */ struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); unsigned long args[6]; @@ -910,7 +914,7 @@ out: if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) return 0; - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, current_pt_regs(), err, ret); return -1; } @@ -943,13 +947,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Set low-order bits as an errno, capped at MAX_ERRNO. */ if (data > MAX_ERRNO) data = MAX_ERRNO; - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, current_pt_regs(), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, current_pt_regs()); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; @@ -962,7 +966,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* ENOSYS these calls if there is no tracer attached. */ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { syscall_set_return_value(current, - task_pt_regs(current), + current_pt_regs(), -ENOSYS, 0); goto skip; } @@ -982,7 +986,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, if (fatal_signal_pending(current)) goto skip; /* Check if the tracer forced the syscall to be skipped. */ - this_syscall = syscall_get_nr(current, task_pt_regs(current)); + this_syscall = syscall_get_nr(current, current_pt_regs()); if (this_syscall < 0) goto skip; @@ -1020,20 +1024,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, default: seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ - if (action == SECCOMP_RET_KILL_PROCESS || + if (action != SECCOMP_RET_KILL_THREAD || get_nr_threads(current) == 1) { kernel_siginfo_t info; /* Show the original registers in the dump. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, current_pt_regs()); /* Trigger a manual coredump since do_exit skips it. */ seccomp_init_siginfo(&info, this_syscall, data); do_coredump(&info); } - if (action == SECCOMP_RET_KILL_PROCESS) - do_group_exit(SIGSYS); - else + if (action == SECCOMP_RET_KILL_THREAD) do_exit(SIGSYS); + else + do_group_exit(SIGSYS); } unreachable(); @@ -1060,7 +1064,7 @@ int __secure_computing(const struct seccomp_data *sd) return 0; this_syscall = sd ? sd->nr : - syscall_get_nr(current, task_pt_regs(current)); + syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: @@ -1472,13 +1476,7 @@ static const struct file_operations seccomp_notify_ops = { static struct file *init_listener(struct seccomp_filter *filter) { - struct file *ret = ERR_PTR(-EBUSY); - struct seccomp_filter *cur; - - for (cur = current->seccomp.filter; cur; cur = cur->prev) { - if (cur->notif) - goto out; - } + struct file *ret; ret = ERR_PTR(-ENOMEM); filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL); @@ -1504,6 +1502,31 @@ out: return ret; } +/* + * Does @new_child have a listener while an ancestor also has a listener? + * If so, we'll want to reject this filter. + * This only has to be tested for the current process, even in the TSYNC case, + * because TSYNC installs @child with the same parent on all threads. + * Note that @new_child is not hooked up to its parent at this point yet, so + * we use current->seccomp.filter. + */ +static bool has_duplicate_listener(struct seccomp_filter *new_child) +{ + struct seccomp_filter *cur; + + /* must be protected against concurrent TSYNC */ + lockdep_assert_held(¤t->sighand->siglock); + + if (!new_child->notif) + return false; + for (cur = current->seccomp.filter; cur; cur = cur->prev) { + if (cur->notif) + return true; + } + + return false; +} + /** * seccomp_set_mode_filter: internal function for setting seccomp filter * @flags: flags to change filter behavior @@ -1575,6 +1598,11 @@ static long seccomp_set_mode_filter(unsigned int flags, if (!seccomp_may_assign_mode(seccomp_mode)) goto out; + if (has_duplicate_listener(prepared)) { + ret = -EBUSY; + goto out; + } + ret = seccomp_attach_filter(flags, prepared); if (ret) goto out; diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..4d17501433be 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -20,6 +20,9 @@ #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> +#include <linux/sched/clock.h> +#include <linux/nmi.h> +#include <linux/sched/debug.h> #include "smpboot.h" #include "sched/smp.h" @@ -96,6 +99,103 @@ void __init call_function_init(void) smpcfd_prepare_cpu(smp_processor_id()); } +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + +static DEFINE_PER_CPU(call_single_data_t *, cur_csd); +static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); +static DEFINE_PER_CPU(void *, cur_csd_info); + +#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +static atomic_t csd_bug_count = ATOMIC_INIT(0); + +/* Record current CSD work for current CPU, NULL to erase. */ +static void csd_lock_record(call_single_data_t *csd) +{ + if (!csd) { + smp_mb(); /* NULL cur_csd after unlock. */ + __this_cpu_write(cur_csd, NULL); + return; + } + __this_cpu_write(cur_csd_func, csd->func); + __this_cpu_write(cur_csd_info, csd->info); + smp_wmb(); /* func and info before csd. */ + __this_cpu_write(cur_csd, csd); + smp_mb(); /* Update cur_csd before function call. */ + /* Or before unlock, as the case may be. */ +} + +static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) +{ + unsigned int csd_type; + + csd_type = CSD_TYPE(csd); + if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) + return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */ + return -1; +} + +/* + * Complain if too much time spent waiting. Note that only + * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, + * so waiting on other types gets much less information. + */ +static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +{ + int cpu = -1; + int cpux; + bool firsttime; + u64 ts2, ts_delta; + call_single_data_t *cpu_cur_csd; + unsigned int flags = READ_ONCE(csd->flags); + + if (!(flags & CSD_FLAG_LOCK)) { + if (!unlikely(*bug_id)) + return true; + cpu = csd_lock_wait_getcpu(csd); + pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", + *bug_id, raw_smp_processor_id(), cpu); + return true; + } + + ts2 = sched_clock(); + ts_delta = ts2 - *ts1; + if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + return false; + + firsttime = !*bug_id; + if (firsttime) + *bug_id = atomic_inc_return(&csd_bug_count); + cpu = csd_lock_wait_getcpu(csd); + if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) + cpux = 0; + else + cpux = cpu; + cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + cpu, csd->func, csd->info); + if (cpu_cur_csd && csd != cpu_cur_csd) { + pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", + *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), + READ_ONCE(per_cpu(cur_csd_info, cpux))); + } else { + pr_alert("\tcsd: CSD lock (#%d) %s.\n", + *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); + } + if (cpu >= 0) { + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + if (!cpu_cur_csd) { + pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); + arch_send_call_function_single_ipi(cpu); + } + } + dump_stack(); + *ts1 = ts2; + + return false; +} + /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -105,8 +205,28 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(call_single_data_t *csd) { + int bug_id = 0; + u64 ts0, ts1; + + ts1 = ts0 = sched_clock(); + for (;;) { + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + break; + cpu_relax(); + } + smp_acquire__after_ctrl_dep(); +} + +#else +static void csd_lock_record(call_single_data_t *csd) +{ +} + +static __always_inline void csd_lock_wait(call_single_data_t *csd) +{ smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } +#endif static __always_inline void csd_lock(call_single_data_t *csd) { @@ -166,9 +286,11 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ + csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); func(info); + csd_lock_record(NULL); local_irq_restore(flags); return 0; } @@ -268,8 +390,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) entry = &csd_next->llist; } + csd_lock_record(csd); func(info); csd_unlock(csd); + csd_lock_record(NULL); } else { prev = &csd->llist; } @@ -296,8 +420,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_call_func_t func = csd->func; void *info = csd->info; + csd_lock_record(csd); csd_unlock(csd); func(info); + csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } @@ -375,6 +501,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); + csd->dst = cpu; +#endif err = generic_exec_single(cpu, csd); @@ -540,6 +670,10 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); + csd->dst = cpu; +#endif if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } @@ -741,7 +875,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and - * the the info parameter. The function is called + * the info parameter. The function is called * with preemption disabled. The function should * return a blooean value indicating whether to IPI * the specified CPU. diff --git a/kernel/sys.c b/kernel/sys.c index ab6c409b1159..a730c03ee607 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -373,7 +373,7 @@ long __sys_setregid(gid_t rgid, gid_t egid) if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || - ns_capable(old->user_ns, CAP_SETGID)) + ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; @@ -382,7 +382,7 @@ long __sys_setregid(gid_t rgid, gid_t egid) if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || - ns_capable(old->user_ns, CAP_SETGID)) + ns_capable_setid(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; @@ -432,7 +432,7 @@ long __sys_setgid(gid_t gid) old = current_cred(); retval = -EPERM; - if (ns_capable(old->user_ns, CAP_SETGID)) + if (ns_capable_setid(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; @@ -744,7 +744,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) old = current_cred(); retval = -EPERM; - if (!ns_capable(old->user_ns, CAP_SETGID)) { + if (!ns_capable_setid(old->user_ns, CAP_SETGID)) { if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; @@ -871,7 +871,7 @@ long __sys_setfsgid(gid_t gid) if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || - ns_capable(old->user_ns, CAP_SETGID)) { + ns_capable_setid(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0) @@ -2034,7 +2034,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * - * - @start_brk/@brk which are used in do_brk but kernel lookups + * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these memvers so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself @@ -2238,12 +2238,12 @@ out: } #ifdef CONFIG_CHECKPOINT_RESTORE -static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return put_user(me->clear_child_tid, tid_addr); } #else -static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +static int prctl_get_tid_address(struct task_struct *me, int __user * __user *tid_addr) { return -EINVAL; } @@ -2427,7 +2427,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = prctl_set_mm(arg2, arg3, arg4, arg5); break; case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); + error = prctl_get_tid_address(me, (int __user * __user *)arg2); break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4d59775ea79c..f27ac94d5fa7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -280,6 +280,7 @@ COND_SYSCALL(mlockall); COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); +COND_SYSCALL(process_madvise); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); @@ -369,7 +370,6 @@ COND_SYSCALL_COMPAT(fanotify_mark); /* x86 */ COND_SYSCALL(vm86old); COND_SYSCALL(modify_ldt); -COND_SYSCALL_COMPAT(quotactl32); COND_SYSCALL(vm86); COND_SYSCALL(kexec_file_load); diff --git a/kernel/task_work.c b/kernel/task_work.c index 613b2d634af8..8d6e1217c451 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -9,23 +9,28 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run - * @notify: send the notification if true + * @notify: how to notify the targeted task * - * Queue @work for task_work_run() below and notify the @task if @notify. - * Fails if the @task is exiting/exited and thus it can't process this @work. - * Otherwise @work->func() will be called when the @task returns from kernel - * mode or exits. + * Queue @work for task_work_run() below and notify the @task if @notify + * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the + * it will interrupt the targeted task and run the task_work. @TWA_RESUME + * work is run only when the task exits the kernel and returns to user mode, + * or before entering guest mode. Fails if the @task is exiting/exited and thus + * it can't process this @work. Otherwise @work->func() will be called when the + * @task goes through one of the aforementioned transitions, or exits. * - * This is like the signal handler which runs in kernel mode, but it doesn't - * try to wake up the @task. + * If the targeted task is exiting, then an error is returned and the work item + * is not queued. It's up to the caller to arrange for an alternative mechanism + * in that case. * - * Note: there is no ordering guarantee on works queued here. + * Note: there is no ordering guarantee on works queued here. The task_work + * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ -int -task_work_add(struct task_struct *task, struct callback_head *work, int notify) +int task_work_add(struct task_struct *task, struct callback_head *work, + enum task_work_notify_mode notify) { struct callback_head *head; unsigned long flags; @@ -38,6 +43,8 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) } while (cmpxchg(&task->task_works, head, work) != head); switch (notify) { + case TWA_NONE: + break; case TWA_RESUME: set_notify_resume(task); break; @@ -54,6 +61,9 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) unlock_task_sighand(task, &flags); } break; + default: + WARN_ON_ONCE(1); + break; } return 0; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e2ac0e37c4ae..a2802b6ff4bb 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -34,17 +34,13 @@ struct kmem_cache *taskstats_cache; static struct genl_family family; -static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { +static const struct nla_policy taskstats_cmd_get_policy[] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -/* - * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. - * Make sure they are always aligned. - */ -static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { +static const struct nla_policy cgroupstats_cmd_get_policy[] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; @@ -649,47 +645,25 @@ static const struct genl_ops taskstats_ops[] = { .cmd = TASKSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, - /* policy enforced later */ - .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, + .policy = taskstats_cmd_get_policy, + .maxattr = ARRAY_SIZE(taskstats_cmd_get_policy) - 1, + .flags = GENL_ADMIN_PERM, }, { .cmd = CGROUPSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, - /* policy enforced later */ - .flags = GENL_CMD_CAP_HASPOL, + .policy = cgroupstats_cmd_get_policy, + .maxattr = ARRAY_SIZE(cgroupstats_cmd_get_policy) - 1, }, }; -static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info) -{ - const struct nla_policy *policy = NULL; - - switch (ops->cmd) { - case TASKSTATS_CMD_GET: - policy = taskstats_cmd_get_policy; - break; - case CGROUPSTATS_CMD_GET: - policy = cgroupstats_cmd_get_policy; - break; - default: - return -EINVAL; - } - - return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, - TASKSTATS_CMD_ATTR_MAX, policy, - info->extack); -} - static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), - .pre_doit = taskstats_pre_doit, }; /* Needed early in initialization */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f0199a4ba1ad..81632cd5e3b7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -927,7 +927,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ: local_softirq_pending %02x\n", + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); ratelimit++; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index dda05f4b7a1f..de37e33a868d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1706,6 +1706,8 @@ void update_process_times(int user_tick) { struct task_struct *p = current; + PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); + /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); @@ -1717,13 +1719,6 @@ void update_process_times(int user_tick) scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); - - /* The current CPU might make use of net randoms without receiving IRQs - * to renew them often enough. Let's update the net_rand_state from a - * non-constant value that's not affine to the number of calls to make - * sure it's updated when there's some activity (we don't care in idle). - */ - this_cpu_add(net_rand_state.s1, rol32(jiffies, 24) + user_tick); } /** diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 4b3a42fc3b24..f1022945e346 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -527,7 +527,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ - if (bdev && bdev == bdev->bd_contains) + if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); @@ -793,7 +793,7 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else -u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { return 0; } @@ -1827,13 +1827,11 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = bdget_part(dev_to_part(dev)); struct request_queue *q; - struct block_device *bdev; struct blk_trace *bt; ssize_t ret = -ENXIO; - bdev = bdget(part_devt(p)); if (bdev == NULL) goto out; @@ -1875,7 +1873,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, { struct block_device *bdev; struct request_queue *q; - struct hd_struct *p; struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1895,9 +1892,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; ret = -ENXIO; - - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); + bdev = bdget_part(dev_to_part(dev)); if (bdev == NULL) goto out; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2ecf7892a31b..4517c8b66518 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/bpf.h> #include <linux/bpf_perf_event.h> +#include <linux/btf.h> #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> @@ -16,6 +17,9 @@ #include <linux/error-injection.h> #include <linux/btf_ids.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/btf.h> + #include <asm/tlb.h> #include "trace_probe.h" @@ -67,6 +71,10 @@ static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event @@ -743,19 +751,18 @@ out: return err; } -BTF_ID_LIST(bpf_seq_printf_btf_ids) -BTF_ID(struct, seq_file) +BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) static const struct bpf_func_proto bpf_seq_printf_proto = { .func = bpf_seq_printf, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM_OR_NULL, .arg5_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_printf_btf_ids, }; BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) @@ -763,17 +770,39 @@ BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) return seq_write(m, data, len) ? -EOVERFLOW : 0; } -BTF_ID_LIST(bpf_seq_write_btf_ids) -BTF_ID(struct, seq_file) - static const struct bpf_func_proto bpf_seq_write_proto = { .func = bpf_seq_write, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, - .btf_id = bpf_seq_write_btf_ids, +}; + +BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); +} + +static const struct bpf_func_proto bpf_seq_printf_btf_proto = { + .func = bpf_seq_printf_btf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, }; static __always_inline int @@ -1098,6 +1127,118 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +{ + long len; + char *p; + + if (!sz) + return 0; + + p = d_path(path, buf, sz); + if (IS_ERR(p)) { + len = PTR_ERR(p); + } else { + len = buf + sz - p; + memmove(buf, p, len); + } + + return len; +} + +BTF_SET_START(btf_allowlist_d_path) +#ifdef CONFIG_SECURITY +BTF_ID(func, security_file_permission) +BTF_ID(func, security_inode_getattr) +BTF_ID(func, security_file_open) +#endif +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, security_path_truncate) +#endif +BTF_ID(func, vfs_truncate) +BTF_ID(func, vfs_fallocate) +BTF_ID(func, dentry_open) +BTF_ID(func, vfs_getattr) +BTF_ID(func, filp_close) +BTF_SET_END(btf_allowlist_d_path) + +static bool bpf_d_path_allowed(const struct bpf_prog *prog) +{ + return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); +} + +BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) + +static const struct bpf_func_proto bpf_d_path_proto = { + .func = bpf_d_path, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_d_path_btf_ids[0], + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .allowed = bpf_d_path_allowed, +}; + +#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ + BTF_F_PTR_RAW | BTF_F_ZERO) + +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id) +{ + const struct btf_type *t; + + if (unlikely(flags & ~(BTF_F_ALL))) + return -EINVAL; + + if (btf_ptr_size != sizeof(struct btf_ptr)) + return -EINVAL; + + *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(*btf)) + return PTR_ERR(*btf); + + if (ptr->type_id > 0) + *btf_id = ptr->type_id; + else + return -EINVAL; + + if (*btf_id > 0) + t = btf_type_by_id(*btf, *btf_id); + if (*btf_id <= 0 || !t) + return -ENOENT; + + return 0; +} + +BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, + flags); +} + +const struct bpf_func_proto bpf_snprintf_btf_proto = { + .func = bpf_snprintf_btf, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1182,6 +1323,14 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; + case BPF_FUNC_copy_from_user: + return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: return NULL; } @@ -1579,6 +1728,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; + case BPF_FUNC_seq_printf_btf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_btf_proto : + NULL; + case BPF_FUNC_d_path: + return &bpf_d_path_proto; default: return raw_tp_prog_func_proto(func_id, prog); } @@ -1625,6 +1780,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { +#ifdef CONFIG_NET + .test_run = bpf_prog_test_run_raw_tp, +#endif }; const struct bpf_verifier_ops tracing_verifier_ops = { diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 1af321dec0f1..5658f13037b3 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -387,8 +387,8 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } } - read_lock(&tasklist_lock); - do_each_thread(g, t) { + rcu_read_lock(); + for_each_process_thread(g, t) { if (start == end) { ret = -EAGAIN; goto unlock; @@ -403,10 +403,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) smp_wmb(); t->ret_stack = ret_stack_list[start++]; } - } while_each_thread(g, t); + } unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); free: for (i = start; i < end; i++) kfree(ret_stack_list[i]); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 541453927c82..8185f7240095 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -230,7 +230,7 @@ static void update_ftrace_function(void) /* * For static tracing, we need to be a bit more careful. * The function change takes affect immediately. Thus, - * we need to coorditate the setting of the function_trace_ops + * we need to coordinate the setting of the function_trace_ops * with the setting of the ftrace_trace_function. * * Set the function to the list ops, which will call the @@ -1368,10 +1368,10 @@ static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) int i; /* - * Make the hash size about 1/2 the # found + * Use around half the size (max bit of it), but + * a minimum of 2 is fine (as size of 0 or 1 both give 1 for bits). */ - for (size /= 2; size; size >>= 1) - bits++; + bits = fls(size / 2); /* Don't allocate too much */ if (bits > FTRACE_HASH_MAX_BITS) @@ -1451,7 +1451,7 @@ static bool hash_contains_ip(unsigned long ip, { /* * The function record is a match if it exists in the filter - * hash and not in the notrace hash. Note, an emty hash is + * hash and not in the notrace hash. Note, an empty hash is * considered a match for the filter hash, but an empty * notrace hash is considered not in the notrace hash. */ @@ -2402,7 +2402,7 @@ struct ftrace_ops direct_ops = { * * If the record has the FTRACE_FL_REGS set, that means that it * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS - * is not not set, then it wants to convert to the normal callback. + * is not set, then it wants to convert to the normal callback. * * Returns the address of the trampoline to set to */ @@ -2976,7 +2976,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks_rude(); /* - * When the kernel is preeptive, tasks can be preempted + * When the kernel is preemptive, tasks can be preempted * while on a ftrace trampoline. Just scheduling a task on * a CPU is not good enough to flush them. Calling * synchornize_rcu_tasks() will wait for those tasks to @@ -3129,18 +3129,20 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) static int ftrace_allocate_records(struct ftrace_page *pg, int count) { int order; + int pages; int cnt; if (WARN_ON(!count)) return -EINVAL; - order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); + pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + order = get_count_order(pages); /* * We want to fill as much as possible. No more than a page * may be empty. */ - while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE) + if (!is_power_of_2(pages)) order--; again: @@ -4368,7 +4370,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to map @data to * @data: The data to map to @ip * - * Returns 0 on succes otherwise an error. + * Returns 0 on success otherwise an error. */ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data) @@ -4536,7 +4538,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, /* * Note, there's a small window here that the func_hash->filter_hash - * may be NULL or empty. Need to be carefule when reading the loop. + * may be NULL or empty. Need to be careful when reading the loop. */ mutex_lock(&probe->ops.func_hash->regex_lock); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 93ef0ab6ea20..7f45fd9d5a45 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -793,7 +793,7 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on - * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise @@ -1952,18 +1952,18 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, { struct ring_buffer_per_cpu *cpu_buffer; unsigned long nr_pages; - int cpu, err = 0; + int cpu, err; /* * Always succeed at resizing a non-existent buffer: */ if (!buffer) - return size; + return 0; /* Make sure the requested buffer exists */ if (cpu_id != RING_BUFFER_ALL_CPUS && !cpumask_test_cpu(cpu_id, buffer->cpumask)) - return size; + return 0; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); @@ -2119,7 +2119,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } mutex_unlock(&buffer->mutex); - return size; + return 0; out_err: for_each_buffer_cpu(buffer, cpu) { @@ -4866,6 +4866,9 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + atomic_inc(&cpu_buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); @@ -4876,6 +4879,8 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -4889,6 +4894,9 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + for_each_online_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -4907,6 +4915,8 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); } + + mutex_unlock(&buffer->mutex); } /** diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 7d56d621ffea..edd912cd14aa 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -242,9 +242,11 @@ static struct synth_field_desc create_synth_test_fields[] = { { .type = "pid_t", .name = "next_pid_field" }, { .type = "char[16]", .name = "next_comm_field" }, { .type = "u64", .name = "ts_ns" }, + { .type = "char[]", .name = "dynstring_field_1" }, { .type = "u64", .name = "ts_ms" }, { .type = "unsigned int", .name = "cpu" }, { .type = "char[64]", .name = "my_string_field" }, + { .type = "char[]", .name = "dynstring_field_2" }, { .type = "int", .name = "my_int_field" }, }; @@ -254,7 +256,7 @@ static struct synth_field_desc create_synth_test_fields[] = { */ static int __init test_create_synth_event(void) { - u64 vals[7]; + u64 vals[9]; int ret; /* Create the create_synth_test event with the fields above */ @@ -292,10 +294,12 @@ static int __init test_create_synth_event(void) vals[0] = 777; /* next_pid_field */ vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ vals[2] = 1000000; /* ts_ns */ - vals[3] = 1000; /* ts_ms */ - vals[4] = raw_smp_processor_id(); /* cpu */ - vals[5] = (u64)(long)"thneed"; /* my_string_field */ - vals[6] = 398; /* my_int_field */ + vals[3] = (u64)(long)"xrayspecs"; /* dynstring_field_1 */ + vals[4] = 1000; /* ts_ms */ + vals[5] = raw_smp_processor_id(); /* cpu */ + vals[6] = (u64)(long)"thneed"; /* my_string_field */ + vals[7] = (u64)(long)"kerplunk"; /* dynstring_field_2 */ + vals[8] = 398; /* my_int_field */ /* Now generate a create_synth_test event */ ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals)); @@ -422,13 +426,15 @@ static int __init test_trace_synth_event(void) int ret; /* Trace some bogus values just for testing */ - ret = synth_event_trace(create_synth_test, 7, /* number of values */ + ret = synth_event_trace(create_synth_test, 9, /* number of values */ (u64)444, /* next_pid_field */ (u64)(long)"clackers", /* next_comm_field */ (u64)1000000, /* ts_ns */ + (u64)(long)"viewmaster",/* dynstring_field_1 */ (u64)1000, /* ts_ms */ (u64)raw_smp_processor_id(), /* cpu */ (u64)(long)"Thneed", /* my_string_field */ + (u64)(long)"yoyos", /* dynstring_field_2 */ (u64)999); /* my_int_field */ return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 25b72a73608a..528971714fc6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -251,6 +251,145 @@ unsigned long long ns2usecs(u64 nsec) return nsec; } +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event, int flag) +{ + struct trace_entry *entry; + unsigned int size = 0; + + if (export->flags & flag) { + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(export, entry, size); + } +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled); + +static inline void ftrace_exports_enable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_inc(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_inc(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_inc(&trace_marker_exports_enabled); +} + +static inline void ftrace_exports_disable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_dec(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_dec(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_dec(&trace_marker_exports_enabled); +} + +static void ftrace_exports(struct ring_buffer_event *event, int flag) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_check(ftrace_exports_list); + while (export) { + trace_process_export(export, event, flag); + export = rcu_dereference_raw_check(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. + */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + ftrace_exports_enable(export); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + ftrace_exports_disable(export); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(&ftrace_export_lock); + + ret = rm_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS \ (FUNCTION_DEFAULT_FLAGS | \ @@ -2511,7 +2650,7 @@ void trace_buffered_event_enable(void) preempt_disable(); if (cpu == smp_processor_id() && - this_cpu_read(trace_buffered_event) != + __this_cpu_read(trace_buffered_event) != per_cpu(trace_buffered_event, cpu)) WARN_ON_ONCE(1); preempt_enable(); @@ -2699,6 +2838,8 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); + if (static_branch_unlikely(&trace_event_exports_enabled)) + ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc, fbuffer->regs); @@ -2742,129 +2883,6 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, __buffer_unlock_commit(buffer, event); } -static void -trace_process_export(struct trace_export *export, - struct ring_buffer_event *event) -{ - struct trace_entry *entry; - unsigned int size = 0; - - entry = ring_buffer_event_data(event); - size = ring_buffer_event_length(event); - export->write(export, entry, size); -} - -static DEFINE_MUTEX(ftrace_export_lock); - -static struct trace_export __rcu *ftrace_exports_list __read_mostly; - -static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled); - -static inline void ftrace_exports_enable(void) -{ - static_branch_enable(&ftrace_exports_enabled); -} - -static inline void ftrace_exports_disable(void) -{ - static_branch_disable(&ftrace_exports_enabled); -} - -static void ftrace_exports(struct ring_buffer_event *event) -{ - struct trace_export *export; - - preempt_disable_notrace(); - - export = rcu_dereference_raw_check(ftrace_exports_list); - while (export) { - trace_process_export(export, event); - export = rcu_dereference_raw_check(export->next); - } - - preempt_enable_notrace(); -} - -static inline void -add_trace_export(struct trace_export **list, struct trace_export *export) -{ - rcu_assign_pointer(export->next, *list); - /* - * We are entering export into the list but another - * CPU might be walking that list. We need to make sure - * the export->next pointer is valid before another CPU sees - * the export pointer included into the list. - */ - rcu_assign_pointer(*list, export); -} - -static inline int -rm_trace_export(struct trace_export **list, struct trace_export *export) -{ - struct trace_export **p; - - for (p = list; *p != NULL; p = &(*p)->next) - if (*p == export) - break; - - if (*p != export) - return -1; - - rcu_assign_pointer(*p, (*p)->next); - - return 0; -} - -static inline void -add_ftrace_export(struct trace_export **list, struct trace_export *export) -{ - if (*list == NULL) - ftrace_exports_enable(); - - add_trace_export(list, export); -} - -static inline int -rm_ftrace_export(struct trace_export **list, struct trace_export *export) -{ - int ret; - - ret = rm_trace_export(list, export); - if (*list == NULL) - ftrace_exports_disable(); - - return ret; -} - -int register_ftrace_export(struct trace_export *export) -{ - if (WARN_ON_ONCE(!export->write)) - return -1; - - mutex_lock(&ftrace_export_lock); - - add_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(register_ftrace_export); - -int unregister_ftrace_export(struct trace_export *export) -{ - int ret; - - mutex_lock(&ftrace_export_lock); - - ret = rm_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(unregister_ftrace_export); - void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -2884,8 +2902,8 @@ trace_function(struct trace_array *tr, entry->parent_ip = parent_ip; if (!call_filter_check_discard(call, entry, buffer, event)) { - if (static_branch_unlikely(&ftrace_exports_enabled)) - ftrace_exports(event); + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); } } @@ -5124,10 +5142,10 @@ static const char readme_msg[] = "\t -:[<group>/]<event>\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" - "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n" + "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" #endif #ifdef CONFIG_UPROBE_EVENTS - " place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n" + " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n" #endif "\t args: <name>=fetcharg[:type]\n" "\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n" @@ -5251,7 +5269,12 @@ static const char readme_msg[] = "\t trace(<synthetic_event>,param list) - generate synthetic event\n" "\t save(field,...) - save current event fields\n" #ifdef CONFIG_TRACER_SNAPSHOT - "\t snapshot() - snapshot the trace buffer\n" + "\t snapshot() - snapshot the trace buffer\n\n" +#endif +#ifdef CONFIG_SYNTH_EVENTS + " events/synthetic_events\t- Create/append/remove/show synthetic events\n" + "\t Write into this file to define/undefine new synthetic events.\n" + "\t example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n" #endif #endif ; @@ -6664,7 +6687,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, written = -EFAULT; } else written = cnt; - len = cnt; if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ @@ -6678,6 +6700,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else entry->buf[cnt] = '\0'; + if (static_branch_unlikely(&trace_marker_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_MARKER); __buffer_unlock_commit(buffer, event); if (tt) @@ -8638,6 +8662,24 @@ struct trace_array *trace_array_find_get(const char *instance) return tr; } +static int trace_array_create_dir(struct trace_array *tr) +{ + int ret; + + tr->dir = tracefs_create_dir(tr->name, trace_instance_dir); + if (!tr->dir) + return -EINVAL; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) + tracefs_remove(tr->dir); + + init_tracer_tracefs(tr, tr->dir); + __update_tracer_options(tr); + + return ret; +} + static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; @@ -8673,30 +8715,28 @@ static struct trace_array *trace_array_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - tr->dir = tracefs_create_dir(name, trace_instance_dir); - if (!tr->dir) + if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; - ret = event_trace_add_tracer(tr->dir, tr); - if (ret) { - tracefs_remove(tr->dir); - goto out_free_tr; - } - ftrace_init_trace_array(tr); - init_tracer_tracefs(tr, tr->dir); init_trace_flags_index(tr); - __update_tracer_options(tr); + + if (trace_instance_dir) { + ret = trace_array_create_dir(tr); + if (ret) + goto out_free_tr; + } else + __trace_early_add_events(tr); list_add(&tr->list, &ftrace_trace_arrays); tr->ref++; - return tr; out_free_tr: + ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); @@ -8801,7 +8841,6 @@ static int __remove_instance(struct trace_array *tr) free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); - tr = NULL; return 0; } @@ -8855,11 +8894,27 @@ static int instance_rmdir(const char *name) static __init void create_trace_instances(struct dentry *d_tracer) { + struct trace_array *tr; + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, instance_mkdir, instance_rmdir); if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->name) + continue; + if (MEM_FAIL(trace_array_create_dir(tr) < 0, + "Failed to create instance directory\n")) + break; + } + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); } static void @@ -8973,21 +9028,21 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) * directory. It is called via fs_initcall() by any of the boot up code * and expects to return the dentry of the top level tracing directory. */ -struct dentry *tracing_init_dentry(void) +int tracing_init_dentry(void) { struct trace_array *tr = &global_trace; if (security_locked_down(LOCKDOWN_TRACEFS)) { pr_warn("Tracing disabled due to lockdown\n"); - return ERR_PTR(-EPERM); + return -EPERM; } /* The top level trace array uses NULL as parent */ if (tr->dir) - return NULL; + return 0; if (WARN_ON(!tracefs_initialized())) - return ERR_PTR(-ENODEV); + return -ENODEV; /* * As there may still be users that expect the tracing @@ -8998,7 +9053,7 @@ struct dentry *tracing_init_dentry(void) tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); - return NULL; + return 0; } extern struct trace_eval_map *__start_ftrace_eval_maps[]; @@ -9085,48 +9140,48 @@ static struct notifier_block trace_module_nb = { static __init int tracer_init_tracefs(void) { - struct dentry *d_tracer; + int ret; trace_access_lock_init(); - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; event_trace_init(); - init_tracer_tracefs(&global_trace, d_tracer); - ftrace_init_tracefs_toplevel(&global_trace, d_tracer); + init_tracer_tracefs(&global_trace, NULL); + ftrace_init_tracefs_toplevel(&global_trace, NULL); - trace_create_file("tracing_thresh", 0644, d_tracer, + trace_create_file("tracing_thresh", 0644, NULL, &global_trace, &tracing_thresh_fops); - trace_create_file("README", 0444, d_tracer, + trace_create_file("README", 0444, NULL, NULL, &tracing_readme_fops); - trace_create_file("saved_cmdlines", 0444, d_tracer, + trace_create_file("saved_cmdlines", 0444, NULL, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("saved_cmdlines_size", 0644, d_tracer, + trace_create_file("saved_cmdlines_size", 0644, NULL, NULL, &tracing_saved_cmdlines_size_fops); - trace_create_file("saved_tgids", 0444, d_tracer, + trace_create_file("saved_tgids", 0444, NULL, NULL, &tracing_saved_tgids_fops); trace_eval_init(); - trace_create_eval_file(d_tracer); + trace_create_eval_file(NULL); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); #endif #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + trace_create_file("dyn_ftrace_total_info", 0444, NULL, NULL, &tracing_dyn_info_fops); #endif - create_trace_instances(d_tracer); + create_trace_instances(NULL); update_tracer_options(&global_trace); @@ -9289,7 +9344,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) } /* - * We need to stop all tracing on all CPUS to read the + * We need to stop all tracing on all CPUS to read * the next buffer. This is a bit expensive, but is * not done often. We fill all what we can read, * and then release the locks again. @@ -9432,7 +9487,7 @@ __init static int tracer_alloc_buffers(void) } /* - * Make sure we don't accidently add more trace options + * Make sure we don't accidentally add more trace options * than we have bits for. */ BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); @@ -9461,7 +9516,7 @@ __init static int tracer_alloc_buffers(void) /* * The prepare callbacks allocates some memory for the ring buffer. We - * don't free the buffer if the if the CPU goes down. If we were to free + * don't free the buffer if the CPU goes down. If we were to free * the buffer, then the user would lose any trace that was in the * buffer. The memory will be removed once the "instance" is removed. */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 610d21355526..f3f5e77123ad 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -19,6 +19,7 @@ #include <linux/glob.h> #include <linux/irq_work.h> #include <linux/workqueue.h> +#include <linux/ctype.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -98,7 +99,7 @@ enum trace_type { /* Use this for memory failure errors */ #define MEM_FAIL(condition, fmt, ...) ({ \ - static bool __section(.data.once) __warned; \ + static bool __section(".data.once") __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -246,7 +247,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is * passed in turn to the cond_snapshot.update() function. That data * can be compared by the update() implementation with the cond_data - * contained wihin the struct cond_snapshot instance associated with + * contained within the struct cond_snapshot instance associated with * the trace_array. Because the tr->max_lock is held throughout the * update() call, the update() function can directly retrieve the * cond_snapshot and cond_data associated with the per-instance @@ -271,7 +272,7 @@ typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); * take the snapshot, by returning 'true' if so, 'false' if no * snapshot should be taken. Because the max_lock is held for * the duration of update(), the implementation is safe to - * directly retrieven and save any implementation data it needs + * directly retrieved and save any implementation data it needs * to in association with the snapshot. */ struct cond_snapshot { @@ -573,7 +574,7 @@ struct tracer { * The function callback, which can use the FTRACE bits to * check for recursion. * - * Now if the arch does not suppport a feature, and it calls + * Now if the arch does not support a feature, and it calls * the global list function which calls the ftrace callback * all three of these steps will do a recursion protection. * There's no reason to do one if the previous caller already @@ -737,7 +738,7 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); -struct dentry *tracing_init_dentry(void); +int tracing_init_dentry(void); struct ring_buffer_event; @@ -1125,6 +1126,8 @@ extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); +int ftrace_allocate_ftrace_ops(struct trace_array *tr); +void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); @@ -1146,6 +1149,11 @@ ftrace_create_function_files(struct trace_array *tr, { return 0; } +static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr) +{ + return 0; +} +static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } @@ -1472,7 +1480,7 @@ __trace_event_discard_commit(struct trace_buffer *buffer, /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires - * filtering against its fields, then they wil be called as the + * filtering against its fields, then they will be called as the * entry already holds the field information of the current event. * * It also checks if the event should be discarded or not. @@ -1651,6 +1659,7 @@ extern void trace_event_enable_tgid_record(bool enable); extern int event_trace_init(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); +extern void __trace_early_add_events(struct trace_array *tr); extern struct trace_event_file *__find_event_file(struct trace_array *tr, const char *system, @@ -2082,4 +2091,16 @@ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) iter->pos = -1; } +/* Check the name is good for event/group/fields */ +static inline bool is_good_name(const char *name) +{ + if (!isalpha(*name) && *name != '_') + return false; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_') + return false; + } + return true; +} + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index fa0fc08c6ef8..c22a152ef0b4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -40,6 +40,16 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) pr_err("Failed to set option: %s\n", buf); } + p = xbc_node_find_value(node, "tracing_on", NULL); + if (p && *p != '\0') { + if (kstrtoul(p, 10, &v)) + pr_err("Failed to set tracing on: %s\n", p); + if (v) + tracer_tracing_on(tr); + else + tracer_tracing_off(tr); + } + p = xbc_node_find_value(node, "trace_clock", NULL); if (p && *p != '\0') { if (tracing_set_clock(tr, p) < 0) @@ -274,6 +284,12 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) if (tracing_set_tracer(tr, p) < 0) pr_err("Failed to set given tracer: %s\n", p); } + + /* Since tracer can free snapshot buffer, allocate snapshot here.*/ + if (xbc_node_find_value(node, "alloc_snapshot", NULL)) { + if (tracing_alloc_snapshot_instance(tr) < 0) + pr_err("Failed to allocate snapshot buffer\n"); + } } static void __init @@ -330,5 +346,8 @@ static int __init trace_boot_init(void) return 0; } - -fs_initcall(trace_boot_init); +/* + * Start tracing at the end of core-initcall, so that it starts tracing + * from the beginning of postcore_initcall. + */ +core_initcall_sync(trace_boot_init); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 9f2e8520b748..5fa49cfd2bb6 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -206,14 +206,14 @@ static const struct file_operations dynamic_events_ops = { /* Make a tracefs interface for controlling dynamic events */ static __init int init_dynamic_event(void) { - struct dentry *d_tracer; struct dentry *entry; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - entry = tracefs_create_file("dynamic_events", 0644, d_tracer, + entry = tracefs_create_file("dynamic_events", 0644, NULL, NULL, &dynamic_events_ops); /* Event list interface */ @@ -402,7 +402,7 @@ void dynevent_arg_init(struct dynevent_arg *arg, * whitespace, all followed by a separator, if applicable. After the * first arg string is successfully appended to the command string, * the optional @operator is appended, followed by the second arg and - * and optional @separator. If no separator was specified when + * optional @separator. If no separator was specified when * initializing the arg, a space will be appended. */ void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index beebf2cd364b..47a71f96e5bc 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -38,6 +38,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_generic_fields); static LIST_HEAD(ftrace_common_fields); +static bool eventdir_initialized; #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -2124,11 +2125,47 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } static int +event_define_fields(struct trace_event_call *call) +{ + struct list_head *head; + int ret = 0; + + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + struct trace_event_fields *field = call->class->fields_array; + unsigned int offset = sizeof(struct trace_entry); + + for (; field->type; field++) { + if (field->type == TRACE_FUNCTION_TYPE) { + field->define_fields(call); + break; + } + + offset = ALIGN(offset, field->align); + ret = trace_define_field(call, field->type, field->name, + offset, field->size, + field->is_signed, field->filter_type); + if (WARN_ON_ONCE(ret)) { + pr_err("error code is %d\n", ret); + break; + } + + offset += field->size; + } + } + + return ret; +} + +static int event_create_dir(struct dentry *parent, struct trace_event_file *file) { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; - struct list_head *head; struct dentry *d_events; const char *name; int ret; @@ -2162,35 +2199,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) &ftrace_event_id_fops); #endif - /* - * Other events may have the same class. Only update - * the fields if they are not already defined. - */ - head = trace_get_fields(call); - if (list_empty(head)) { - struct trace_event_fields *field = call->class->fields_array; - unsigned int offset = sizeof(struct trace_entry); - - for (; field->type; field++) { - if (field->type == TRACE_FUNCTION_TYPE) { - ret = field->define_fields(call); - break; - } - - offset = ALIGN(offset, field->align); - ret = trace_define_field(call, field->type, field->name, - offset, field->size, - field->is_signed, field->filter_type); - if (ret) - break; - - offset += field->size; - } - if (ret < 0) { - pr_warn("Could not initialize trace point events/%s\n", - name); - return -1; - } + ret = event_define_fields(call); + if (ret < 0) { + pr_warn("Could not initialize trace point events/%s\n", name); + return ret; } /* @@ -2475,7 +2487,10 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) if (!file) return -ENOMEM; - return event_create_dir(tr->event_dir, file); + if (eventdir_initialized) + return event_create_dir(tr->event_dir, file); + else + return event_define_fields(call); } /* @@ -2483,7 +2498,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ -static __init int +static int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr) { @@ -2493,7 +2508,7 @@ __trace_early_add_new_event(struct trace_event_call *call, if (!file) return -ENOMEM; - return 0; + return event_define_fields(call); } struct ftrace_module_file_ops; @@ -3116,14 +3131,13 @@ static inline int register_event_cmds(void) { return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ /* - * The top level array has already had its trace_event_file - * descriptors created in order to allow for early events to - * be recorded. This function is called after the tracefs has been - * initialized, and we now have to create the files associated - * to the events. + * The top level array and trace arrays created by boot-time tracing + * have already had its trace_event_file descriptors created in order + * to allow for early events to be recorded. + * This function is called after the tracefs has been initialized, + * and we now have to create the files associated to the events. */ -static __init void -__trace_early_add_event_dirs(struct trace_array *tr) +static void __trace_early_add_event_dirs(struct trace_array *tr) { struct trace_event_file *file; int ret; @@ -3138,13 +3152,12 @@ __trace_early_add_event_dirs(struct trace_array *tr) } /* - * For early boot up, the top trace array requires to have - * a list of events that can be enabled. This must be done before - * the filesystem is set up in order to allow events to be traced - * early. + * For early boot up, the top trace array and the trace arrays created + * by boot-time tracing require to have a list of events that can be + * enabled. This must be done before the filesystem is set up in order + * to allow events to be traced early. */ -static __init void -__trace_early_add_events(struct trace_array *tr) +void __trace_early_add_events(struct trace_array *tr) { struct trace_event_call *call; int ret; @@ -3275,7 +3288,11 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) goto out; down_write(&trace_event_sem); - __trace_add_event_dirs(tr); + /* If tr already has the event list, it is initialized in early boot. */ + if (unlikely(!list_empty(&tr->events))) + __trace_early_add_event_dirs(tr); + else + __trace_add_event_dirs(tr); up_write(&trace_event_sem); out: @@ -3431,10 +3448,21 @@ static __init int event_trace_enable_again(void) early_initcall(event_trace_enable_again); +/* Init fields which doesn't related to the tracefs */ +static __init int event_trace_init_fields(void) +{ + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + + if (trace_define_common_fields()) + pr_warn("tracing: Failed to allocate common fields"); + + return 0; +} + __init int event_trace_init(void) { struct trace_array *tr; - struct dentry *d_tracer; struct dentry *entry; int ret; @@ -3442,22 +3470,12 @@ __init int event_trace_init(void) if (!tr) return -ENODEV; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) - return 0; - - entry = tracefs_create_file("available_events", 0444, d_tracer, + entry = tracefs_create_file("available_events", 0444, NULL, tr, &ftrace_avail_fops); if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); - if (trace_define_generic_fields()) - pr_warn("tracing: Failed to allocated generic fields"); - - if (trace_define_common_fields()) - pr_warn("tracing: Failed to allocate common fields"); - - ret = early_event_add_tracer(d_tracer, tr); + ret = early_event_add_tracer(NULL, tr); if (ret) return ret; @@ -3466,6 +3484,9 @@ __init int event_trace_init(void) if (ret) pr_warn("Failed to register trace events module notifier\n"); #endif + + eventdir_initialized = true; + return 0; } @@ -3474,6 +3495,7 @@ void __init trace_event_init(void) event_trace_memsetup(); init_ftrace_syscalls(); event_trace_enable(); + event_trace_init_fields(); } #ifdef CONFIG_EVENT_TRACE_STARTUP_TEST diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1b2ef6490229..96c3f86b81c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -147,6 +147,8 @@ struct hist_field { */ unsigned int var_ref_idx; bool read_once; + + unsigned int var_str_idx; }; static u64 hist_field_none(struct hist_field *field, @@ -349,6 +351,7 @@ struct hist_trigger_data { unsigned int n_keys; unsigned int n_fields; unsigned int n_vars; + unsigned int n_var_str; unsigned int key_size; struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; unsigned int n_sort_keys; @@ -1396,7 +1399,14 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str + + hist_data->n_var_str; + if (n_str > SYNTH_FIELDS_MAX) { + hist_elt_data_free(elt_data); + return -EINVAL; + } + + BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1)); size = STR_VAR_LEN_MAX; @@ -3279,6 +3289,15 @@ static int check_synth_field(struct synth_event *event, field = event->fields[field_pos]; + /* + * A dynamic string synth field can accept static or + * dynamic. A static string synth field can only accept a + * same-sized static string, which is checked for later. + */ + if (strstr(hist_field->type, "char[") && field->is_string + && field->is_dynamic) + return 0; + if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || field->is_signed != hist_field->is_signed) @@ -3651,6 +3670,7 @@ static int create_var_field(struct hist_trigger_data *hist_data, { struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; + int ret; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; @@ -3665,7 +3685,12 @@ static int create_var_field(struct hist_trigger_data *hist_data, if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) return -EINVAL; - return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) + hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; + + return ret; } static int create_val_fields(struct hist_trigger_data *hist_data, @@ -4392,6 +4417,22 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, hist_val = hist_field->fn(hist_field, elt, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; + + if (hist_field->flags & HIST_FIELD_FL_STRING) { + unsigned int str_start, var_str_idx, idx; + char *str, *val_str; + + str_start = hist_data->n_field_var_str + + hist_data->n_save_var_str; + var_str_idx = hist_field->var_str_idx; + idx = str_start + var_str_idx; + + str = elt_data->field_var_str[idx]; + val_str = (char *)(uintptr_t)hist_val; + strscpy(str, val_str, STR_VAR_LEN_MAX); + + hist_val = (u64)(uintptr_t)str; + } tracing_map_set_var(elt, var_idx, hist_val); continue; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c6cca0d1d584..84b7cab55291 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -20,6 +20,48 @@ #include "trace_synth.h" +#undef ERRORS +#define ERRORS \ + C(BAD_NAME, "Illegal name"), \ + C(CMD_INCOMPLETE, "Incomplete command"), \ + C(EVENT_EXISTS, "Event already exists"), \ + C(TOO_MANY_FIELDS, "Too many fields"), \ + C(INCOMPLETE_TYPE, "Incomplete type"), \ + C(INVALID_TYPE, "Invalid type"), \ + C(INVALID_FIELD, "Invalid field"), \ + C(CMD_TOO_LONG, "Command too long"), + +#undef C +#define C(a, b) SYNTH_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + +static char last_cmd[MAX_FILTER_STR_VAL]; + +static int errpos(const char *str) +{ + return err_pos(last_cmd, str); +} + +static void last_cmd_set(char *str) +{ + if (!str) + return; + + strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); +} + +static void synth_err(u8 err_type, u8 err_pos) +{ + tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, + err_type, err_pos); +} + static int create_synth_event(int argc, const char **argv); static int synth_event_show(struct seq_file *m, struct dyn_event *ev); static int synth_event_release(struct dyn_event *ev); @@ -88,7 +130,7 @@ static int synth_event_define_fields(struct trace_event_call *call) event->fields[i]->offset = n_u64; - if (event->fields[i]->is_string) { + if (event->fields[i]->is_string && !event->fields[i]->is_dynamic) { offset += STR_VAR_LEN_MAX; n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { @@ -132,13 +174,16 @@ static int synth_field_string_size(char *type) start += sizeof("char[") - 1; end = strchr(type, ']'); - if (!end || end < start) + if (!end || end < start || type + strlen(type) > end + 1) return -EINVAL; len = end - start; if (len > 3) return -EINVAL; + if (len == 0) + return 0; /* variable-length string */ + strncpy(buf, start, len); buf[len] = '\0'; @@ -184,6 +229,8 @@ static int synth_field_size(char *type) size = sizeof(long); else if (strcmp(type, "unsigned long") == 0) size = sizeof(unsigned long); + else if (strcmp(type, "bool") == 0) + size = sizeof(bool); else if (strcmp(type, "pid_t") == 0) size = sizeof(pid_t); else if (strcmp(type, "gfp_t") == 0) @@ -226,12 +273,14 @@ static const char *synth_field_fmt(char *type) fmt = "%ld"; else if (strcmp(type, "unsigned long") == 0) fmt = "%lu"; + else if (strcmp(type, "bool") == 0) + fmt = "%d"; else if (strcmp(type, "pid_t") == 0) fmt = "%d"; else if (strcmp(type, "gfp_t") == 0) fmt = "%x"; else if (synth_field_is_string(type)) - fmt = "%s"; + fmt = "%.*s"; return fmt; } @@ -290,10 +339,27 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, /* parameter values */ if (se->fields[i]->is_string) { - trace_seq_printf(s, print_fmt, se->fields[i]->name, - (char *)&entry->fields[n_u64], - i == se->n_fields - 1 ? "" : " "); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + if (se->fields[i]->is_dynamic) { + u32 offset, data_offset; + char *str_field; + + offset = (u32)entry->fields[n_u64]; + data_offset = offset & 0xffff; + + str_field = (char *)entry + data_offset; + + trace_seq_printf(s, print_fmt, se->fields[i]->name, + STR_VAR_LEN_MAX, + str_field, + i == se->n_fields - 1 ? "" : " "); + n_u64++; + } else { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + STR_VAR_LEN_MAX, + (char *)&entry->fields[n_u64], + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -325,16 +391,52 @@ static struct trace_event_functions synth_event_funcs = { .trace = print_synth_event }; +static unsigned int trace_string(struct synth_trace_event *entry, + struct synth_event *event, + char *str_val, + bool is_dynamic, + unsigned int data_size, + unsigned int *n_u64) +{ + unsigned int len = 0; + char *str_field; + + if (is_dynamic) { + u32 data_offset; + + data_offset = offsetof(typeof(*entry), fields); + data_offset += event->n_u64 * sizeof(u64); + data_offset += data_size; + + str_field = (char *)entry + data_offset; + + len = strlen(str_val) + 1; + strscpy(str_field, str_val, len); + + data_offset |= len << 16; + *(u32 *)&entry->fields[*n_u64] = data_offset; + + (*n_u64)++; + } else { + str_field = (char *)&entry->fields[*n_u64]; + + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64); + } + + return len; +} + static notrace void trace_event_raw_event_synth(void *__data, u64 *var_ref_vals, unsigned int *var_ref_idx) { + unsigned int i, n_u64, val_idx, len, data_size = 0; struct trace_event_file *trace_file = __data; struct synth_trace_event *entry; struct trace_event_buffer fbuffer; struct trace_buffer *buffer; struct synth_event *event; - unsigned int i, n_u64, val_idx; int fields_size = 0; event = trace_file->event_call->data; @@ -344,6 +446,18 @@ static notrace void trace_event_raw_event_synth(void *__data, fields_size = event->n_u64 * sizeof(u64); + for (i = 0; i < event->n_dynamic_fields; i++) { + unsigned int field_pos = event->dynamic_fields[i]->field_pos; + char *str_val; + + val_idx = var_ref_idx[field_pos]; + str_val = (char *)(long)var_ref_vals[val_idx]; + + len = strlen(str_val) + 1; + + fields_size += len; + } + /* * Avoid ring buffer recursion detection, as this event * is being performed within another event. @@ -360,10 +474,11 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[i]; if (event->fields[i]->is_string) { char *str_val = (char *)(long)var_ref_vals[val_idx]; - char *str_field = (char *)&entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(entry, event, str_val, + event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = event->fields[i]; u64 val = var_ref_vals[val_idx]; @@ -422,8 +537,13 @@ static int __set_synth_event_print_fmt(struct synth_event *event, pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < event->n_fields; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", REC->%s", event->fields[i]->name); + if (event->fields[i]->is_string && + event->fields[i]->is_dynamic) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", event->fields[i]->name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); } #undef LEN_OR_ZERO @@ -465,13 +585,17 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, struct synth_field *field; const char *prefix = NULL, *field_type = argv[0], *field_name, *array; int len, ret = 0; + struct seq_buf s; + ssize_t size; if (field_type[0] == ';') field_type++; if (!strcmp(field_type, "unsigned")) { - if (argc < 3) + if (argc < 3) { + synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type)); return ERR_PTR(-EINVAL); + } prefix = "unsigned "; field_type = argv[1]; field_name = argv[2]; @@ -497,12 +621,19 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, ret = -ENOMEM; goto free; } + if (!is_good_name(field->name)) { + synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name)); + ret = -EINVAL; + goto free; + } if (field_type[0] == ';') field_type++; len = strlen(field_type) + 1; + if (array) len += strlen(array); + if (prefix) len += strlen(prefix); @@ -511,26 +642,60 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, ret = -ENOMEM; goto free; } + seq_buf_init(&s, field->type, len); if (prefix) - strcat(field->type, prefix); - strcat(field->type, field_type); + seq_buf_puts(&s, prefix); + seq_buf_puts(&s, field_type); if (array) { - strcat(field->type, array); - if (field->type[len - 1] == ';') - field->type[len - 1] = '\0'; + seq_buf_puts(&s, array); + if (s.buffer[s.len - 1] == ';') + s.len--; } + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + s.buffer[s.len] = '\0'; - field->size = synth_field_size(field->type); - if (!field->size) { + size = synth_field_size(field->type); + if (size < 0) { + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); ret = -EINVAL; goto free; + } else if (size == 0) { + if (synth_field_is_string(field->type)) { + char *type; + + len = sizeof("__data_loc ") + strlen(field->type) + 1; + type = kzalloc(len, GFP_KERNEL); + if (!type) { + ret = -ENOMEM; + goto free; + } + + seq_buf_init(&s, type, len); + seq_buf_puts(&s, "__data_loc "); + seq_buf_puts(&s, field->type); + + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + s.buffer[s.len] = '\0'; + + kfree(field->type); + field->type = type; + + field->is_dynamic = true; + size = sizeof(u64); + } else { + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); + ret = -EINVAL; + goto free; + } } + field->size = size; if (synth_field_is_string(field->type)) field->is_string = true; field->is_signed = synth_field_signed(field->type); - out: return field; free: @@ -661,6 +826,7 @@ static void free_synth_event(struct synth_event *event) free_synth_field(event->fields[i]); kfree(event->fields); + kfree(event->dynamic_fields); kfree(event->name); kfree(event->class.system); free_synth_tracepoint(event->tp); @@ -671,8 +837,8 @@ static void free_synth_event(struct synth_event *event) static struct synth_event *alloc_synth_event(const char *name, int n_fields, struct synth_field **fields) { + unsigned int i, j, n_dynamic_fields = 0; struct synth_event *event; - unsigned int i; event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) { @@ -694,11 +860,33 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, goto out; } + for (i = 0; i < n_fields; i++) + if (fields[i]->is_dynamic) + n_dynamic_fields++; + + if (n_dynamic_fields) { + event->dynamic_fields = kcalloc(n_dynamic_fields, + sizeof(*event->dynamic_fields), + GFP_KERNEL); + if (!event->dynamic_fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + } + dyn_event_init(&event->devent, &synth_event_ops); - for (i = 0; i < n_fields; i++) + for (i = 0, j = 0; i < n_fields; i++) { event->fields[i] = fields[i]; + if (fields[i]->is_dynamic) { + event->dynamic_fields[j] = fields[i]; + event->dynamic_fields[j]->field_pos = i; + event->dynamic_fields[j++] = fields[i]; + event->n_dynamic_fields++; + } + } event->n_fields = n_fields; out: return event; @@ -710,6 +898,10 @@ static int synth_event_check_arg_fn(void *data) int size; size = synth_field_size((char *)arg_pair->lhs); + if (size == 0) { + if (strstr((char *)arg_pair->lhs, "[")) + return 0; + } return size ? 0 : -EINVAL; } @@ -971,12 +1163,47 @@ int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, } EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); +static int save_cmdstr(int argc, const char *name, const char **argv) +{ + struct seq_buf s; + char *buf; + int i; + + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + seq_buf_init(&s, buf, MAX_DYNEVENT_CMD_LEN); + + seq_buf_puts(&s, name); + + for (i = 0; i < argc; i++) { + seq_buf_putc(&s, ' '); + seq_buf_puts(&s, argv[i]); + } + + if (!seq_buf_buffer_left(&s)) { + synth_err(SYNTH_ERR_CMD_TOO_LONG, 0); + kfree(buf); + return -EINVAL; + } + buf[s.len] = 0; + last_cmd_set(buf); + + kfree(buf); + return 0; +} + static int __create_synth_event(int argc, const char *name, const char **argv) { struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; struct synth_event *event = NULL; int i, consumed = 0, n_fields = 0, ret = 0; + ret = save_cmdstr(argc, name, argv); + if (ret) + return ret; + /* * Argument syntax: * - Add synthetic event: <event_name> field[;field] ... @@ -984,13 +1211,22 @@ static int __create_synth_event(int argc, const char *name, const char **argv) * where 'field' = type field_name */ - if (name[0] == '\0' || argc < 1) + if (name[0] == '\0' || argc < 1) { + synth_err(SYNTH_ERR_CMD_INCOMPLETE, 0); return -EINVAL; + } mutex_lock(&event_mutex); + if (!is_good_name(name)) { + synth_err(SYNTH_ERR_BAD_NAME, errpos(name)); + ret = -EINVAL; + goto out; + } + event = find_synth_event(name); if (event) { + synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name)); ret = -EEXIST; goto out; } @@ -999,6 +1235,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) if (strcmp(argv[i], ";") == 0) continue; if (n_fields == SYNTH_FIELDS_MAX) { + synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; goto err; } @@ -1013,6 +1250,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) } if (i < argc && strcmp(argv[i], ";") != 0) { + synth_err(SYNTH_ERR_INVALID_FIELD, errpos(argv[i])); ret = -EINVAL; goto err; } @@ -1198,10 +1436,9 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) EXPORT_SYMBOL_GPL(synth_event_cmd_init); static inline int -__synth_event_trace_start(struct trace_event_file *file, - struct synth_event_trace_state *trace_state) +__synth_event_trace_init(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) { - int entry_size, fields_size = 0; int ret = 0; memset(trace_state, '\0', sizeof(*trace_state)); @@ -1211,7 +1448,7 @@ __synth_event_trace_start(struct trace_event_file *file, * ENABLED bit is set (which attaches the probe thus allowing * this code to be called, etc). Because this is called * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated + * to honor not logging when disabled. For the iterated * trace case, we save the enabed state upon start and just * ignore the following data calls. */ @@ -1223,8 +1460,20 @@ __synth_event_trace_start(struct trace_event_file *file, } trace_state->event = file->event_call->data; +out: + return ret; +} + +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state, + int dynamic_fields_size) +{ + int entry_size, fields_size = 0; + int ret = 0; fields_size = trace_state->event->n_u64 * sizeof(u64); + fields_size += dynamic_fields_size; /* * Avoid ring buffer recursion detection, as this event @@ -1241,7 +1490,7 @@ __synth_event_trace_start(struct trace_event_file *file, ring_buffer_nest_end(trace_state->buffer); ret = -EINVAL; } -out: + return ret; } @@ -1274,23 +1523,46 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state) */ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) { + unsigned int i, n_u64, len, data_size = 0; struct synth_event_trace_state state; - unsigned int i, n_u64; va_list args; int ret; - ret = __synth_event_trace_start(file, &state); + ret = __synth_event_trace_init(file, &state); if (ret) { if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ return ret; } + if (state.event->n_dynamic_fields) { + va_start(args, n_vals); + + for (i = 0; i < state.event->n_fields; i++) { + u64 val = va_arg(args, u64); + + if (state.event->fields[i]->is_string && + state.event->fields[i]->is_dynamic) { + char *str_val = (char *)(long)val; + + data_size += strlen(str_val) + 1; + } + } + + va_end(args); + } + + ret = __synth_event_trace_start(file, &state, data_size); + if (ret) + return ret; + if (n_vals != state.event->n_fields) { ret = -EINVAL; goto out; } + data_size = 0; + va_start(args, n_vals); for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; @@ -1299,10 +1571,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)val; - char *str_field = (char *)&state.entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(state.entry, state.event, str_val, + state.event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = state.event->fields[i]; @@ -1355,29 +1628,46 @@ EXPORT_SYMBOL_GPL(synth_event_trace); int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals) { + unsigned int i, n_u64, field_pos, len, data_size = 0; struct synth_event_trace_state state; - unsigned int i, n_u64; + char *str_val; int ret; - ret = __synth_event_trace_start(file, &state); + ret = __synth_event_trace_init(file, &state); if (ret) { if (ret == -ENOENT) ret = 0; /* just disabled, not really an error */ return ret; } + if (state.event->n_dynamic_fields) { + for (i = 0; i < state.event->n_dynamic_fields; i++) { + field_pos = state.event->dynamic_fields[i]->field_pos; + str_val = (char *)(long)vals[field_pos]; + len = strlen(str_val) + 1; + data_size += len; + } + } + + ret = __synth_event_trace_start(file, &state, data_size); + if (ret) + return ret; + if (n_vals != state.event->n_fields) { ret = -EINVAL; goto out; } + data_size = 0; + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&state.entry->fields[n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); - n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + len = trace_string(state.entry, state.event, str_val, + state.event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ } else { struct synth_field *field = state.event->fields[i]; u64 val = vals[i]; @@ -1445,9 +1735,17 @@ int synth_event_trace_start(struct trace_event_file *file, if (!trace_state) return -EINVAL; - ret = __synth_event_trace_start(file, trace_state); - if (ret == -ENOENT) - ret = 0; /* just disabled, not really an error */ + ret = __synth_event_trace_init(file, trace_state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (trace_state->event->n_dynamic_fields) + return -ENOTSUPP; + + ret = __synth_event_trace_start(file, trace_state, 0); return ret; } @@ -1508,6 +1806,11 @@ static int __synth_event_add_val(const char *field_name, u64 val, char *str_val = (char *)(long)val; char *str_field; + if (field->is_dynamic) { /* add_val can't do dynamic strings */ + ret = -EINVAL; + goto out; + } + if (!str_val) { ret = -EINVAL; goto out; @@ -1679,14 +1982,22 @@ static int __synth_event_show(struct seq_file *m, struct synth_event *event) { struct synth_field *field; unsigned int i; + char *type, *t; seq_printf(m, "%s\t", event->name); for (i = 0; i < event->n_fields; i++) { field = event->fields[i]; + type = field->type; + t = strstr(type, "__data_loc"); + if (t) { /* __data_loc belongs in format but not event desc */ + t += sizeof("__data_loc"); + type = t; + } + /* parameter values */ - seq_printf(m, "%s %s%s", field->type, field->name, + seq_printf(m, "%s %s%s", type, field->name, i == event->n_fields - 1 ? "" : "; "); } @@ -1754,25 +2065,31 @@ static const struct file_operations synth_events_fops = { .release = seq_release, }; -static __init int trace_events_synth_init(void) +/* + * Register dynevent at core_initcall. This allows kernel to setup kprobe + * events in postcore_initcall without tracefs. + */ +static __init int trace_events_synth_init_early(void) { - struct dentry *entry = NULL; - struct dentry *d_tracer; int err = 0; err = dyn_event_register(&synth_event_ops); - if (err) { + if (err) pr_warn("Could not register synth_event_ops\n"); - return err; - } - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) { - err = PTR_ERR(d_tracer); + return err; +} +core_initcall(trace_events_synth_init_early); + +static __init int trace_events_synth_init(void) +{ + struct dentry *entry = NULL; + int err = 0; + err = tracing_init_dentry(); + if (err) goto err; - } - entry = tracefs_create_file("synthetic_events", 0644, d_tracer, + entry = tracefs_create_file("synthetic_events", 0644, NULL, NULL, &synth_events_fops); if (!entry) { err = -ENODEV; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 70d3d0a09053..90f81d33fa3f 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -176,7 +176,7 @@ struct trace_event_call __used event_##call = { \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ static struct trace_event_call __used \ -__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +__section("_ftrace_events") *__event_##call = &event_##call; #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index dd4dff71d89a..2c2126e1871d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -34,10 +34,14 @@ enum { TRACE_FUNC_OPT_STACK = 0x1, }; -static int allocate_ftrace_ops(struct trace_array *tr) +int ftrace_allocate_ftrace_ops(struct trace_array *tr) { struct ftrace_ops *ops; + /* The top level array uses the "global_ops" */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + ops = kzalloc(sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -48,15 +52,19 @@ static int allocate_ftrace_ops(struct trace_array *tr) tr->ops = ops; ops->private = tr; + return 0; } +void ftrace_free_ftrace_ops(struct trace_array *tr) +{ + kfree(tr->ops); + tr->ops = NULL; +} int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent) { - int ret; - /* * The top level array uses the "global_ops", and the files are * created on boot up. @@ -64,9 +72,8 @@ int ftrace_create_function_files(struct trace_array *tr, if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return 0; - ret = allocate_ftrace_ops(tr); - if (ret) - return ret; + if (!tr->ops) + return -EINVAL; ftrace_create_filter_files(tr->ops, parent); @@ -76,8 +83,7 @@ int ftrace_create_function_files(struct trace_array *tr, void ftrace_destroy_function_files(struct trace_array *tr) { ftrace_destroy_filter_files(tr->ops); - kfree(tr->ops); - tr->ops = NULL; + ftrace_free_ftrace_ops(tr); } static int function_trace_init(struct trace_array *tr) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4a9c49c08ec9..60d66278aa0d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1336,13 +1336,13 @@ static const struct file_operations graph_depth_fops = { static __init int init_graph_tracefs(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("max_graph_depth", 0644, d_tracer, + trace_create_file("max_graph_depth", 0644, NULL, NULL, &graph_depth_fops); return 0; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 17873e5d0353..c9ad5c6fbaad 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -538,14 +538,14 @@ static const struct file_operations window_fops = { */ static int init_tracefs(void) { - struct dentry *d_tracer; + int ret; struct dentry *top_dir; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return -ENOMEM; - top_dir = tracefs_create_dir("hwlat_detector", d_tracer); + top_dir = tracefs_create_dir("hwlat_detector", NULL); if (!top_dir) return -ENOMEM; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 19c00ee90945..b911e9f6d9f5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -718,6 +718,9 @@ static int trace_kprobe_create(int argc, const char *argv[]) * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * Or + * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS] + * * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -747,7 +750,6 @@ static int trace_kprobe_create(int argc, const char *argv[]) switch (argv[0][0]) { case 'r': is_return = true; - flags |= TPARG_FL_RETURN; break; case 'p': break; @@ -805,12 +807,26 @@ static int trace_kprobe_create(int argc, const char *argv[]) symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) return -ENOMEM; + + tmp = strchr(symbol, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); + goto parse_error; + } + } + /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { trace_probe_log_err(0, BAD_PROBE_ADDR); goto parse_error; } + if (is_return) + flags |= TPARG_FL_RETURN; if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { @@ -1881,8 +1897,8 @@ static __init void setup_boot_kprobe_events(void) } /* - * Register dynevent at subsys_initcall. This allows kernel to setup kprobe - * events in fs_initcall without tracefs. + * Register dynevent at core_initcall. This allows kernel to setup kprobe + * events in postcore_initcall without tracefs. */ static __init int init_kprobe_trace_early(void) { @@ -1897,19 +1913,19 @@ static __init int init_kprobe_trace_early(void) return 0; } -subsys_initcall(init_kprobe_trace_early); +core_initcall(init_kprobe_trace_early); /* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { - struct dentry *d_tracer; + int ret; struct dentry *entry; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - entry = tracefs_create_file("kprobe_events", 0644, d_tracer, + entry = tracefs_create_file("kprobe_events", 0644, NULL, NULL, &kprobe_events_ops); /* Event list interface */ @@ -1917,7 +1933,7 @@ static __init int init_kprobe_trace(void) pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, + entry = tracefs_create_file("kprobe_profile", 0444, NULL, NULL, &kprobe_profile_ops); if (!entry) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index bb7783b90361..ff32476df072 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -367,13 +367,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, NULL, NULL, &ftrace_formats_fops); return 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index a22b62813f8c..2f703a20c724 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -16,7 +16,6 @@ #include <linux/tracefs.h> #include <linux/types.h> #include <linux/string.h> -#include <linux/ctype.h> #include <linux/ptrace.h> #include <linux/perf_event.h> #include <linux/kprobes.h> @@ -348,18 +347,6 @@ bool trace_probe_match_command_args(struct trace_probe *tp, #define trace_probe_for_each_link_rcu(pos, tp) \ list_for_each_entry_rcu(pos, &(tp)->event->files, list) -/* Check the name is good for event/group/fields */ -static inline bool is_good_name(const char *name) -{ - if (!isalpha(*name) && *name != '_') - return false; - while (*++name != '\0') { - if (!isalpha(*name) && !isdigit(*name) && *name != '_') - return false; - } - return true; -} - #define TPARG_FL_RETURN BIT(0) #define TPARG_FL_KERNEL BIT(1) #define TPARG_FL_FENTRY BIT(2) @@ -404,6 +391,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 98bba4764c52..c408423e5d65 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -554,20 +554,20 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { - struct dentry *d_tracer; + int ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("stack_max_size", 0644, d_tracer, + trace_create_file("stack_max_size", 0644, NULL, &stack_trace_max_size, &stack_max_size_fops); - trace_create_file("stack_trace", 0444, d_tracer, + trace_create_file("stack_trace", 0444, NULL, NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0644, d_tracer, + trace_create_file("stack_trace_filter", 0644, NULL, &trace_ops, &stack_trace_filter_fops); #endif diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index d1fa19773cc8..8d141c3825a9 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -276,13 +276,13 @@ static const struct file_operations tracing_stat_fops = { static int tracing_stat_init(void) { - struct dentry *d_tracing; + int ret; - d_tracing = tracing_init_dentry(); - if (IS_ERR(d_tracing)) + ret = tracing_init_dentry(); + if (ret) return -ENODEV; - stat_dir = tracefs_create_dir("trace_stat", d_tracing); + stat_dir = tracefs_create_dir("trace_stat", NULL); if (!stat_dir) { pr_warn("Could not create tracefs 'trace_stat' entry\n"); return -ENOMEM; diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index ac35c45207c4..6e146b959dcd 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -7,7 +7,7 @@ #define SYNTH_SYSTEM "synthetic" #define SYNTH_FIELDS_MAX 32 -#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */ struct synth_field { char *type; @@ -16,6 +16,8 @@ struct synth_field { unsigned int offset; bool is_signed; bool is_string; + bool is_dynamic; + bool field_pos; }; struct synth_event { @@ -24,6 +26,8 @@ struct synth_event { char *name; struct synth_field **fields; unsigned int n_fields; + struct synth_field **dynamic_fields; + unsigned int n_dynamic_fields; unsigned int n_u64; struct trace_event_class class; struct trace_event_call call; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f4286c9bdeb4..3cf7128e1ad3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -528,7 +528,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int trace_uprobe_create(int argc, const char **argv) { @@ -617,6 +617,19 @@ static int trace_uprobe_create(int argc, const char **argv) } } + /* Check if there is %return suffix */ + tmp = strchr(arg, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); + ret = -EINVAL; + goto fail_address_parse; + } + } + /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); if (ret) { @@ -1625,21 +1638,20 @@ void destroy_local_trace_uprobe(struct trace_event_call *event_call) /* Make a trace interface for controling probe points */ static __init int init_uprobe_trace(void) { - struct dentry *d_tracer; int ret; ret = dyn_event_register(&trace_uprobe_ops); if (ret) return ret; - d_tracer = tracing_init_dentry(); - if (IS_ERR(d_tracer)) + ret = tracing_init_dentry(); + if (ret) return 0; - trace_create_file("uprobe_events", 0644, d_tracer, + trace_create_file("uprobe_events", 0644, NULL, NULL, &uprobe_events_ops); /* Profile interface */ - trace_create_file("uprobe_profile", 0444, d_tracer, + trace_create_file("uprobe_profile", 0444, NULL, NULL, &uprobe_profile_ops); return 0; } diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 74738c9856f1..4b50fc0cb12c 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -260,7 +260,7 @@ int tracing_map_add_var(struct tracing_map *map) * to use cmp_fn. * * A key can be a subset of a compound key; for that purpose, the - * offset param is used to describe where within the the compound key + * offset param is used to describe where within the compound key * the key referenced by this key field resides. * * Return: The index identifying the field in the map and associated diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 26efd22f0633..3f659f855074 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -50,7 +50,7 @@ static bool ok_to_free_tracepoints; */ struct tp_probes { struct rcu_head rcu; - struct tracepoint_func probes[0]; + struct tracepoint_func probes[]; }; static inline void *allocate_probes(int count) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..e703d5d9cbe8 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -515,7 +515,7 @@ EXPORT_SYMBOL(from_kgid_munged); * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test - * for and handle handle INVALID_PROJID being returned. INVALID_PROJID + * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ac088ce6059b..437935e7a199 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1212,11 +1212,14 @@ out_put: * stable state - idle, on timer or on worklist. * * Return: + * + * ======== ================================================================ * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * -ENOENT if someone else is canceling @work, this state may persist * for arbitrarily long + * ======== ================================================================ * * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting |