diff options
Diffstat (limited to 'kernel')
167 files changed, 17445 insertions, 4915 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 53abf008ecb3..e2ec54e2b952 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -14,11 +14,21 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER -# Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +# Do not trace internal ftrace files CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() +# in coverage traces. +KCOV_INSTRUMENT_softirq.o := n +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_extable.o := n +# Don't self-instrument. +KCOV_INSTRUMENT_kcov.o := n +KASAN_SANITIZE_kcov.o := n + # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -69,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o @@ -80,9 +91,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o -obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o -obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o +obj-$(CONFIG_ELFCORE) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ diff --git a/kernel/audit.c b/kernel/audit.c index d5971010e44a..8d528f9930da 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -807,6 +807,16 @@ static int audit_set_feature(struct sk_buff *skb) return 0; } +static int audit_replace(pid_t pid) +{ + struct sk_buff 
*skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, + &pid, sizeof(pid)); + + if (!skb) + return -ENOMEM; + return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); +} + static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -868,9 +878,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (s.mask & AUDIT_STATUS_PID) { int new_pid = s.pid; + pid_t requesting_pid = task_tgid_vnr(current); - if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) + if ((!new_pid) && (requesting_pid != audit_pid)) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); return -EACCES; + } + if (audit_pid && new_pid && + audit_replace(requesting_pid) != -ECONNREFUSED) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); + return -EEXIST; + } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; @@ -918,7 +936,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_current(); + err = tty_audit_push(); if (err) break; } @@ -1028,20 +1046,19 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TTY_GET: { struct audit_tty_status s; - struct task_struct *tsk = current; + unsigned int t; - spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty; - s.log_passwd = tsk->signal->audit_tty_log_passwd; - spin_unlock(&tsk->sighand->siglock); + t = READ_ONCE(current->signal->audit_tty); + s.enabled = t & AUDIT_TTY_ENABLE; + s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; - struct task_struct *tsk = current; struct audit_buffer *ab; + unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ @@ -1051,14 +1068,14 @@ static int 
audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; - spin_lock(&tsk->sighand->siglock); - old.enabled = tsk->signal->audit_tty; - old.log_passwd = tsk->signal->audit_tty_log_passwd; - if (!err) { - tsk->signal->audit_tty = s.enabled; - tsk->signal->audit_tty_log_passwd = s.log_passwd; + if (err) + t = READ_ONCE(current->signal->audit_tty); + else { + t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); + t = xchg(&current->signal->audit_tty, t); } - spin_unlock(&tsk->sighand->siglock); + old.enabled = t & AUDIT_TTY_ENABLE; + old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 9f194aad0adc..d6709eb70970 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -185,7 +185,7 @@ static struct audit_watch *audit_init_watch(char *path) return watch; } -/* Translate a watch string to kernel respresentation. */ +/* Translate a watch string to kernel representation. */ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; @@ -367,7 +367,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ - watch->dev = d_backing_inode(d)->i_sb->s_dev; + watch->dev = d->d_sb->s_dev; watch->ino = d_backing_inode(d)->i_ino; } dput(d); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b8ff9e193753..94ca7b1e5e7e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -158,7 +158,7 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } -/* Translate an inode field to kernel respresentation. */ +/* Translate an inode field to kernel representation.
*/ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { @@ -415,7 +415,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return 0; } -/* Translate struct audit_rule_data to kernel's rule respresentation. */ +/* Translate struct audit_rule_data to kernel's rule representation. */ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t datasz) { @@ -593,7 +593,7 @@ static inline size_t audit_pack_string(void **bufp, const char *str) return len; } -/* Translate kernel rule respresentation to struct audit_rule_data. */ +/* Translate kernel rule representation to struct audit_rule_data. */ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { struct audit_rule_data *data; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 60a354eed2fa..2672d105cffc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2416,8 +2416,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, is_compat_task(), - KSTK_EIP(current), code); + signr, syscall_get_arch(), syscall, + in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 13272582eee0..eed911d091da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,4 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 89ebbc4d1164..76d5a794e426 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,15 +17,43 @@ #include <linux/filter.h> #include <linux/perf_event.h> 
+static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} + /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0) + attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) @@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - 
array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -67,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -99,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void 
*key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, map->value_size); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); + return 0; +} + +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. 
During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); return 0; } @@ -133,6 +259,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -150,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 972d9a8e4ac4..b94a36550591 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include <linux/random.h> #include <linux/moduleloader.h> #include <linux/bpf.h> +#include <linux/frame.h> #include <asm/unaligned.h> @@ -128,14 +129,83 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, return fp; } -EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { kfree(fp->aux); vfree(fp); } -EXPORT_SYMBOL_GPL(__bpf_prog_free); + +static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + /* Call and Exit 
are both special jumps with no + * target inside the BPF instruction image. + */ + BPF_OP(insn->code) != BPF_CALL && + BPF_OP(insn->code) != BPF_EXIT; +} + +static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 i, insn_cnt = prog->len; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_is_jmp_and_has_target(insn)) + continue; + + /* Adjust offset of jmps if we cross boundaries. */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) + insn->off -= delta; + } +} + +struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, + const struct bpf_insn *patch, u32 len) +{ + u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + struct bpf_prog *prog_adj; + + /* Since our patchlet doesn't expand the image, we're done. */ + if (insn_delta == 0) { + memcpy(prog->insnsi + off, patch, sizeof(*patch)); + return prog; + } + + insn_adj_cnt = prog->len + insn_delta; + + /* Several new instructions need to be inserted. Make room + * for them. Likely, there's no need for a new allocation as + * last page could have large enough tailroom. + */ + prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), + GFP_USER); + if (!prog_adj) + return NULL; + + prog_adj->len = insn_adj_cnt; + + /* Patching happens in 3 steps: + * + * 1) Move over tail of insnsi from next instruction onwards, + * so we can patch the single target insn with one or more + * new ones (patching is always from 1 to n insns, n > 0). + * 2) Inject new instructions at the target location. + * 3) Adjust branch offsets if necessary. 
+ */ + insn_rest = insn_adj_cnt - off - len; + + memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, + sizeof(*patch) * insn_rest); + memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); + + bpf_adj_branches(prog_adj, off, insn_delta); + + return prog_adj; +} #ifdef CONFIG_BPF_JIT struct bpf_binary_header * @@ -161,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->pages = size / PAGE_SIZE; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (prandom_u32() % hole) & ~(alignment - 1); + start = (get_random_int() % hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -173,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_memfree(hdr); } + +int bpf_jit_harden __read_mostly; + +static int bpf_jit_blind_insn(const struct bpf_insn *from, + const struct bpf_insn *aux, + struct bpf_insn *to_buff) +{ + struct bpf_insn *to = to_buff; + u32 imm_rnd = get_random_int(); + s16 off; + + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + + if (from->imm == 0 && + (from->code == (BPF_ALU | BPF_MOV | BPF_K) || + from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { + *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); + goto out; + } + + switch (from->code) { + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_MOV | BPF_K: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_K: + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_ALU64 | BPF_ADD | BPF_K: + case BPF_ALU64 | BPF_SUB | BPF_K: + case BPF_ALU64 | 
BPF_AND | BPF_K: + case BPF_ALU64 | BPF_OR | BPF_K: + case BPF_ALU64 | BPF_XOR | BPF_K: + case BPF_ALU64 | BPF_MUL | BPF_K: + case BPF_ALU64 | BPF_MOV | BPF_K: + case BPF_ALU64 | BPF_DIV | BPF_K: + case BPF_ALU64 | BPF_MOD | BPF_K: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX); + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JNE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. */ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); + break; + + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); + *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); + break; + + case BPF_LD | BPF_IMM | BPF_DW: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); + break; + case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. 
*/ + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); + break; + + case BPF_ST | BPF_MEM | BPF_DW: + case BPF_ST | BPF_MEM | BPF_W: + case BPF_ST | BPF_MEM | BPF_H: + case BPF_ST | BPF_MEM | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); + break; + } +out: + return to - to_buff; +} + +static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + kmemcheck_annotate_bitfield(fp, meta); + + /* aux->prog still points to the fp_other one, so + * when promoting the clone to the real program, + * this still needs to be adapted. + */ + memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); + } + + return fp; +} + +static void bpf_prog_clone_free(struct bpf_prog *fp) +{ + /* aux was stolen by the other clone, so we cannot free + * it from this path! It will be freed eventually by the + * other program on release. + * + * At this point, we don't need a deferred release since + * clone is guaranteed to not be locked. + */ + fp->aux = NULL; + __bpf_prog_free(fp); +} + +void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) +{ + /* We have to repoint aux->prog to self, as we don't + * know whether fp here is the clone or the original. 
+ */ + fp->aux->prog = fp; + bpf_prog_clone_free(fp_other); +} + +struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) +{ + struct bpf_insn insn_buff[16], aux[2]; + struct bpf_prog *clone, *tmp; + int insn_delta, insn_cnt; + struct bpf_insn *insn; + int i, rewritten; + + if (!bpf_jit_blinding_enabled()) + return prog; + + clone = bpf_prog_clone_create(prog, GFP_USER); + if (!clone) + return ERR_PTR(-ENOMEM); + + insn_cnt = clone->len; + insn = clone->insnsi; + + for (i = 0; i < insn_cnt; i++, insn++) { + /* We temporarily need to hold the original ld64 insn + * so that we can still access the first part in the + * second blinding run. + */ + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && + insn[1].code == 0) + memcpy(aux, insn, sizeof(aux)); + + rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); + if (!rewritten) + continue; + + tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); + if (!tmp) { + /* Patching may have repointed aux->prog during + * realloc from the original one, so we need to + * fix it up here on error. + */ + bpf_jit_prog_release_other(prog, clone); + return ERR_PTR(-ENOMEM); + } + + clone = tmp; + insn_delta = rewritten - 1; + + /* Walk new program and skip insns we just inserted. */ + insn = clone->insnsi + i + insn_delta; + insn_cnt += insn_delta; + i += insn_delta; + } + + return clone; +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. 
Needs to go into .text section, @@ -649,6 +922,7 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } +STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) @@ -690,15 +964,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program + * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via BPF_PROG_RUN() macro. */ -int bpf_prog_select_runtime(struct bpf_prog *fp) +struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { fp->bpf_func = (void *) __bpf_prog_run; - bpf_int_jit_compile(fp); + /* eBPF JITs can rewrite the program in case constant + * blinding is active. However, in case of error during + * blinding, bpf_int_jit_compile() must always return a + * valid program, which in this case would simply not + * be JITed, but falls back to the interpreter. + */ + fp = bpf_int_jit_compile(fp); bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -706,7 +987,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp) * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. 
*/ - return bpf_check_tail_call(fp); + *err = bpf_check_tail_call(fp); + + return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -762,14 +1045,21 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; + const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; + const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } +const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +{ + return NULL; +} + /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { .func = NULL, @@ -781,8 +1071,14 @@ const struct bpf_func_proto bpf_tail_call_proto = { }; /* For classic BPF JITs that don't implement bpf_int_jit_compile(). 
*/ -void __weak bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ + return prog; +} + +bool __weak bpf_helper_changes_skb_data(void *func) { + return false; } /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5b30fd8a315..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,6 +14,7 @@ #include <linux/jhash.h> #include <linux/filter.h> #include <linux/vmalloc.h> +#include "percpu_freelist.h" struct bucket { struct hlist_head head; @@ -22,6 +24,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -29,26 +33,108 @@ struct bpf_htab { /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; struct rcu_head rcu; u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != 
BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, + pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct bpf_htab *htab; int err, i; + u64 cost; + + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); /* mandatory map attributes */ + htab->map.map_type = attr->map_type; htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. 
* value_size == 0 may be allowed in the future to use map as a set @@ -77,24 +163,39 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; + if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) + /* make sure the size for pcpu_alloc() is reasonable */ + goto free_htab; + htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; + round_up(htab->map.key_size, 8); + if (percpu) + htab->elem_size += sizeof(void *); + else + htab->elem_size += round_up(htab->map.value_size, 8); /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries >= - U32_MAX - PAGE_SIZE) + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (percpu) + cost += (u64) round_up(htab->map.value_size, 8) * + num_possible_cpus() * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), @@ -111,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - atomic_set(&htab->count, 0); + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); 
@@ -148,7 +255,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, } /* Called from syscall or from eBPF program */ -static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; @@ -166,6 +273,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); + return l; +} + +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + if (l) return l->key + round_up(map->key_size, 8); @@ -226,86 +340,248 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) +{ + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + kfree(l); + +} + +static void htab_elem_free_rcu(struct rcu_head *head) +{ + struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; + + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); +} + +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); + } else { + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); + } +} + +static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, + void *value, u32 key_size, u32 hash, + bool percpu, bool onallcpus) +{ 
+ u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); + struct htab_elem *l_new; + void __percpu *pptr; + + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } + + memcpy(l_new->key, key, key_size); + if (percpu) { + /* round up value_size to 8 bytes */ + size = round_up(size, 8); + + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } + } + + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); + } else { + memcpy(l_new->key + round_up(key_size, 8), value, size); + } + + l_new->hash = hash; + return l_new; +} + +static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, + u64 map_flags) +{ + if (l_old && map_flags == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!l_old && map_flags == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + /* Called from syscall or from eBPF program */ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new, *l_old; + struct htab_elem *l_new = NULL, *l_old; struct hlist_head *head; - struct bucket *b; unsigned long flags; - u32 key_size; + 
struct bucket *b; + u32 key_size, hash; int ret; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held()); - /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return -ENOMEM; - key_size = map->key_size; - memcpy(l_new->key, key, key_size); - memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + hash = htab_map_hash(key, key_size); - l_new->hash = htab_map_hash(l_new->key, key_size); - b = __select_bucket(htab, l_new->hash); + b = __select_bucket(htab, hash); head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size); - if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - ret = -E2BIG; + ret = check_flags(htab, l_old, map_flags); + if (ret) goto err; - } - if (l_old && map_flags == BPF_NOEXIST) { - /* elem already exists */ - ret = -EEXIST; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); goto err; } - if (!l_old && map_flags == BPF_EXIST) { - /* elem doesn't exist, cannot update it */ - ret = -ENOENT; - goto err; - } - - /* add new element to the head of the list, so that concurrent - * search will find it before old elem + /* add new element to the head of the list, so that + * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - atomic_inc(&htab->count); + free_htab_elem(htab, l_old); } + ret = 0; +err: raw_spin_unlock_irqrestore(&b->lock, 
flags); + return ret; +} - return 0; +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + + /* per-cpu hash map can update value in-place */ + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + } else { + l_new = alloc_htab_elem(htab, key, value, key_size, + hash, true, onallcpus); + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); + goto err; + } + hlist_add_head_rcu(&l_new->hash_node, head); + } + ret = 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); - kfree(l_new); return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -331,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - 
kfree_rcu(l, rcu); + free_htab_elem(htab, l); ret = 0; } @@ -351,12 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - kfree(l); + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -369,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } @@ -391,9 +670,76 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +/* Called from eBPF program */ +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + else + return NULL; +} + +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } 
+ ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + int ret; + + rcu_read_lock(); + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + rcu_read_unlock(); + + return ret; +} + +static const struct bpf_map_ops htab_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_update_elem = htab_percpu_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_percpu_type __read_mostly = { + .ops = &htab_percpu_ops, + .type = BPF_MAP_TYPE_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); + bpf_register_map_type(&htab_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4504ca66118d..ad7a0573f71b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -163,17 +163,26 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) struct task_struct *task = current; char *buf = (char *) (long) r1; - if (!task) - return -EINVAL; + if (unlikely(!task)) + goto err_clear; - memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strncpy(buf, task->comm, size); + + /* Verifier guarantees that size > 0. For task->comm exceeding + * size, guarantee that buf is %NUL-terminated. Unconditionally + * done here to save the size test. 
+ */ + buf[size - 1] = 0; return 0; +err_clear: + memset(buf, 0, size); + return -EINVAL; } const struct bpf_func_proto bpf_get_current_comm_proto = { .func = bpf_get_current_comm, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, + .arg1_type = ARG_PTR_TO_RAW_STACK, .arg2_type = ARG_CONST_STACK_SIZE, }; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index f2ece3c174a5..318858edb1cd 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: - atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); + raw = bpf_prog_inc(raw); break; case BPF_TYPE_MAP: - bpf_map_inc(raw, true); + raw = bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } -static bool bpf_dname_reserved(const struct dentry *dentry) -{ - return strchr(dentry->d_name.name, '.'); -} - static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - if (bpf_dname_reserved(dentry)) - return -EPERM; - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, { struct inode *inode; - if (bpf_dname_reserved(dentry)) - return -EPERM; - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -187,31 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } -static int bpf_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) +static struct dentry * +bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { - if (bpf_dname_reserved(new_dentry)) - return -EPERM; - - return simple_link(old_dentry, dir, new_dentry); -} - -static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, 
- struct inode *new_dir, struct dentry *new_dentry) -{ - if (bpf_dname_reserved(new_dentry)) - return -EPERM; - - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (strchr(dentry->d_name.name, '.')) + return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } static const struct inode_operations bpf_dir_iops = { - .lookup = simple_lookup, + .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, - .rename = bpf_rename, - .link = bpf_link, + .rename = simple_rename, + .link = simple_link, .unlink = simple_unlink, }; @@ -297,7 +276,8 @@ static void *bpf_obj_do_get(const struct filename *pathname, goto out; raw = bpf_any_get(inode->i_private, *type); - touch_atime(&path); + if (!IS_ERR(raw)) + touch_atime(&path); path_put(&path); return raw; @@ -377,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) static struct dentry *bpf_mount(struct file_system_type *type, int flags, const char *dev_name, void *data) { - return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); + return mount_nodev(type, flags, data, bpf_fill_super); } static struct file_system_type bpf_fs_type = { @@ -385,7 +365,6 @@ static struct file_system_type bpf_fs_type = { .name = "bpf", .mount = bpf_mount, .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, }; MODULE_ALIAS_FS("bpf"); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c new file mode 100644 index 000000000000..5c51d1985b51 --- /dev/null +++ b/kernel/bpf/percpu_freelist.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include "percpu_freelist.h" + +int pcpu_freelist_init(struct pcpu_freelist *s) +{ + int cpu; + + s->freelist = alloc_percpu(struct pcpu_freelist_head); + if (!s->freelist) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); + + raw_spin_lock_init(&head->lock); + head->first = NULL; + } + return 0; +} + +void pcpu_freelist_destroy(struct pcpu_freelist *s) +{ + free_percpu(s->freelist); +} + +static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + raw_spin_lock(&head->lock); + node->next = head->first; + head->first = node; + raw_spin_unlock(&head->lock); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); + + __pcpu_freelist_push(head, node); +} + +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems) +{ + struct pcpu_freelist_head *head; + unsigned long flags; + int i, cpu, pcpu_entries; + + pcpu_entries = nr_elems / num_possible_cpus() + 1; + i = 0; + + /* disable irq to workaround lockdep false positive + * in bpf usage pcpu_freelist_populate() will never race + * with pcpu_freelist_push() + */ + local_irq_save(flags); + for_each_possible_cpu(cpu) { +again: + head = per_cpu_ptr(s->freelist, cpu); + __pcpu_freelist_push(head, buf); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } + local_irq_restore(flags); +} + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + raw_spin_lock(&head->lock); + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); 
+ cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + return NULL; + } +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h new file mode 100644 index 000000000000..3049aae8ea1e --- /dev/null +++ b/kernel/bpf/percpu_freelist.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __PERCPU_FREELIST_H__ +#define __PERCPU_FREELIST_H__ +#include <linux/spinlock.h> +#include <linux/percpu.h> + +struct pcpu_freelist_head { + struct pcpu_freelist_node *first; + raw_spinlock_t lock; +}; + +struct pcpu_freelist { + struct pcpu_freelist_head __percpu *freelist; +}; + +struct pcpu_freelist_node { + struct pcpu_freelist_node *next; +}; + +void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems); +int pcpu_freelist_init(struct pcpu_freelist *); +void pcpu_freelist_destroy(struct pcpu_freelist *s); +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c new file mode 100644 index 000000000000..080a2dfb5800 --- /dev/null +++ b/kernel/bpf/stackmap.c @@ -0,0 +1,291 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> +#include <linux/stacktrace.h> +#include <linux/perf_event.h> +#include "percpu_freelist.h" + +struct stack_map_bucket { + struct pcpu_freelist_node fnode; + u32 hash; + u32 nr; + u64 ip[]; +}; + +struct bpf_stack_map { + struct bpf_map map; + void *elems; + struct pcpu_freelist freelist; + u32 n_buckets; + struct stack_map_bucket *buckets[]; +}; + +static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +{ + u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + int err; + + smap->elems = vzalloc(elem_size * smap->map.max_entries); + if (!smap->elems) + return -ENOMEM; + + err = pcpu_freelist_init(&smap->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, + smap->map.max_entries); + return 0; + +free_elems: + vfree(smap->elems); + return err; +} + +/* Called from syscall */ +static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +{ + u32 value_size = attr->value_size; + struct bpf_stack_map *smap; + u64 cost, n_buckets; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->map_flags) + return ERR_PTR(-EINVAL); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + value_size < 8 || value_size % 8 || + value_size / 8 > sysctl_perf_event_max_stack) + return ERR_PTR(-EINVAL); + + /* hash table size must be power of 2 */ + n_buckets = roundup_pow_of_two(attr->max_entries); + + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); + if (!smap) { + smap = vzalloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); + } + + err = -E2BIG; + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_smap; + + smap->map.map_type = 
attr->map_type; + smap->map.key_size = attr->key_size; + smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->n_buckets = n_buckets; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(smap->map.pages); + if (err) + goto free_smap; + + err = get_callchain_buffers(); + if (err) + goto free_smap; + + err = prealloc_elems_and_freelist(smap); + if (err) + goto put_buffers; + + return &smap->map; + +put_buffers: + put_callchain_buffers(); +free_smap: + kvfree(smap); + return ERR_PTR(err); +} + +u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct perf_callchain_entry *trace; + struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 max_depth = map->value_size / 8; + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 hash, id, trace_nr, trace_len; + bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + /* get_perf_callchain() guarantees that trace->nr >= init_nr + * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth + */ + trace_nr = trace->nr - init_nr; + + if (trace_nr <= skip) + /* skipping more than usable stack trace */ + return -EFAULT; + + trace_nr -= skip; + trace_len = trace_nr * sizeof(u64); + ips = trace->ip + skip + init_nr; + hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 
0); + id = hash & (smap->n_buckets - 1); + bucket = READ_ONCE(smap->buckets[id]); + + if (bucket && bucket->hash == hash) { + if (flags & BPF_F_FAST_STACK_CMP) + return id; + if (bucket->nr == trace_nr && + memcmp(bucket->ip, ips, trace_len) == 0) + return id; + } + + /* this call stack is not in the map, try to add it */ + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + + memcpy(new_bucket->ip, ips, trace_len); + new_bucket->hash = hash; + new_bucket->nr = trace_nr; + + old_bucket = xchg(&smap->buckets[id], new_bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return id; +} + +const struct bpf_func_proto bpf_get_stackid_proto = { + .func = bpf_get_stackid, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* Called from eBPF program */ +static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall */ +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *bucket, *old_bucket; + u32 id = *(u32 *)key, trace_len; + + if (unlikely(id >= smap->n_buckets)) + return -ENOENT; + + bucket = xchg(&smap->buckets[id], NULL); + if (!bucket) + return -ENOENT; + + trace_len = bucket->nr * sizeof(u64); + memcpy(value, bucket->ip, trace_len); + memset(value + trace_len, 0, map->value_size - trace_len); + + old_bucket = xchg(&smap->buckets[id], bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; +} + +static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EINVAL; +} + +static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 
map_flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int stack_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *old_bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return -E2BIG; + + old_bucket = xchg(&smap->buckets[id], NULL); + if (old_bucket) { + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; + } else { + return -ENOENT; + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void stack_map_free(struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + + /* wait for bpf programs to complete before freeing stack map */ + synchronize_rcu(); + + vfree(smap->elems); + pcpu_freelist_destroy(&smap->freelist); + kvfree(smap); + put_callchain_buffers(); +} + +static const struct bpf_map_ops stack_map_ops = { + .map_alloc = stack_map_alloc, + .map_free = stack_map_free, + .map_get_next_key = stack_map_get_next_key, + .map_lookup_elem = stack_map_lookup_elem, + .map_update_elem = stack_map_update_elem, + .map_delete_elem = stack_map_delete_elem, +}; + +static struct bpf_map_type_list stack_map_type __read_mostly = { + .ops = &stack_map_ops, + .type = BPF_MAP_TYPE_STACK_TRACE, +}; + +static int __init register_stack_map(void) +{ + bpf_register_map_type(&stack_map_type); + return 0; +} +late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 637397059f76..46ecce4b79ed 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include <linux/filter.h> #include <linux/version.h> +DEFINE_PER_CPU(int, bpf_prog_active); + int sysctl_unprivileged_bpf_disabled __read_mostly; static LIST_HEAD(bpf_map_types); @@ -46,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int 
bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + static int bpf_map_charge_memlock(struct bpf_map *map) { struct user_struct *user = get_current_user(); @@ -122,11 +137,13 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" - "max_entries:\t%u\n", + "max_entries:\t%u\n" + "map_flags:\t%#x\n", map->map_type, map->key_size, map->value_size, - map->max_entries); + map->max_entries, + map->map_flags); } #endif @@ -151,7 +168,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -201,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -void bpf_map_inc(struct bpf_map *map, bool uref) +/* prog's and map's refcnt limit */ +#define BPF_MAX_REFCNT 32768 + +struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) { - atomic_inc(&map->refcnt); + if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&map->refcnt); + return ERR_PTR(-EBUSY); + } if (uref) atomic_inc(&map->usercnt); + return map; } struct bpf_map *bpf_map_get_with_uref(u32 ufd) @@ -217,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) if (IS_ERR(map)) return map; - bpf_map_inc(map, true); + map = bpf_map_inc(map, true); fdput(f); return map; @@ -229,6 +253,11 @@ static void __user *u64_to_ptr(__u64 val) return (void __user *) (unsigned long) val; } +int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + return -ENOTSUPP; +} + /* last field in 'union 
bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -239,6 +268,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -259,23 +289,37 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 
0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -298,6 +342,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -318,21 +363,37 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_value: kfree(value); @@ -371,9 +432,13 @@ static int map_delete_elem(union bpf_attr *attr) if 
(copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + preempt_disable(); + __this_cpu_inc(bpf_prog_active); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_key: kfree(key); @@ -600,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f) return f.file->private_data; } +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_dec(&prog->aux->refcnt); + return ERR_PTR(-EBUSY); + } + return prog; +} + /* called by sockets/tracing/seccomp before attaching program to an event * pairs with bpf_prog_put() */ @@ -612,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) if (IS_ERR(prog)) return prog; - atomic_inc(&prog->aux->refcnt); + prog = bpf_prog_inc(prog); fdput(f); return prog; @@ -688,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - err = bpf_prog_select_runtime(prog); + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e7f7ab739e4..eec9f90ba030 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -125,24 +126,18 @@ * are set to NOT_INIT to indicate that they are no longer readable. 
*/ -/* types of values stored in eBPF registers */ -enum bpf_reg_type { - NOT_INIT = 0, /* nothing was written into register */ - UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ - PTR_TO_CTX, /* reg points to bpf_context */ - CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ - PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ - FRAME_PTR, /* reg == frame_pointer */ - PTR_TO_STACK, /* reg == frame_pointer + imm */ - CONST_IMM, /* constant integer value */ -}; - struct reg_state { enum bpf_reg_type type; union { - /* valid when type == CONST_IMM | PTR_TO_STACK */ - int imm; + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; + + /* valid when type == PTR_TO_PACKET* */ + struct { + u32 id; + u16 off; + u16 range; + }; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL @@ -202,6 +197,16 @@ struct verifier_env { bool allow_ptr_leaks; }; +#define BPF_COMPLEXITY_LIMIT_INSNS 65536 +#define BPF_COMPLEXITY_LIMIT_STACK 1024 + +struct bpf_call_arg_meta { + struct bpf_map *map_ptr; + bool raw_mode; + int regno; + int access_size; +}; + /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ @@ -237,39 +242,39 @@ static const char * const reg_type_str[] = { [FRAME_PTR] = "fp", [PTR_TO_STACK] = "fp", [CONST_IMM] = "imm", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_END] = "pkt_end", }; -static const struct { - int map_type; - int func_id; -} func_limit[] = { - {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, - {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, -}; - -static void print_verifier_state(struct verifier_env *env) +static void print_verifier_state(struct verifier_state *state) { + struct reg_state *reg; enum bpf_reg_type t; int i; for (i = 0; i < MAX_BPF_REG; i++) { - t = 
env->cur_state.regs[i].type; + reg = &state->regs[i]; + t = reg->type; if (t == NOT_INIT) continue; verbose(" R%d=%s", i, reg_type_str[t]); if (t == CONST_IMM || t == PTR_TO_STACK) - verbose("%d", env->cur_state.regs[i].imm); + verbose("%lld", reg->imm); + else if (t == PTR_TO_PACKET) + verbose("(id=%d,off=%d,r=%d)", + reg->id, reg->off, reg->range); + else if (t == UNKNOWN_VALUE && reg->imm) + verbose("%lld", reg->imm); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose("(ks=%d,vs=%d)", - env->cur_state.regs[i].map_ptr->key_size, - env->cur_state.regs[i].map_ptr->value_size); + reg->map_ptr->key_size, + reg->map_ptr->value_size); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (env->cur_state.stack_slot_type[i] == STACK_SPILL) + if (state->stack_slot_type[i] == STACK_SPILL) verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); + reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } verbose("\n"); } @@ -453,7 +458,7 @@ static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, elem->next = env->head; env->head = elem; env->stack_size++; - if (env->stack_size > 1024) { + if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose("BPF program is too complex\n"); goto err; } @@ -476,7 +481,6 @@ static void init_reg_state(struct reg_state *regs) for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; - regs[i].map_ptr = NULL; } /* frame pointer */ @@ -491,7 +495,6 @@ static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].imm = 0; - regs[regno].map_ptr = NULL; } enum reg_arg_type { @@ -547,6 +550,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: + case PTR_TO_PACKET: + case PTR_TO_PACKET_END: case FRAME_PTR: case CONST_PTR_TO_MAP: return 
true; @@ -646,13 +651,34 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off, return 0; } +#define MAX_PACKET_OFF 0xffff + +static int check_packet_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *reg = &regs[regno]; + + off += reg->off; + if (off < 0 || off + size > reg->range) { + verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, reg->off, reg->range); + return -EACCES; + } + return 0; +} + /* check access to 'struct bpf_context' fields */ static int check_ctx_access(struct verifier_env *env, int off, int size, - enum bpf_access_type t) + enum bpf_access_type t, enum bpf_reg_type *reg_type) { if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t)) + env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { + /* remember the offset of last byte accessed in ctx */ + if (env->prog->aux->max_ctx_offset < off + size) + env->prog->aux->max_ctx_offset = off + size; return 0; + } verbose("invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; @@ -672,6 +698,45 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } +static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, + int off, int size) +{ + if (reg->type != PTR_TO_PACKET) { + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } else { + return 0; + } + } + + switch (env->prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + break; + default: + verbose("verifier is misconfigured\n"); + return -EACCES; + } + + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + /* misaligned access to packet is ok on x86,arm,arm64 */ + return 0; + + if (reg->id && size != 1) { + verbose("Unknown packet alignment. 
Only byte-sized access allowed\n"); + return -EACCES; + } + + /* skb->data is NET_IP_ALIGN-ed */ + if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + verbose("misaligned packet access off %d+%d+%d size %d\n", + NET_IP_ALIGN, reg->off, off, size); + return -EACCES; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -683,21 +748,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, int value_regno) { struct verifier_state *state = &env->cur_state; + struct reg_state *reg = &state->regs[regno]; int size, err = 0; - if (state->regs[regno].type == PTR_TO_STACK) - off += state->regs[regno].imm; + if (reg->type == PTR_TO_STACK) + off += reg->imm; size = bpf_size_to_bytes(bpf_size); if (size < 0) return size; - if (off % size != 0) { - verbose("misaligned access off %d size %d\n", off, size); - return -EACCES; - } + err = check_ptr_alignment(env, reg, off, size); + if (err) + return err; - if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into map\n", value_regno); @@ -707,18 +772,23 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown_value(state->regs, value_regno); - } else if (state->regs[regno].type == PTR_TO_CTX) { + } else if (reg->type == PTR_TO_CTX) { + enum bpf_reg_type reg_type = UNKNOWN_VALUE; + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - err = check_ctx_access(env, off, size, t); - if (!err && t == BPF_READ && value_regno >= 0) + err = check_ctx_access(env, off, size, t, &reg_type); + if (!err && t 
== BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); + if (env->allow_ptr_leaks) + /* note that reg.[id|off|range] == 0 */ + state->regs[value_regno].type = reg_type; + } - } else if (state->regs[regno].type == FRAME_PTR || - state->regs[regno].type == PTR_TO_STACK) { + } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { if (off >= 0 || off < -MAX_BPF_STACK) { verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; @@ -734,11 +804,28 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, } else { err = check_stack_read(state, off, size, value_regno); } + } else if (state->regs[regno].type == PTR_TO_PACKET) { + if (t == BPF_WRITE) { + verbose("cannot write into packet\n"); + return -EACCES; + } + err = check_packet_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); } else { verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[state->regs[regno].type]); + regno, reg_type_str[reg->type]); return -EACCES; } + + if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks && + state->regs[value_regno].type == UNKNOWN_VALUE) { + /* 1 or 2 byte load zero-extends, determine the number of + * zero upper bits. Not doing it fo 4 byte load, since + * such values cannot be added to ptr_to_packet anyway. 
+ */ + state->regs[value_regno].imm = 64 - size * 8; + } return err; } @@ -778,15 +865,25 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, - int regno, int access_size) +static int check_stack_boundary(struct verifier_env *env, int regno, + int access_size, bool zero_size_allowed, + struct bpf_call_arg_meta *meta) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; int off, i; - if (regs[regno].type != PTR_TO_STACK) + if (regs[regno].type != PTR_TO_STACK) { + if (zero_size_allowed && access_size == 0 && + regs[regno].type == CONST_IMM && + regs[regno].imm == 0) + return 0; + + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[regs[regno].type], + reg_type_str[PTR_TO_STACK]); return -EACCES; + } off = regs[regno].imm; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -796,6 +893,12 @@ static int check_stack_boundary(struct verifier_env *env, return -EACCES; } + if (meta && meta->raw_mode) { + meta->access_size = access_size; + meta->regno = regno; + return 0; + } + for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { verbose("invalid indirect read from stack off %d+%d size %d\n", @@ -807,7 +910,8 @@ static int check_stack_boundary(struct verifier_env *env, } static int check_func_arg(struct verifier_env *env, u32 regno, - enum bpf_arg_type arg_type, struct bpf_map **mapp) + enum bpf_arg_type arg_type, + struct bpf_call_arg_meta *meta) { struct reg_state *reg = env->cur_state.regs + regno; enum bpf_reg_type expected_type; @@ -829,15 +933,26 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == 
ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - } else if (arg_type == ARG_CONST_STACK_SIZE) { + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + } else if (arg_type == ARG_PTR_TO_STACK || + arg_type == ARG_PTR_TO_RAW_STACK) { + expected_type = PTR_TO_STACK; + /* One exception here. In case function allows for NULL to be + * passed in as argument, it's a CONST_IMM type. Final test + * happens during stack boundary checking. + */ + if (reg->type == CONST_IMM && reg->imm == 0) + expected_type = CONST_IMM; + meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -851,14 +966,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, if (arg_type == ARG_CONST_MAP_PTR) { /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ - *mapp = reg->map_ptr; - + meta->map_ptr = reg->map_ptr; } else if (arg_type == ARG_PTR_TO_MAP_KEY) { /* bpf_map_xxx(..., map_ptr, ..., key) call: * check that [key, key + map->key_size) are within * stack limits and initialized */ - if (!*mapp) { + if (!meta->map_ptr) { /* in function declaration map_ptr must come before * map_key, so that it's verified and known before * we have to check map_key here. 
Otherwise it means @@ -867,20 +981,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size); - + err = check_stack_boundary(env, regno, meta->map_ptr->key_size, + false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ - if (!*mapp) { + if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size); + err = check_stack_boundary(env, regno, + meta->map_ptr->value_size, + false, NULL); + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); - } else if (arg_type == ARG_CONST_STACK_SIZE) { /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it * note: regno == len, regno - 1 == buf @@ -890,7 +1008,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm); + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed, meta); } return err; @@ -898,24 +1017,93 @@ static int check_func_arg(struct verifier_env *env, u32 regno, static int check_map_func_compatibility(struct bpf_map *map, int func_id) { - bool bool_map, bool_func; - int i; - if (!map) return 0; - for (i = 0; i < ARRAY_SIZE(func_limit); i++) { - bool_map = (map->map_type == func_limit[i].map_type); - bool_func = (func_id == func_limit[i].func_id); - /* only when map & func pair match it can continue. 
- * don't allow any other map type to be passed into - * the special func; - */ - if (bool_func && bool_map != bool_func) - return -EINVAL; + /* We need a two way check, first is from map perspective ... */ + switch (map->map_type) { + case BPF_MAP_TYPE_PROG_ARRAY: + if (func_id != BPF_FUNC_tail_call) + goto error; + break; + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + if (func_id != BPF_FUNC_perf_event_read && + func_id != BPF_FUNC_perf_event_output) + goto error; + break; + case BPF_MAP_TYPE_STACK_TRACE: + if (func_id != BPF_FUNC_get_stackid) + goto error; + break; + default: + break; + } + + /* ... and second from the function itself. */ + switch (func_id) { + case BPF_FUNC_tail_call: + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + goto error; + break; + case BPF_FUNC_perf_event_read: + case BPF_FUNC_perf_event_output: + if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) + goto error; + break; + case BPF_FUNC_get_stackid: + if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) + goto error; + break; + default: + break; } return 0; +error: + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); + return -EINVAL; +} + +static int check_raw_mode(const struct bpf_func_proto *fn) +{ + int count = 0; + + if (fn->arg1_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg2_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg3_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg4_type == ARG_PTR_TO_RAW_STACK) + count++; + if (fn->arg5_type == ARG_PTR_TO_RAW_STACK) + count++; + + return count > 1 ? 
-EINVAL : 0; +} + +static void clear_all_pkt_pointers(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs, *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].type == PTR_TO_PACKET || + regs[i].type == PTR_TO_PACKET_END) + mark_reg_unknown_value(regs, i); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + reg = &state->spilled_regs[i / BPF_REG_SIZE]; + if (reg->type != PTR_TO_PACKET && + reg->type != PTR_TO_PACKET_END) + continue; + reg->type = UNKNOWN_VALUE; + reg->imm = 0; + } } static int check_call(struct verifier_env *env, int func_id) @@ -923,8 +1111,9 @@ static int check_call(struct verifier_env *env, int func_id) struct verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; struct reg_state *regs = state->regs; - struct bpf_map *map = NULL; struct reg_state *reg; + struct bpf_call_arg_meta meta; + bool changes_data; int i, err; /* find function prototype */ @@ -947,23 +1136,45 @@ static int check_call(struct verifier_env *env, int func_id) return -EINVAL; } + changes_data = bpf_helper_changes_skb_data(fn->func); + + memset(&meta, 0, sizeof(meta)); + + /* We only support one arg being in raw mode at the moment, which + * is sufficient for the helper functions we have right now. 
+ */ + err = check_raw_mode(fn); + if (err) { + verbose("kernel subsystem misconfigured func %d\n", func_id); + return err; + } + /* check args */ - err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); if (err) return err; - err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); if (err) return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset + * is inferred from register state. + */ + for (i = 0; i < meta.access_size; i++) { + err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + if (err) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { reg = regs + caller_saved[i]; @@ -982,28 +1193,225 @@ static int check_call(struct verifier_env *env, int func_id) * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ - if (map == NULL) { + if (meta.map_ptr == NULL) { verbose("kernel subsystem misconfigured verifier\n"); return -EINVAL; } - regs[BPF_REG_0].map_ptr = map; + regs[BPF_REG_0].map_ptr = meta.map_ptr; } else { verbose("unknown return type %d of func %d\n", fn->ret_type, func_id); return -EINVAL; } - err = check_map_func_compatibility(map, func_id); + err = check_map_func_compatibility(meta.map_ptr, func_id); if (err) return err; + if (changes_data) + clear_all_pkt_pointers(env); + return 0; +} + +static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) +{ + 
struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = &regs[insn->dst_reg]; + struct reg_state *src_reg = &regs[insn->src_reg]; + struct reg_state tmp_reg; + s32 imm; + + if (BPF_SRC(insn->code) == BPF_K) { + /* pkt_ptr += imm */ + imm = insn->imm; + +add_imm: + if (imm <= 0) { + verbose("addition of negative constant to packet pointer is not allowed\n"); + return -EACCES; + } + if (imm >= MAX_PACKET_OFF || + imm + dst_reg->off >= MAX_PACKET_OFF) { + verbose("constant %d is too large to add to packet pointer\n", + imm); + return -EACCES; + } + /* a constant was added to pkt_ptr. + * Remember it while keeping the same 'id' + */ + dst_reg->off += imm; + } else { + if (src_reg->type == PTR_TO_PACKET) { + /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ + tmp_reg = *dst_reg; /* save r7 state */ + *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */ + src_reg = &tmp_reg; /* pretend it's src_reg state */ + /* if the checks below reject it, the copy won't matter, + * since we're rejecting the whole program. 
If all ok, + * then imm22 state will be added to r7 + * and r7 will be pkt(id=0,off=22,r=62) while + * r6 will stay as pkt(id=0,off=0,r=62) + */ + } + + if (src_reg->type == CONST_IMM) { + /* pkt_ptr += reg where reg is known constant */ + imm = src_reg->imm; + goto add_imm; + } + /* disallow pkt_ptr += reg + * if reg is not uknown_value with guaranteed zero upper bits + * otherwise pkt_ptr may overflow and addition will become + * subtraction which is not allowed + */ + if (src_reg->type != UNKNOWN_VALUE) { + verbose("cannot add '%s' to ptr_to_packet\n", + reg_type_str[src_reg->type]); + return -EACCES; + } + if (src_reg->imm < 48) { + verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n", + src_reg->imm); + return -EACCES; + } + /* dst_reg stays as pkt_ptr type and since some positive + * integer value was added to the pointer, increment its 'id' + */ + dst_reg->id++; + + /* something was added to pkt_ptr, set range and off to zero */ + dst_reg->off = 0; + dst_reg->range = 0; + } + return 0; +} + +static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = &regs[insn->dst_reg]; + u8 opcode = BPF_OP(insn->code); + s64 imm_log2; + + /* for type == UNKNOWN_VALUE: + * imm > 0 -> number of zero upper bits + * imm == 0 -> don't track which is the same as all bits can be non-zero + */ + + if (BPF_SRC(insn->code) == BPF_X) { + struct reg_state *src_reg = &regs[insn->src_reg]; + + if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 && + dst_reg->imm && opcode == BPF_ADD) { + /* dreg += sreg + * where both have zero upper bits. Adding them + * can only result making one more bit non-zero + * in the larger value. + * Ex. 
0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) + * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) + */ + dst_reg->imm = min(dst_reg->imm, src_reg->imm); + dst_reg->imm--; + return 0; + } + if (src_reg->type == CONST_IMM && src_reg->imm > 0 && + dst_reg->imm && opcode == BPF_ADD) { + /* dreg += sreg + * where dreg has zero upper bits and sreg is const. + * Adding them can only result making one more bit + * non-zero in the larger value. + */ + imm_log2 = __ilog2_u64((long long)src_reg->imm); + dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); + dst_reg->imm--; + return 0; + } + /* all other cases non supported yet, just mark dst_reg */ + dst_reg->imm = 0; + return 0; + } + + /* sign extend 32-bit imm into 64-bit to make sure that + * negative values occupy bit 63. Note ilog2() would have + * been incorrect, since sizeof(insn->imm) == 4 + */ + imm_log2 = __ilog2_u64((long long)insn->imm); + + if (dst_reg->imm && opcode == BPF_LSH) { + /* reg <<= imm + * if reg was a result of 2 byte load, then its imm == 48 + * which means that upper 48 bits are zero and shifting this reg + * left by 4 would mean that upper 44 bits are still zero + */ + dst_reg->imm -= insn->imm; + } else if (dst_reg->imm && opcode == BPF_MUL) { + /* reg *= imm + * if multiplying by 14 subtract 4 + * This is conservative calculation of upper zero bits. 
+ * It's not trying to special case insn->imm == 1 or 0 cases + */ + dst_reg->imm -= imm_log2 + 1; + } else if (opcode == BPF_AND) { + /* reg &= imm */ + dst_reg->imm = 63 - imm_log2; + } else if (dst_reg->imm && opcode == BPF_ADD) { + /* reg += imm */ + dst_reg->imm = min(dst_reg->imm, 63 - imm_log2); + dst_reg->imm--; + } else if (opcode == BPF_RSH) { + /* reg >>= imm + * which means that after right shift, upper bits will be zero + * note that verifier already checked that + * 0 <= imm < 64 for shift insn + */ + dst_reg->imm += insn->imm; + if (unlikely(dst_reg->imm > 64)) + /* some dumb code did: + * r2 = *(u32 *)mem; + * r2 >>= 32; + * and all bits are zero now */ + dst_reg->imm = 64; + } else { + /* all other alu ops, means that we don't know what will + * happen to the value, mark it with unknown number of zero bits + */ + dst_reg->imm = 0; + } + + if (dst_reg->imm < 0) { + /* all 64 bits of the register can contain non-zero bits + * and such value cannot be added to ptr_to_packet, since it + * may overflow, mark it as unknown to avoid further eval + */ + dst_reg->imm = 0; + } + return 0; +} + +static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + struct reg_state *dst_reg = ®s[insn->dst_reg]; + struct reg_state *src_reg = ®s[insn->src_reg]; + u8 opcode = BPF_OP(insn->code); + + /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. 
+ * Don't care about overflow or negative values, just add them + */ + if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) + dst_reg->imm += insn->imm; + else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) + dst_reg->imm += src_reg->imm; + else + mark_reg_unknown_value(regs, insn->dst_reg); return 0; } /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) { - struct reg_state *regs = env->cur_state.regs; + struct reg_state *regs = env->cur_state.regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -1092,8 +1500,6 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } else { /* all other ALU ops: and, sub, xor, add, ... */ - bool stack_relative = false; - if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { verbose("BPF_ALU uses reserved fields\n"); @@ -1131,11 +1537,36 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } } + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + dst_reg = &regs[insn->dst_reg]; + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - regs[insn->dst_reg].type == FRAME_PTR && - BPF_SRC(insn->code) == BPF_K) { - stack_relative = true; + dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) { + dst_reg->type = PTR_TO_STACK; + dst_reg->imm = insn->imm; + return 0; + } else if (opcode == BPF_ADD && + BPF_CLASS(insn->code) == BPF_ALU64 && + (dst_reg->type == PTR_TO_PACKET || + (BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == PTR_TO_PACKET))) { + /* ptr_to_packet += K|X */ + return check_packet_ptr_add(env, insn); + } else if (BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == UNKNOWN_VALUE && + env->allow_ptr_leaks) { + /* unknown += K|X */ + return evaluate_reg_alu(env, insn); + } else if 
(BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == CONST_IMM && + env->allow_ptr_leaks) { + /* reg_imm += K|X */ + return evaluate_reg_imm_alu(env, insn); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer arithmetic prohibited\n", insn->dst_reg); @@ -1147,24 +1578,45 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EACCES; } - /* check dest operand */ - err = check_reg_arg(regs, insn->dst_reg, DST_OP); - if (err) - return err; - - if (stack_relative) { - regs[insn->dst_reg].type = PTR_TO_STACK; - regs[insn->dst_reg].imm = insn->imm; - } + /* mark dest operand */ + mark_reg_unknown_value(regs, insn->dst_reg); } return 0; } +static void find_good_pkt_pointers(struct verifier_env *env, + struct reg_state *dst_reg) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs, *reg; + int i; + /* r2 = r3; + * r2 += 8 + * if (r2 > pkt_end) goto somewhere + * r2 == dst_reg, pkt_end == src_reg, + * r2=pkt(id=n,off=8,r=0) + * r3=pkt(id=n,off=0,r=0) + * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) + * so that range of bytes [r3, r3 + 8) is safe to access + */ + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + regs[i].range = dst_reg->off; + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + reg = &state->spilled_regs[i / BPF_REG_SIZE]; + if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + reg->range = dst_reg->off; + } +} + static int check_cond_jmp_op(struct verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct reg_state *regs = env->cur_state.regs; + struct reg_state *regs = env->cur_state.regs, *dst_reg; struct verifier_state *other_branch; u8 opcode = BPF_OP(insn->code); int err; @@ -1202,11 +1654,12 @@ static int check_cond_jmp_op(struct verifier_env *env, if (err) return err; + dst_reg = &regs[insn->dst_reg]; + /* detect if 
R == 0 where R was initialized to zero earlier */ if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && - regs[insn->dst_reg].type == CONST_IMM && - regs[insn->dst_reg].imm == insn->imm) { + dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) { if (opcode == BPF_JEQ) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through @@ -1228,44 +1681,30 @@ static int check_cond_jmp_op(struct verifier_env *env, /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && - insn->imm == 0 && (opcode == BPF_JEQ || - opcode == BPF_JNE) && - regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && + dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (opcode == BPF_JEQ) { /* next fallthrough insn can access memory via * this register */ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; /* branch targer cannot access it, since reg == 0 */ - other_branch->regs[insn->dst_reg].type = CONST_IMM; - other_branch->regs[insn->dst_reg].imm = 0; + mark_reg_unknown_value(other_branch->regs, + insn->dst_reg); } else { other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = 0; + mark_reg_unknown_value(regs, insn->dst_reg); } + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { + find_good_pkt_pointers(env, dst_reg); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; - } else if (BPF_SRC(insn->code) == BPF_K && - (opcode == BPF_JEQ || opcode == BPF_JNE)) { - - if (opcode == BPF_JEQ) { - /* detect if (R == imm) goto - * and in the target state recognize that R = imm - */ - other_branch->regs[insn->dst_reg].type = CONST_IMM; - other_branch->regs[insn->dst_reg].imm = insn->imm; - } else { - /* 
detect if (R != imm) goto - * and in the fall-through state recognize that R = imm - */ - regs[insn->dst_reg].type = CONST_IMM; - regs[insn->dst_reg].imm = insn->imm; - } } if (log_level) - print_verifier_state(env); + print_verifier_state(&env->cur_state); return 0; } @@ -1343,13 +1782,14 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); + verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_ABS uses reserved fields\n"); + verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -1513,6 +1953,8 @@ peek_stack: goto peek_stack; else if (ret < 0) goto err_free; + if (t + 1 < insn_cnt) + env->explored_states[t + 1] = STATE_LIST_MARK; } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -1580,6 +2022,58 @@ err_free: return ret; } +/* the following conditions reduce the number of explored insns + * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet + */ +static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur) +{ + if (old->id != cur->id) + return false; + + /* old ptr_to_packet is more conservative, since it allows smaller + * range. Ex: + * old(off=0,r=10) is equal to cur(off=0,r=20), because + * old(off=0,r=10) means that with range=10 the verifier proceeded + * further and found no issues with the program. Now we're in the same + * spot with cur(off=0,r=20), so we're safe too, since anything further + * will only be looking at most 10 bytes after this pointer. 
+ */ + if (old->off == cur->off && old->range < cur->range) + return true; + + /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0) + * since both cannot be used for packet access and safe(old) + * pointer has smaller off that could be used for further + * 'if (ptr > data_end)' check + * Ex: + * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean + * that we cannot access the packet. + * The safe range is: + * [ptr, ptr + range - off) + * so whenever off >=range, it means no safe bytes from this pointer. + * When comparing old->off <= cur->off, it means that older code + * went with smaller offset and that offset was later + * used to figure out the safe range after 'if (ptr > data_end)' check + * Say, 'old' state was explored like: + * ... R3(off=0, r=0) + * R4 = R3 + 20 + * ... now R4(off=20,r=0) <-- here + * if (R4 > data_end) + * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access. + * ... the code further went all the way to bpf_exit. + * Now the 'cur' state at the mark 'here' has R4(off=30,r=0). + * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier + * goes further, such cur_R4 will give larger safe packet range after + * 'if (R4 > data_end)' and all further insn were already good with r=20, + * so they will be good with r=30 and we can prune the search. 
+ */ + if (old->off <= cur->off && + old->off >= old->range && cur->off >= cur->range) + return true; + + return false; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -1608,17 +2102,25 @@ err_free: */ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) { + struct reg_state *rold, *rcur; int i; for (i = 0; i < MAX_BPF_REG; i++) { - if (memcmp(&old->regs[i], &cur->regs[i], - sizeof(old->regs[0])) != 0) { - if (old->regs[i].type == NOT_INIT || - (old->regs[i].type == UNKNOWN_VALUE && - cur->regs[i].type != NOT_INIT)) - continue; - return false; - } + rold = &old->regs[i]; + rcur = &cur->regs[i]; + + if (memcmp(rold, rcur, sizeof(*rold)) == 0) + continue; + + if (rold->type == NOT_INIT || + (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) + continue; + + if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && + compare_ptrs_to_packet(rold, rcur)) + continue; + + return false; } for (i = 0; i < MAX_BPF_STACK; i++) { @@ -1717,7 +2219,7 @@ static int do_check(struct verifier_env *env) insn = &insns[insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > 32768) { + if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose("BPF program is too large. 
Proccessed %d insn\n", insn_processed); return -E2BIG; @@ -1740,7 +2242,7 @@ static int do_check(struct verifier_env *env) if (log_level && do_print_state) { verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env); + print_verifier_state(&env->cur_state); do_print_state = false; } @@ -1952,6 +2454,7 @@ process_bpf_exit: insn_idx++; } + verbose("processed %d insns\n", insn_processed); return 0; } @@ -2003,7 +2506,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); - fdput(f); return PTR_ERR(map); } @@ -2023,15 +2525,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) return -E2BIG; } - /* remember this map */ - env->used_maps[env->used_map_cnt++] = map; - /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - bpf_map_inc(map, false); + map = bpf_map_inc(map, false); + if (IS_ERR(map)) { + fdput(f); + return PTR_ERR(map); + } + env->used_maps[env->used_map_cnt++] = map; + fdput(f); next_insn: insn++; @@ -2067,26 +2572,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } -static void adjust_branches(struct bpf_prog *prog, int pos, int delta) -{ - struct bpf_insn *insn = prog->insnsi; - int insn_cnt = prog->len; - int i; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (BPF_CLASS(insn->code) != BPF_JMP || - BPF_OP(insn->code) == BPF_CALL || - BPF_OP(insn->code) == BPF_EXIT) - continue; - - /* adjust offset of jmps if necessary */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; - } -} - /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ 
@@ -2096,14 +2581,15 @@ static int convert_ctx_accesses(struct verifier_env *env) int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; - u32 cnt; - int i; enum bpf_access_type type; + int i; if (!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { + u32 insn_delta, cnt; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) type = BPF_READ; else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) @@ -2125,34 +2611,18 @@ static int convert_ctx_accesses(struct verifier_env *env) return -EINVAL; } - if (cnt == 1) { - memcpy(insn, insn_buf, sizeof(*insn)); - continue; - } - - /* several new insns need to be inserted. Make room for them */ - insn_cnt += cnt - 1; - new_prog = bpf_prog_realloc(env->prog, - bpf_prog_size(insn_cnt), - GFP_USER); + new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt); if (!new_prog) return -ENOMEM; - new_prog->len = insn_cnt; - - memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, - sizeof(*insn) * (insn_cnt - i - cnt)); - - /* copy substitute insns in place of load instruction */ - memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); - - /* adjust branches in the whole program */ - adjust_branches(new_prog, i, cnt - 1); + insn_delta = cnt - 1; /* keep walking new program and skip insns we just inserted */ env->prog = new_prog; - insn = new_prog->insnsi + i + cnt - 1; - i += cnt - 1; + insn = new_prog->insnsi + i + insn_delta; + + insn_cnt += insn_delta; + i += insn_delta; } return 0; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d27904c193da..75c0ff00aca6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -59,6 +59,9 @@ #include <linux/delay.h> #include <linux/atomic.h> #include <linux/cpuset.h> +#include <linux/proc_ns.h> +#include <linux/nsproxy.h> +#include <linux/proc_ns.h> #include <net/sock.h> /* @@ -178,10 +181,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first 
time. This is for backward compatibility. */ -static bool cgrp_dfl_root_visible; +static bool cgrp_dfl_visible; + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ -static unsigned long cgrp_dfl_root_inhibit_ss_mask; +static u16 cgrp_dfl_inhibit_ss_mask; + +/* some controllers are implicitly enabled on the default hierarchy */ +static unsigned long cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ @@ -205,23 +214,34 @@ static u64 css_serial_nr_next = 1; * fork/exit handlers to call. This avoids us having to do extra work in the * fork/exit path to check which subsystems have fork/exit callbacks. */ -static unsigned long have_fork_callback __read_mostly; -static unsigned long have_exit_callback __read_mostly; -static unsigned long have_free_callback __read_mostly; +static u16 have_fork_callback __read_mostly; +static u16 have_exit_callback __read_mostly; +static u16 have_free_callback __read_mostly; + +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; /* Ditto for the can_fork callback. 
*/ -static unsigned long have_canfork_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +static int cgroup_apply_control(struct cgroup *cgrp); +static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible); +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, @@ -238,9 +258,17 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ static bool cgroup_ssid_enabled(int ssid) { + if (CGROUP_SUBSYS_COUNT == 0) + return false; + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -339,6 +367,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } +/* subsystems visibly enabled on a cgroup */ +static u16 cgroup_control(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + u16 root_ss_mask = cgrp->root->subsys_mask; + + if (parent) + return parent->subtree_control; + + if (cgroup_on_dfl(cgrp)) + root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | + cgrp_dfl_implicit_ss_mask); + return root_ss_mask; +} + +/* subsystems enabled on 
a cgroup */ +static u16 cgroup_ss_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + if (parent) + return parent->subtree_ss_mask; + + return cgrp->root->subsys_mask; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -378,16 +432,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!ss) return &cgrp->self; - if (!(cgrp->root->subsys_mask & (1 << ss->id))) - return NULL; - /* * This function is used while updating css associations and thus - * can't test the csses directly. Use ->child_subsys_mask. + * can't test the csses directly. Test ss_mask. */ - while (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); + if (!cgrp) + return NULL; + } return cgroup_css(cgrp, ss); } @@ -506,22 +559,28 @@ static int notify_on_release(const struct cgroup *cgrp) (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /** - * for_each_subsys_which - filter for_each_subsys with a bitmask + * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * @ss_maskp: a pointer to the bitmask + * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of - * mask is set to 1. + * @ss_mask is set. 
*/ -#define for_each_subsys_which(ss, ssid, ss_maskp) \ - if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ +#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ + unsigned long __ss_mask = (ss_mask); \ + if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ (ssid) = 0; \ - else \ - for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ - if (((ss) = cgroup_subsys[ssid]) && false) \ - break; \ - else + break; \ + } \ + for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ + (ss) = cgroup_subsys[ssid]; \ + { + +#define while_each_subsys_mask() \ + } \ + } \ +} while (false) /* iterate across the hierarchies */ #define for_each_root(root) \ @@ -535,6 +594,24 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else +/* walk live descendants in preorder */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + static void cgroup_release_agent(struct work_struct *work); static void check_for_release(struct cgroup *cgrp); @@ -665,6 +742,9 @@ static void css_set_move_task(struct task_struct *task, { lockdep_assert_held(&css_set_lock); + if (to_cset && !css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + if (from_cset) { struct css_task_iter *it, *pos; @@ -698,8 +778,6 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - if (!css_set_populated(to_cset)) - css_set_update_populated(to_cset, true); rcu_assign_pointer(task->cgroups, to_cset); list_add_tail(&task->cg_list, 
use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); @@ -759,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset) static void put_css_set(struct css_set *cset) { + unsigned long flags; + /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -767,9 +847,9 @@ static void put_css_set(struct css_set *cset) if (atomic_add_unless(&cset->refcount, -1, 1)) return; - spin_lock_bh(&css_set_lock); + spin_lock_irqsave(&css_set_lock, flags); put_css_set_locked(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, flags); } /* @@ -992,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (cset) return cset; @@ -1024,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. 
*/ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1050,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_get(css); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return cset; } @@ -1102,19 +1182,19 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's * root cgroup */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); @@ -1122,7 +1202,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) kfree(link); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -1137,6 +1217,41 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * look up cgroup associated with current task's cgroup namespace on the + * specified hierarchy + */ +static struct cgroup * +current_cgns_cgroup_from_root(struct cgroup_root *root) +{ + struct cgroup *res = NULL; + struct css_set *cset; + + lockdep_assert_held(&css_set_lock); + + rcu_read_lock(); + + cset = current->nsproxy->cgroup_ns->root_cset; + if (cset == &init_css_set) { + res = &root->cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res = c; + break; + } + } + } + rcu_read_unlock(); + + BUG_ON(!res); 
+ return res; +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) @@ -1248,46 +1363,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } /** - * cgroup_calc_child_subsys_mask - calculate child_subsys_mask - * @cgrp: the target cgroup + * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider + * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if - * @subtree_control is to be applied to @cgrp. The returned mask is always - * a superset of @subtree_control and follows the usual hierarchy rules. + * @subtree_control is to be applied while restricted to @this_ss_mask. */ -static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, - unsigned long subtree_control) +static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { - struct cgroup *parent = cgroup_parent(cgrp); - unsigned long cur_ss_mask = subtree_control; + u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) - return cur_ss_mask; + cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - unsigned long new_ss_mask = cur_ss_mask; + u16 new_ss_mask = cur_ss_mask; - for_each_subsys_which(ss, ssid, &cur_ss_mask) + do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; + } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. 
*/ - if (parent) - new_ss_mask &= parent->child_subsys_mask; - else - new_ss_mask &= cgrp->root->subsys_mask; + new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; @@ -1298,19 +1407,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask - * @cgrp: the target cgroup - * - * Update @cgrp->child_subsys_mask according to the current - * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). - */ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) -{ - cgrp->child_subsys_mask = - cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); -} - -/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1338,19 +1434,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced + * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a - * matching cgroup_kn_unlock() invocation. + * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the + * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. 
*/ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) { struct cgroup *cgrp; @@ -1369,7 +1468,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) return NULL; kernfs_break_active_protection(kn); - mutex_lock(&cgroup_mutex); + if (drain_offline) + cgroup_lock_and_drain_offline(cgrp); + else + mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) return cgrp; @@ -1399,14 +1501,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) /** * css_clear_dir - remove subsys files in a cgroup directory * @css: taget css - * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void css_clear_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static void css_clear_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts; + if (!(css->flags & CSS_VISIBLE)) + return; + + css->flags &= ~CSS_VISIBLE; + list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } @@ -1414,17 +1519,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css, /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css - * @cgrp_overried: specify if target cgroup is different from css->cgroup * * On failure, no file is added. 
*/ -static int css_populate_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static int css_populate_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; + if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + return 0; + if (!css->ss) { if (cgroup_on_dfl(cgrp)) cfts = cgroup_dfl_base_files; @@ -1441,6 +1547,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, goto err; } } + + css->flags |= CSS_VISIBLE; + return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { @@ -1451,67 +1560,30 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - unsigned long tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); - for_each_subsys_which(ss, ssid, &ss_mask) { - /* if @ss has non-root csses attached to it, can't move */ - if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) + do_each_subsys_mask(ss, ssid, ss_mask) { + /* + * If @ss has non-root csses attached to it, can't move. + * If @ss is an implicit controller, it is exempt from this + * rule and can be stolen. 
+ */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && + !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; - } - - /* skip creating root files on dfl_root for inhibited subsystems */ - tmp_ss_mask = ss_mask; - if (dst_root == &cgrp_dfl_root) - tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { - struct cgroup *scgrp = &ss->root->cgrp; - int tssid; + } while_each_subsys_mask(); - ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); - if (!ret) - continue; - - /* - * Rebinding back to the default root is not allowed to - * fail. Using both default and non-default roots should - * be rare. Moving subsystems back and forth even more so. - * Just warn about it and continue. - */ - if (dst_root == &cgrp_dfl_root) { - if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); - } - continue; - } - - for_each_subsys_which(ss, tssid, &tmp_ss_mask) { - if (tssid == ssid) - break; - css_clear_dir(cgroup_css(scgrp, ss), dcgrp); - } - return ret; - } - - /* - * Nothing can fail from this point on. Remove files for the - * removed subsystems and rebind each subsystem. 
- */ - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); @@ -1519,22 +1591,22 @@ static int rebind_subsystems(struct cgroup_root *dst_root, WARN_ON(!css || cgroup_css(dcgrp, ss)); - css_clear_dir(css, NULL); + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; css->cgroup = dcgrp; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); - spin_unlock_bh(&css_set_lock); - - src_root->subsys_mask &= ~(1 << ssid); - scgrp->subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(scgrp); + spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; @@ -1542,18 +1614,49 @@ static int rebind_subsystems(struct cgroup_root *dst_root, static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } + ret = cgroup_apply_control(dcgrp); + if (ret) + pr_warn("partial failure to rebind %s controller (err=%d)\n", + ss->name, ret); + if (ss->bind) ss->bind(css); - } + } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } +static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, + struct kernfs_root *kf_root) +{ + int len = 0; + char *buf = NULL; + struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); + struct cgroup *ns_cgroup; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + 
spin_lock_irq(&css_set_lock); + ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); + len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); + spin_unlock_irq(&css_set_lock); + + if (len >= PATH_MAX) + len = -ERANGE; + else if (len > 0) { + seq_escape(sf, buf, " \t\n\\"); + len = 0; + } + kfree(buf); + return len; +} + static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { @@ -1584,7 +1687,7 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; + u16 subsys_mask; unsigned int flags; char *release_agent; bool cpuset_clone_children; @@ -1597,13 +1700,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = -1UL; + u16 mask = U16_MAX; struct cgroup_subsys *ss; int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS - mask = ~(1U << cpuset_cgrp_id); + mask = ~((u16)1 << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1678,6 +1781,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; + if (cgroup_ssid_no_v1(i)) + continue; /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) @@ -1698,7 +1803,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) opts->subsys_mask |= (1 << i); /* @@ -1728,14 +1833,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + u16 added_mask, removed_mask; if (root == &cgrp_dfl_root) { pr_err("remount is not allowed\n"); return -EINVAL; } - mutex_lock(&cgroup_mutex); + 
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1768,7 +1873,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgrp_dfl_root, removed_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (opts.release_agent) { spin_lock(&release_agent_path_lock); @@ -1794,7 +1899,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); if (use_task_css_set_links) goto out_unlock; @@ -1819,8 +1924,12 @@ static void cgroup_enable_task_cg_lists(void) * entry won't be deleted though the process has exited. * Do it while holding siglock so that we don't end up * racing against cgroup_exit(). + * + * Interrupts were already disabled while acquiring + * the css_set_lock, so we do not need to disable it + * again when acquiring the sighand->siglock here. 
*/ - spin_lock_irq(&p->sighand->siglock); + spin_lock(&p->sighand->siglock); if (!(p->flags & PF_EXITING)) { struct css_set *cset = task_css_set(p); @@ -1829,11 +1938,11 @@ static void cgroup_enable_task_cg_lists(void) list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); } - spin_unlock_irq(&p->sighand->siglock); + spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1876,7 +1985,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1899,10 +2008,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding - * cgroup_lock, and that's us. The worst that can happen is that we - * have some link structures left over + * cgroup_lock, and that's us. Later rebinding may disable + * controllers on the default hierarchy and thus create new csets, + * which can't be more than the existing ones. Allocate 2x. 
*/ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; @@ -1919,7 +2029,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = css_populate_dir(&root_cgrp->self, NULL); + ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; @@ -1939,13 +2049,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) * Link the root cgroup in this hierarchy into all the css_set * objects. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -1972,6 +2082,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; @@ -1980,6 +2091,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int i; bool new_sb; + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + /* * The first time anyone tries to mount a cgroup, enable the list * linking each css_set to its tasks and fix up all existing tasks. 
@@ -1990,15 +2109,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (is_v2) { if (data) { pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); return ERR_PTR(-EINVAL); } - cgrp_dfl_root_visible = true; + cgrp_dfl_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); goto out_mount; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); @@ -2095,6 +2215,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* + * We know this subsystem has not yet been bound. Users in a non-init + * user namespace may only mount hierarchies with no bound subsystems, + * i.e. 'none,name=user1' + */ + if (!opts.none && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_unlock; + } + root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) { ret = -ENOMEM; @@ -2113,12 +2243,37 @@ out_free: kfree(opts.release_agent); kfree(opts.name); - if (ret) + if (ret) { + put_cgroup_ns(ns); return ERR_PTR(ret); + } out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. 
+ */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2131,6 +2286,7 @@ out_mount: deactivate_super(pinned_sb); } + put_cgroup_ns(ns); return dentry; } @@ -2159,14 +2315,45 @@ static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; +static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + int ret; + + ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); + if (ret < 0 || ret >= buflen) + return NULL; + return buf; +} + +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + char *ret; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(cgroup_path_ns); + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2188,20 +2375,20 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); if (root) { cgrp = task_cgroup_from_root(task, root); - path 
= cgroup_path(cgrp, buf, buflen); + path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ if (strlcpy(buf, "/", buflen) < buflen) path = buf; } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return path; } @@ -2338,45 +2525,45 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, } /** - * cgroup_taskset_migrate - migrate a taskset to a cgroup + * cgroup_taskset_migrate - migrate a taskset * @tset: taget taskset - * @dst_cgrp: destination cgroup + * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the - * ->can_attach callbacks fails and guarantees that either all or none of - * the tasks in @tset are migrated. @tset is consumed regardless of - * success. + * Migrate tasks in @tset as setup by migration preparation functions. + * This function fails iff one of the ->can_attach callbacks fails and + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success. 
*/ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup *dst_cgrp) + struct cgroup_root *root) { - struct cgroup_subsys_state *css, *failed_css = NULL; + struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; - int i, ret; + int ssid, failed_ssid, ret; /* methods shouldn't be called if no task is actually migrating */ if (list_empty(&tset->src_csets)) return 0; /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->can_attach) { - tset->ssid = i; - ret = css->ss->can_attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); if (ret) { - failed_css = css; + failed_ssid = ssid; goto out_cancel_attach; } } - } + } while_each_subsys_mask(); /* * Now that we're guaranteed success, proceed to move all tasks to * the new cgroup. There are no failure cases after here, so this * is the commit point. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); @@ -2387,7 +2574,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, put_css_set_locked(from_cset); } } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. 
@@ -2396,37 +2583,51 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->attach) { - tset->ssid = i; - css->ss->attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); } - } + } while_each_subsys_mask(); ret = 0; goto out_release_tset; out_cancel_attach: - for_each_e_css(css, i, dst_cgrp) { - if (css == failed_css) + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) break; - if (css->ss->cancel_attach) { - tset->ssid = i; - css->ss->cancel_attach(tset); + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); } - } + } while_each_subsys_mask(); out_release_tset: - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return ret; } /** + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * @dst_cgrp: destination cgroup to test + * + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. 
+ */ +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +{ + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; +} + +/** * cgroup_migrate_finish - cleanup after attach * @preloaded_csets: list of preloaded css_sets * @@ -2439,14 +2640,15 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) lockdep_assert_held(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /** @@ -2474,58 +2676,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); + /* + * If ->dead, @src_set is associated with one or more dead cgroups + * and doesn't contain any migratable tasks. Ignore it early so + * that the rest of migration path doesn't get confused by it. + */ + if (src_cset->dead) + return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); if (!list_empty(&src_cset->mg_preload_node)) return; WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; + src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add(&src_cset->mg_preload_node, preloaded_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * - * Tasks are about to be moved to @dst_cgrp and all the source css_sets - * have been preloaded to @preloaded_csets. 
This function looks up and - * pins all destination css_sets, links each to its source, and append them - * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each - * source css_set is assumed to be its cgroup on the default hierarchy. + * Tasks are about to be moved and all the source css_sets have been + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @preloaded_csets. */ -static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); - /* - * Except for the root, child_subsys_mask must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. 
- */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->child_subsys_mask) - return -EBUSY; - /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, - dst_cgrp ?: src_cset->dfl_cgrp); + dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) goto err; @@ -2538,6 +2738,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; + src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); put_css_set(src_cset); put_css_set(dst_cset); @@ -2563,11 +2764,11 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @cgrp: the destination cgroup + * @root: cgroup root migration is taking place on * - * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The - * caller is also responsible for invoking cgroup_migrate_add_src() and + * Migrate a process or task denoted by @leader. If migrating a process, + * the caller must be holding cgroup_threadgroup_rwsem. The caller is also + * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * @@ -2578,7 +2779,7 @@ err: * actually starting migrating. 
*/ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup *cgrp) + struct cgroup_root *root) { struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; @@ -2588,7 +2789,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2597,9 +2798,9 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, break; } while_each_thread(leader, task); rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); - return cgroup_taskset_migrate(&tset, cgrp); + return cgroup_taskset_migrate(&tset, root); } /** @@ -2617,8 +2818,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; + /* look up all src csets */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2628,12 +2832,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, break; } while_each_thread(leader, task); rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); return ret; @@ -2661,9 +2865,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *cgrp; struct inode *inode; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); while 
(!cgroup_is_descendant(dst_cgrp, cgrp)) cgrp = cgroup_parent(cgrp); @@ -2689,14 +2893,15 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct task_struct *tsk; + struct cgroup_subsys *ss; struct cgroup *cgrp; pid_t pid; - int ret; + int ssid, ret; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -2739,8 +2944,10 @@ out_unlock_rcu: rcu_read_unlock(); out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); + for_each_subsys(ss, ssid) + if (ss->post_attach) + ss->post_attach(); cgroup_kn_unlock(of->kn); - cpuset_post_attach_flush(); return ret ?: nbytes; } @@ -2761,9 +2968,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (root == &cgrp_dfl_root) continue; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2794,7 +3001,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); @@ -2822,38 +3029,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_printf(seq, "%s", ss->name); printed = true; - } + } while_each_subsys_mask(); if (printed) 
seq_putc(seq, '\n'); } -/* show controllers which are currently attached to the default hierarchy */ -static int cgroup_root_controllers_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & - ~cgrp_dfl_root_inhibit_ss_mask); - return 0; -} - /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); + cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } @@ -2870,16 +3067,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * - * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) - * css associations need to be updated accordingly. This function looks up - * all css_sets which are attached to the subtree, creates the matching - * updated css_sets and migrates the tasks to the new ones. + * @cgrp's control masks have changed and its subtree's css associations + * need to be updated accordingly. This function looks up all css_sets + * which are attached to the subtree, creates the matching updated css_sets + * and migrates the tasks to the new ones. 
*/ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *d_css; + struct cgroup *dsct; struct css_set *src_cset; int ret; @@ -2888,26 +3086,22 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) percpu_down_write(&cgroup_threadgroup_rwsem); /* look up all csses currently attached to @cgrp's subtree */ - spin_lock_bh(&css_set_lock); - css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + spin_lock_irq(&css_set_lock); + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; - /* self is not affected by child_subsys_mask change */ - if (css->cgroup == cgrp) - continue; - - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, cgrp, + list_for_each_entry(link, &dsct->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { struct task_struct *task, *ntask; @@ -2919,22 +3113,274 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_taskset_add(task, &tset); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp); + ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } +/** + * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses + * @cgrp: root of the target 
subtree + * + * Because css offlining is asynchronous, userland may try to re-enable a + * controller while the previous css is still around. This function grabs + * cgroup_mutex and drains the previous css instances of @cgrp's subtree. + */ +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) + __acquires(&cgroup_mutex) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + +restart: + mutex_lock(&cgroup_mutex); + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + DEFINE_WAIT(wait); + + if (!css || !percpu_ref_is_dying(&css->refcnt)) + continue; + + cgroup_get(dsct); + prepare_to_wait(&dsct->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&dsct->offline_waitq, &wait); + + cgroup_put(dsct); + goto restart; + } + } +} + +/** + * cgroup_save_control - save control masks of a subtree + * @cgrp: root of the target subtree + * + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_save_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->old_subtree_control = dsct->subtree_control; + dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + } +} + +/** + * cgroup_propagate_control - refresh control masks of a subtree + * @cgrp: root of the target subtree + * + * For @cgrp and its subtree, ensure ->subtree_ss_mask matches + * ->subtree_control and propagate controller availability through the + * subtree so that descendants don't have unavailable controllers enabled. 
+ */ +static void cgroup_propagate_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->subtree_control &= cgroup_control(dsct); + dsct->subtree_ss_mask = + cgroup_calc_subtree_ss_mask(dsct->subtree_control, + cgroup_ss_mask(dsct)); + } +} + +/** + * cgroup_restore_control - restore control masks of a subtree + * @cgrp: root of the target subtree + * + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_restore_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + dsct->subtree_control = dsct->old_subtree_control; + dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + } +} + +static bool css_visible(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + if (cgroup_control(cgrp) & (1 << ss->id)) + return true; + if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) + return false; + return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; +} + +/** + * cgroup_apply_control_enable - enable or show csses according to control + * @cgrp: root of the target subtree + * + * Walk @cgrp's subtree and create new csses or make the existing ones + * visible. A css is created invisible if it's being implicitly enabled + * through dependency. An invisible css is made visible when the userland + * explicitly enables it. + * + * Returns 0 on success, -errno on failure. On failure, csses which have + * been processed already aren't cleaned up. The caller is responsible for + * cleaning up with cgroup_apply_control_disable(). 
+ */ +static int cgroup_apply_control_enable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid, ret; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) + continue; + + if (!css) { + css = css_create(dsct, ss); + if (IS_ERR(css)) + return PTR_ERR(css); + } + + if (css_visible(css)) { + ret = css_populate_dir(css); + if (ret) + return ret; + } + } + } + + return 0; +} + +/** + * cgroup_apply_control_disable - kill or hide csses according to control + * @cgrp: root of the target subtree + * + * Walk @cgrp's subtree and kill and hide csses so that they match + * cgroup_ss_mask() and cgroup_visible_mask(). + * + * A css is hidden when the userland requests it to be disabled while other + * subsystems are still depending on it. The css must not actively control + * resources and be in the vanilla state if it's made visible again later. + * Controllers which may be depended upon should provide ->css_reset() for + * this purpose. 
+ */ +static void cgroup_apply_control_disable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + + if (!css) + continue; + + if (css->parent && + !(cgroup_ss_mask(dsct) & (1 << ss->id))) { + kill_css(css); + } else if (!css_visible(css)) { + css_clear_dir(css); + if (ss->css_reset) + ss->css_reset(css); + } + } + } +} + +/** + * cgroup_apply_control - apply control mask updates to the subtree + * @cgrp: root of the target subtree + * + * Subsystems can be enabled and disabled in a subtree using the following + * steps. + * + * 1. Call cgroup_save_control() to stash the current state. + * 2. Update ->subtree_control masks in the subtree as desired. + * 3. Call cgroup_apply_control() to apply the changes. + * 4. Optionally perform other related operations. + * 5. Call cgroup_finalize_control() to finish up. + * + * This function implements step 3 and propagates the mask changes + * throughout @cgrp's subtree, updates csses accordingly and performs + * process migrations. + */ +static int cgroup_apply_control(struct cgroup *cgrp) +{ + int ret; + + cgroup_propagate_control(cgrp); + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + return ret; + + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + return ret; + + return 0; +} + +/** + * cgroup_finalize_control - finalize control mask update + * @cgrp: root of the target subtree + * @ret: the result of the update + * + * Finalize control mask update. See cgroup_apply_control() for more info. 
+ */ +static void cgroup_finalize_control(struct cgroup *cgrp, int ret) +{ + if (ret) { + cgroup_restore_control(cgrp); + cgroup_propagate_control(cgrp); + } + + cgroup_apply_control_disable(cgrp); +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - unsigned long enable = 0, disable = 0; - unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; + u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2946,11 +3392,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { - unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; - if (tok[0] == '\0') continue; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; @@ -2965,12 +3409,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } break; - } + } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; @@ -2981,10 +3425,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, continue; } - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } @@ -3018,150 +3459,21 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - /* - * Update subsys masks and calculate what needs to be done. 
More - * subsystems than specified may need to be enabled or disabled - * depending on subsystem dependencies. - */ - old_sc = cgrp->subtree_control; - old_ss = cgrp->child_subsys_mask; - new_sc = (old_sc | enable) & ~disable; - new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); - - css_enable = ~old_ss & new_ss; - css_disable = old_ss & ~new_ss; - enable |= css_enable; - disable |= css_disable; - - /* - * Because css offlining is asynchronous, userland might try to - * re-enable the same controller while the previous instance is - * still around. In such cases, wait till it's gone using - * offline_waitq. - */ - for_each_subsys_which(ss, ssid, &css_enable) { - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } - } - - cgrp->subtree_control = new_sc; - cgrp->child_subsys_mask = new_ss; - - /* - * Create new csses or make the existing ones visible. A css is - * created invisible if it's being implicitly enabled through - * dependency. An invisible css is made visible when the userland - * explicitly enables it. - */ - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - if (css_enable & (1 << ssid)) - ret = create_css(child, ss, - cgrp->subtree_control & (1 << ssid)); - else - ret = css_populate_dir(cgroup_css(child, ss), - NULL); - if (ret) - goto err_undo_css; - } - } + /* save and update control masks and prepare csses */ + cgroup_save_control(cgrp); - /* - * At this point, cgroup_e_css() results reflect the new csses - * making the following cgroup_update_dfl_csses() properly update - * css associations of all tasks in the subtree. 
- */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - goto err_undo_css; - - /* - * All tasks are migrated out of disabled csses. Kill or hide - * them. A css is hidden when the userland requests it to be - * disabled while other subsystems are still depending on it. The - * css must not actively control resources and be in the vanilla - * state if it's made visible again later. Controllers which may - * be depended upon should provide ->css_reset() for this purpose. - */ - for_each_subsys(ss, ssid) { - if (!(disable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (css_disable & (1 << ssid)) { - kill_css(css); - } else { - css_clear_dir(css, NULL); - if (ss->css_reset) - ss->css_reset(css); - } - } - } - - /* - * The effective csses of all the descendants (excluding @cgrp) may - * have changed. Subsystems can optionally subscribe to this event - * by implementing ->css_e_css_changed() which is invoked if any of - * the effective csses seen from the css's cgroup may have changed. 
- */ - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); - struct cgroup_subsys_state *css; + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; - if (!ss->css_e_css_changed || !this_css) - continue; + ret = cgroup_apply_control(cgrp); - css_for_each_descendant_pre(css, this_css) - if (css != this_css) - ss->css_e_css_changed(css); - } + cgroup_finalize_control(cgrp, ret); kernfs_activate(cgrp->kn); ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; - -err_undo_css: - cgrp->subtree_control = old_sc; - cgrp->child_subsys_mask = old_ss; - - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (!css) - continue; - - if (css_enable & (1 << ssid)) - kill_css(css); - else - css_clear_dir(css, NULL); - } - } - goto out_unlock; } static int cgroup_events_show(struct seq_file *seq, void *v) @@ -3359,7 +3671,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, bool is_add) { struct cftype *cft, *cft_end = NULL; - int ret; + int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -3388,7 +3700,7 @@ restart: cgroup_rm_file(cgrp, cft); } } - return 0; + return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) @@ -3405,7 +3717,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; - if (cgroup_is_dead(cgrp)) + if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); @@ -3602,10 +3914,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) int count = 0; struct cgrp_cset_link *link; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - spin_unlock_bh(&css_set_lock); + 
spin_unlock_irq(&css_set_lock); return count; } @@ -3943,7 +4255,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, memset(it, 0, sizeof(*it)); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); it->ss = css->ss; @@ -3956,7 +4268,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, css_task_iter_advance_css_set(it); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /** @@ -3974,7 +4286,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) it->cur_task = NULL; } - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, @@ -3983,7 +4295,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) css_task_iter_advance(it); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return it->cur_task; } @@ -3997,10 +4309,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) void css_task_iter_end(struct css_task_iter *it) { if (it->cur_cset) { - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } if (it->cur_task) @@ -4026,15 +4338,18 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); - ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_err; @@ -4050,7 +4365,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 
css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to); + ret = cgroup_migrate(task, false, to->root); put_task_struct(task); } } while (task && !ret); @@ -4557,12 +4872,6 @@ static struct cftype cgroup_dfl_base_files[] = { }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_root_controllers_show, - }, - { - .name = "cgroup.controllers", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { @@ -4731,7 +5040,9 @@ static void css_release_work_fn(struct work_struct *work) * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + if (cgrp->kn) + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, + NULL); } mutex_unlock(&cgroup_mutex); @@ -4758,6 +5069,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; + css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; @@ -4802,6 +5114,9 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; + if (ss->css_reset) + ss->css_reset(css); + if (ss->css_offline) ss->css_offline(css); @@ -4812,17 +5127,16 @@ static void offline_css(struct cgroup_subsys_state *css) } /** - * create_css - create a cgroup_subsys_state + * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css - * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created if - * @visible. Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp. This function doesn't create the + * interface files. Returns 0 on success, -errno on failure. 
*/ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible) +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4833,7 +5147,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, css = ss->css_alloc(parent_css); if (IS_ERR(css)) - return PTR_ERR(css); + return css; init_and_link_css(css, ss, cgrp); @@ -4843,15 +5157,9 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) - goto err_free_percpu_ref; + goto err_free_css; css->id = err; - if (visible) { - err = css_populate_dir(css, NULL); - if (err) - goto err_free_id; - } - /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -4869,47 +5177,27 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, ss->warned_broken_hierarchy = true; } - return 0; + return css; err_list_del: list_del_rcu(&css->sibling); - css_clear_dir(css, NULL); -err_free_id: - cgroup_idr_remove(&ss->css_idr, css->id); -err_free_percpu_ref: - percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); - return err; + return ERR_PTR(err); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static struct cgroup *cgroup_create(struct cgroup *parent) { - struct cgroup *parent, *cgrp, *tcgrp; - struct cgroup_root *root; - struct cgroup_subsys *ss; - struct kernfs_node *kn; - int level, ssid, ret; - - /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. 
- */ - if (strchr(name, '\n')) - return -EINVAL; - - parent = cgroup_kn_lock_live(parent_kn); - if (!parent) - return -ENODEV; - root = parent->root; - level = parent->level + 1; + struct cgroup_root *root = parent->root; + struct cgroup *cgrp, *tcgrp; + int level = parent->level + 1; + int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp) + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); - if (!cgrp) { - ret = -ENOMEM; - goto out_unlock; - } + if (!cgrp) + return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -4940,20 +5228,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_free_id; - } - cgrp->kn = kn; - - /* - * This extra ref will be put in cgroup_free_fn() and guarantees - * that @cgrp->kn is always accessible. - */ - kernfs_get(kn); - cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ @@ -4967,51 +5241,90 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - ret = cgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; + /* + * On the default hierarchy, a child doesn't automatically inherit + * subtree_control from the parent. Each is configured manually. 
+ */ + if (!cgroup_on_dfl(cgrp)) + cgrp->subtree_control = cgroup_control(cgrp); - ret = css_populate_dir(&cgrp->self, NULL); + cgroup_propagate_control(cgrp); + + /* @cgrp doesn't have dir yet so the following will only create csses */ + ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; - /* let's create and online css's */ - for_each_subsys(ss, ssid) { - if (parent->child_subsys_mask & (1 << ssid)) { - ret = create_css(cgrp, ss, - parent->subtree_control & (1 << ssid)); - if (ret) - goto out_destroy; - } + return cgrp; + +out_cancel_ref: + percpu_ref_exit(&cgrp->self.refcnt); +out_free_cgrp: + kfree(cgrp); + return ERR_PTR(ret); +out_destroy: + cgroup_destroy_locked(cgrp); + return ERR_PTR(ret); +} + +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct cgroup *parent, *cgrp; + struct kernfs_node *kn; + int ret; + + /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = cgroup_kn_lock_live(parent_kn, false); + if (!parent) + return -ENODEV; + + cgrp = cgroup_create(parent); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; } + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_destroy; + } + cgrp->kn = kn; + /* - * On the default hierarchy, a child doesn't automatically inherit - * subtree_control from the parent. Each is configured manually. + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. 
*/ - if (!cgroup_on_dfl(cgrp)) { - cgrp->subtree_control = parent->subtree_control; - cgroup_refresh_child_subsys_mask(cgrp); - } + kernfs_get(kn); + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = css_populate_dir(&cgrp->self); + if (ret) + goto out_destroy; + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; + + /* let's create and online css's */ kernfs_activate(kn); ret = 0; goto out_unlock; -out_free_id: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); -out_cancel_ref: - percpu_ref_exit(&cgrp->self.refcnt); -out_free_cgrp: - kfree(cgrp); +out_destroy: + cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; - -out_destroy: - cgroup_destroy_locked(cgrp); - goto out_unlock; } /* @@ -5065,7 +5378,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - css_clear_dir(css, NULL); + css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive @@ -5114,6 +5427,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; + struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); @@ -5134,11 +5448,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). + * Mark @cgrp and the associated csets dead. The former prevents + * further task migration and child creation by disabling + * cgroup_lock_live_group(). The latter makes the csets ignored by + * the migration path. 
*/ cgrp->self.flags &= ~CSS_ONLINE; + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + link->cset->dead = true; + spin_unlock_irq(&css_set_lock); + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); @@ -5162,7 +5483,7 @@ static int cgroup_rmdir(struct kernfs_node *kn) struct cgroup *cgrp; int ret = 0; - cgrp = cgroup_kn_lock_live(kn); + cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; @@ -5178,6 +5499,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, + .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) @@ -5252,7 +5574,7 @@ int __init cgroup_init_early(void) for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, - "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, @@ -5269,7 +5591,7 @@ int __init cgroup_init_early(void) return 0; } -static unsigned long cgroup_disable_mask __initdata; +static u16 cgroup_disable_mask __initdata; /** * cgroup_init - cgroup initialization @@ -5280,18 +5602,23 @@ static unsigned long cgroup_disable_mask __initdata; int __init cgroup_init(void) { struct cgroup_subsys *ss; - unsigned long key; int ssid; + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + get_user_ns(init_cgroup_ns.user_ns); + mutex_lock(&cgroup_mutex); - /* Add init_css_set to the hash table */ - key = css_set_hash(init_css_set.subsys); - hash_add(css_set_table, &init_css_set.hlist, key); + /* + * Add init_css_set to the hash 
table so that dfl_root can link to + * it during init. + */ + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); @@ -5324,10 +5651,16 @@ int __init cgroup_init(void) continue; } + if (cgroup_ssid_no_v1(ssid)) + printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", + ss->name); + cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (!ss->dfl_cftypes) - cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + if (ss->implicit_on_dfl) + cgrp_dfl_implicit_ss_mask |= 1 << ss->id; + else if (!ss->dfl_cftypes) + cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); @@ -5340,6 +5673,11 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } + /* init_css_set.subsys[] has been updated, re-hash */ + hash_del(&init_css_set.hlist); + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); @@ -5391,14 +5729,14 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, goto out; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) + if (root == &cgrp_dfl_root && !cgrp_dfl_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -5424,7 +5762,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. 
*/ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path(cgrp, buf, PATH_MAX); + path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + current->nsproxy->cgroup_ns); if (!path) { retval = -ENAMETOOLONG; goto out_unlock; @@ -5443,7 +5782,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); kfree(buf); out: @@ -5513,11 +5852,11 @@ int cgroup_can_fork(struct task_struct *child) struct cgroup_subsys *ss; int i, j, ret; - for_each_subsys_which(ss, i, &have_canfork_callback) { + do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child); if (ret) goto out_revert; - } + } while_each_subsys_mask(); return 0; @@ -5588,13 +5927,13 @@ void cgroup_post_fork(struct task_struct *child) if (use_task_css_set_links) { struct css_set *cset; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); css_set_move_task(child, NULL, cset, false); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /* @@ -5602,8 +5941,9 @@ void cgroup_post_fork(struct task_struct *child) * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. 
*/ - for_each_subsys_which(ss, i, &have_fork_callback) + do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); + } while_each_subsys_mask(); } /** @@ -5638,16 +5978,17 @@ void cgroup_exit(struct task_struct *tsk) cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); } /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) + do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); + } while_each_subsys_mask(); } void cgroup_free(struct task_struct *task) @@ -5656,8 +5997,9 @@ void cgroup_free(struct task_struct *task) struct cgroup_subsys *ss; int ssid; - for_each_subsys_which(ss, ssid, &have_free_callback) + do_each_subsys_mask(ss, ssid, have_free_callback) { ss->free(task); + } while_each_subsys_mask(); put_css_set(cset); } @@ -5706,7 +6048,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); + spin_lock_irq(&css_set_lock); + path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_irq(&css_set_lock); if (!path) goto out; @@ -5750,6 +6094,33 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory 
dentry of interest @@ -5763,12 +6134,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) + if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || + !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); @@ -5890,6 +6262,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ +/* cgroup namespaces */ + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. 
*/ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + + cset = task_css_set(current); + get_css_set(cset); + + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? 
&ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) @@ -5940,7 +6439,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -5951,7 +6450,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; } @@ -5962,7 +6461,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -5985,7 +6484,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) overflow: seq_puts(seq, " ...\n"); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 5b9d39633ce9..d948e44c471e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,13 +22,90 @@ #include <linux/lockdep.h> #include <linux/tick.h> #include <linux/irq.h> +#include <linux/smpboot.h> + #include <trace/events/power.h> +#define CREATE_TRACE_POINTS +#include <trace/events/cpuhp.h> #include "smpboot.h" +/** + * 
cpuhp_cpu_state - Per cpu hotplug state storage + * @state: The current cpu state + * @target: The target state + * @thread: Pointer to the hotplug thread + * @should_run: Thread should execute + * @rollback: Perform a rollback + * @cb_state: The state for a single callback (install/uninstall) + * @cb: Single callback function (install/uninstall) + * @result: Result of the operation + * @done: Signal completion to the issuer of the task + */ +struct cpuhp_cpu_state { + enum cpuhp_state state; + enum cpuhp_state target; +#ifdef CONFIG_SMP + struct task_struct *thread; + bool should_run; + bool rollback; + enum cpuhp_state cb_state; + int (*cb)(unsigned int cpu); + int result; + struct completion done; +#endif +}; + +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); + +/** + * cpuhp_step - Hotplug state machine step + * @name: Name of the step + * @startup: Startup function of the step + * @teardown: Teardown function of the step + * @skip_onerr: Do not invoke the functions on error rollback + * Will go away once the notifiers are gone + * @cant_stop: Bringup/teardown can't be stopped at this step + */ +struct cpuhp_step { + const char *name; + int (*startup)(unsigned int cpu); + int (*teardown)(unsigned int cpu); + bool skip_onerr; + bool cant_stop; +}; + +static DEFINE_MUTEX(cpuhp_state_mutex); +static struct cpuhp_step cpuhp_bp_states[]; +static struct cpuhp_step cpuhp_ap_states[]; + +/** + * cpuhp_invoke_callback - Invoke the callbacks for a given state + * @cpu: The cpu for which the callback should be invoked + * @step: The step in the state machine + * @cb: The callback function to invoke + * + * Called from cpu hotplug and from the state register machinery + */ +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret = 0; + + if (cb) { + trace_cpuhp_enter(cpu, st->target, step, cb); + ret = cb(cpu); + trace_cpuhp_exit(cpu, 
st->state, step, ret); + } + return ret; +} + #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); +bool cpuhp_tasks_frozen; +EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when @@ -207,31 +284,296 @@ int __register_cpu_notifier(struct notifier_block *nb) return raw_notifier_chain_register(&cpu_chain, nb); } -static int __cpu_notify(unsigned long val, void *v, int nr_to_call, +static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call, int *nr_calls) { + unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0; + void *hcpu = (void *)(long)cpu; + int ret; - ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call, nr_calls); return notifier_to_errno(ret); } -static int cpu_notify(unsigned long val, void *v) +static int cpu_notify(unsigned long val, unsigned int cpu) { - return __cpu_notify(val, v, -1, NULL); + return __cpu_notify(val, cpu, -1, NULL); } -#ifdef CONFIG_HOTPLUG_CPU +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + +/* Notifier wrappers for transitioning to state machine */ +static int notify_prepare(unsigned int cpu) +{ + int nr_calls = 0; + int ret; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); + if (ret) { + nr_calls--; + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", + __func__, cpu); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + } + return ret; +} + +static int notify_online(unsigned int cpu) +{ + cpu_notify(CPU_ONLINE, cpu); + return 0; +} + +static int notify_starting(unsigned int cpu) +{ + cpu_notify(CPU_STARTING, cpu); + return 0; +} + +static int bringup_wait_for_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + wait_for_completion(&st->done); + return st->result; +} + 
+static int bringup_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + int ret; + + /* Arch-specific enabling code. */ + ret = __cpu_up(cpu, idle); + if (ret) { + cpu_notify(CPU_UP_CANCELED, cpu); + return ret; + } + ret = bringup_wait_for_ap(cpu); + BUG_ON(!cpu_online(cpu)); + return ret; +} + +/* + * Hotplug state machine related functions + */ +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st, steps); + break; + } + } + return ret; +} + +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + +static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = steps + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st, steps); + break; + } + } + return ret; +} + +/* + * The cpu hotplug threads manage the bringup and teardown of the cpus + */ +static void 
cpuhp_create(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + init_completion(&st->done); +} + +static int cpuhp_should_run(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + return st->should_run; +} + +/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ +static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); + + return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); +} + +/* Execute the online startup callbacks. Used to be CPU_ONLINE */ +static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); +} + +/* + * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke + * callbacks when a state gets [un]installed at runtime. + */ +static void cpuhp_thread_fun(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + int ret = 0; + + /* + * Paired with the mb() in cpuhp_kick_ap_work and + * cpuhp_invoke_ap_callback, so the work set is consistent visible. + */ + smp_mb(); + if (!st->should_run) + return; + + st->should_run = false; + + /* Single callback invocation for [un]install ? */ + if (st->cb) { + if (st->cb_state < CPUHP_AP_ONLINE) { + local_irq_disable(); + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + local_irq_enable(); + } else { + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + } + } else if (st->rollback) { + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + + undo_cpu_down(cpu, st, cpuhp_ap_states); + /* + * This is a momentary workaround to keep the notifier users + * happy. Will go away once we got rid of the notifiers. + */ + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); + st->rollback = false; + } else { + /* Cannot happen .... 
*/ + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + + /* Regular hotplug work */ + if (st->state < st->target) + ret = cpuhp_ap_online(cpu, st); + else if (st->state > st->target) + ret = cpuhp_ap_offline(cpu, st); + } + st->result = ret; + complete(&st->done); +} + +/* Invoke a single callback on a remote cpu */ +static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (!cpu_online(cpu)) + return 0; + + st->cb_state = state; + st->cb = cb; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + return st->result; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) +{ + st->result = 0; + st->cb = NULL; + /* + * Make sure the above stores are visible before should_run becomes + * true. 
Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); +} + +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + __cpuhp_kick_ap_work(st); + wait_for_completion(&st->done); + trace_cpuhp_exit(cpu, st->state, state, st->result); + return st->result; +} -static void cpu_notify_nofail(unsigned long val, void *v) +static struct smp_hotplug_thread cpuhp_threads = { + .store = &cpuhp_state.thread, + .create = &cpuhp_create, + .thread_should_run = cpuhp_should_run, + .thread_fn = cpuhp_thread_fun, + .thread_comm = "cpuhp/%u", + .selfparking = true, +}; + +void __init cpuhp_threads_init(void) { - BUG_ON(cpu_notify(val, v)); + BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); + kthread_unpark(this_cpu_read(cpuhp_state.thread)); } + +#ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); - void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); @@ -311,73 +653,58 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; +static int notify_down_prepare(unsigned int cpu) +{ + int err, nr_calls = 0; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + } + return err; +} + +static int notify_dying(unsigned int cpu) +{ + cpu_notify(CPU_DYING, cpu); + return 0; +} /* Take this CPU down. 
*/ static int take_cpu_down(void *_param) { - struct take_cpu_down_param *param = _param; - int err; + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); + int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; - cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Invoke the former CPU_DYING callbacks */ + for (; st->state > target; st->state--) { + struct cpuhp_step *step = cpuhp_ap_states + st->state; + + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - stop_machine_park((long)param->hcpu); + stop_machine_park(cpu); return 0; } -/* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int takedown_cpu(unsigned int cpu) { - int err, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - pr_warn("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - goto out_release; - } - - /* - * By now we've cleared cpu_active_mask, wait for all preempt-disabled - * and RCU users of this state to go away such that all new such users - * will observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * - * Do sync before park smpboot threads to take care the rcu boost case. 
- */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int err; + /* Park the smpboot threads */ + kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); /* @@ -389,12 +716,13 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { - /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + /* CPU refused to die */ irq_unlock_sparse(); - goto out_release; + /* Unpark the hotplug thread so we can rollback there */ + kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread); + return err; } BUG_ON(cpu_online(cpu)); @@ -405,10 +733,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) * * Wait for the stop thread to go away. */ - while (!per_cpu(cpu_dead_idle, cpu)) - cpu_relax(); - smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ - per_cpu(cpu_dead_idle, cpu) = false; + wait_for_completion(&st->done); + BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); @@ -417,20 +743,109 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* This actually kills the CPU. */ __cpu_die(cpu); - /* CPU is completely dead: tell everyone. Too late to complain. 
*/ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD | mod, hcpu); + return 0; +} +static int notify_dead(unsigned int cpu) +{ + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); + return 0; +} + +static void cpuhp_complete_idle_dead(void *arg) +{ + struct cpuhp_cpu_state *st = arg; + + complete(&st->done); +} + +void cpuhp_report_idle_dead(void) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + BUG_ON(st->state != CPUHP_AP_OFFLINE); + rcu_report_dead(smp_processor_id()); + st->state = CPUHP_AP_IDLE_DEAD; + /* + * We cannot call complete after rcu_report_dead() so we delegate it + * to an online cpu. + */ + smp_call_function_single(cpumask_first(cpu_online_mask), + cpuhp_complete_idle_dead, st, 0); +} + +#else +#define notify_down_prepare NULL +#define takedown_cpu NULL +#define notify_dead NULL +#define notify_dying NULL +#endif + +#ifdef CONFIG_HOTPLUG_CPU + +/* Requires cpu_add_remove_lock to be held */ +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, + enum cpuhp_state target) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int prev_state, ret = 0; + bool hasdied = false; -out_release: + if (num_online_cpus() == 1) + return -EBUSY; + + if (!cpu_present(cpu)) + return -EINVAL; + + cpu_hotplug_begin(); + + cpuhp_tasks_frozen = tasks_frozen; + + prev_state = st->state; + st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread. + */ + if (st->state > CPUHP_TEARDOWN_CPU) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + + /* + * We might have stopped still in the range of the AP hotplug + * thread. Nothing to do anymore. + */ + if (st->state > CPUHP_TEARDOWN_CPU) + goto out; + } + /* + * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need + * to do the further cleanups. 
+ */ + ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { + st->target = prev_state; + st->rollback = true; + cpuhp_kick_ap_work(cpu); + } + + hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; +out: cpu_hotplug_done(); - if (!err) - cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); - return err; + /* This post dead nonsense must die */ + if (!ret && hasdied) + cpu_notify_nofail(CPU_POST_DEAD, cpu); + return ret; } -int cpu_down(unsigned int cpu) +static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -441,100 +856,129 @@ int cpu_down(unsigned int cpu) goto out; } - err = _cpu_down(cpu, 0); + err = _cpu_down(cpu, 0, target); out: cpu_maps_update_done(); return err; } +int cpu_down(unsigned int cpu) +{ + return do_cpu_down(cpu, CPUHP_OFFLINE); +} EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ -/* - * Unpark per-CPU smpboot kthreads at CPU-online time. +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). 
*/ -static int smpboot_thread_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) +void notify_cpu_starting(unsigned int cpu) { - int cpu = (long)hcpu; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - switch (action & ~CPU_TASKS_FROZEN) { + while (st->state < target) { + struct cpuhp_step *step; - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smpboot_unpark_threads(cpu); - break; - - default: - break; + st->state++; + step = cpuhp_ap_states + st->state; + cpuhp_invoke_callback(cpu, st->state, step->startup); } - - return NOTIFY_OK; } -static struct notifier_block smpboot_thread_notifier = { - .notifier_call = smpboot_thread_call, - .priority = CPU_PRI_SMPBOOT, -}; - -void smpboot_thread_init(void) +/* + * Called from the idle task. We need to set active here, so we can kick off + * the stopper thread and unpark the smpboot threads. If the target state is + * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the + * cpu further. + */ +void cpuhp_online_idle(enum cpuhp_state state) { - register_cpu_notifier(&smpboot_thread_notifier); + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + unsigned int cpu = smp_processor_id(); + + /* Happens for the boot cpu */ + if (state != CPUHP_AP_ONLINE_IDLE) + return; + + st->state = CPUHP_AP_ONLINE_IDLE; + + /* Unpark the stopper thread and the hotplug thread of this cpu */ + stop_machine_unpark(cpu); + kthread_unpark(st->thread); + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) + __cpuhp_kick_ap_work(st); + else + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? 
CPU_TASKS_FROZEN : 0; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; + int ret = 0; cpu_hotplug_begin(); - if (cpu_online(cpu) || !cpu_present(cpu)) { + if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } - idle = idle_thread_get(cpu); - if (IS_ERR(idle)) { - ret = PTR_ERR(idle); - goto out; - } - - ret = smpboot_create_threads(cpu); - if (ret) + /* + * The caller of do_cpu_up might have raced with another + * caller. Ignore it for now. + */ + if (st->state >= target) goto out; - ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); - if (ret) { - nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - goto out_notify; + if (st->state == CPUHP_OFFLINE) { + /* Let it fail before we try to bring the cpu up */ + idle = idle_thread_get(cpu); + if (IS_ERR(idle)) { + ret = PTR_ERR(idle); + goto out; + } } - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu, idle); - - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); + cpuhp_tasks_frozen = tasks_frozen; - /* Now call notifier in preparation. */ - cpu_notify(CPU_ONLINE | mod, hcpu); + st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread once more. + */ + if (st->state > CPUHP_BRINGUP_CPU) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + } -out_notify: - if (ret != 0) - __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + /* + * Try to reach the target state. We max out on the BP at + * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is + * responsible for bringing it up to the target state. 
+ */ + target = min((int)target, CPUHP_BRINGUP_CPU); + ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); - return ret; } -int cpu_up(unsigned int cpu) +static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -558,12 +1002,16 @@ int cpu_up(unsigned int cpu) goto out; } - err = _cpu_up(cpu, 0); - + err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } + +int cpu_up(unsigned int cpu) +{ + return do_cpu_up(cpu, CPUHP_ONLINE); +} EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP @@ -586,7 +1034,7 @@ int disable_nonboot_cpus(void) if (cpu == first_cpu) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); - error = _cpu_down(cpu, 1); + error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); @@ -636,7 +1084,7 @@ void enable_nonboot_cpus(void) for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); - error = _cpu_up(cpu, 1); + error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); @@ -709,26 +1157,479 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ +#endif /* CONFIG_SMP */ + +/* Boot processor state steps */ +static struct cpuhp_step cpuhp_bp_states[] = { + [CPUHP_OFFLINE] = { + .name = "offline", + .startup = NULL, + .teardown = NULL, + }, +#ifdef CONFIG_SMP + [CPUHP_CREATE_THREADS]= { + .name = "threads:create", + .startup = smpboot_create_threads, + .teardown = NULL, + .cant_stop = true, + }, + /* + * Preparatory and dead notifiers. Will be replaced once the notifiers + * are converted to states. 
+ */ + [CPUHP_NOTIFY_PREPARE] = { + .name = "notify:prepare", + .startup = notify_prepare, + .teardown = notify_dead, + .skip_onerr = true, + .cant_stop = true, + }, + /* Kicks the plugged cpu into life */ + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup = bringup_cpu, + .teardown = NULL, + .cant_stop = true, + }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. + */ + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup = NULL, + .teardown = takedown_cpu, + .cant_stop = true, + }, +#endif +}; + +/* Application processor state steps */ +static struct cpuhp_step cpuhp_ap_states[] = { +#ifdef CONFIG_SMP + /* Final state before CPU kills itself */ + [CPUHP_AP_IDLE_DEAD] = { + .name = "idle:dead", + }, + /* + * Last state before CPU enters the idle loop to die. Transient state + * for synchronization. + */ + [CPUHP_AP_OFFLINE] = { + .name = "ap:offline", + .cant_stop = true, + }, + /* First state is scheduler control. Interrupts are disabled */ + [CPUHP_AP_SCHED_STARTING] = { + .name = "sched:starting", + .startup = sched_cpu_starting, + .teardown = sched_cpu_dying, + }, + /* + * Low level startup/teardown notifiers. Run with interrupts + * disabled. Will be removed once the notifiers are converted to + * states. + */ + [CPUHP_AP_NOTIFY_STARTING] = { + .name = "notify:starting", + .startup = notify_starting, + .teardown = notify_dying, + .skip_onerr = true, + .cant_stop = true, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient + * state for synchronsization */ + [CPUHP_AP_ONLINE] = { + .name = "ap:online", + }, + /* Handle smpboot threads park/unpark */ + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = NULL, + }, + /* + * Online/down_prepare notifiers. Will be removed once the notifiers + * are converted to states. 
+ */ + [CPUHP_AP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + .skip_onerr = true, + }, +#endif + /* + * The dynamically registered state space is here + */ + +#ifdef CONFIG_SMP + /* Last state is scheduler control setting the cpu active */ + [CPUHP_AP_ACTIVE] = { + .name = "sched:active", + .startup = sched_cpu_activate, + .teardown = sched_cpu_deactivate, + }, +#endif + + /* CPU is fully up and running. */ + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + +/* Sanity check for callbacks */ +static int cpuhp_cb_check(enum cpuhp_state state) +{ + if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) + return -EINVAL; + return 0; +} + +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitely in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + +static void cpuhp_store_callbacks(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + /* (Un)Install the callbacks for further cpu hotplug operations */ + struct cpuhp_step *sp; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(state); + sp->startup = startup; + sp->teardown = teardown; + sp->name = name; + mutex_unlock(&cpuhp_state_mutex); +} + +static void *cpuhp_get_teardown_cb(enum cpuhp_state state) +{ + return cpuhp_get_step(state)->teardown; +} + +/* + * Call the startup/teardown function for a step either on the AP or + * on the current CPU. 
+ */ +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int), bool bringup) +{ + int ret; + + if (!cb) + return 0; + /* + * The non AP bound callbacks can fail on bringup. On teardown + * e.g. module removal we crash for now. + */ +#ifdef CONFIG_SMP + if (cpuhp_is_ap_state(state)) + ret = cpuhp_invoke_ap_callback(cpu, state, cb); + else + ret = cpuhp_invoke_callback(cpu, state, cb); +#else + ret = cpuhp_invoke_callback(cpu, state, cb); +#endif + BUG_ON(ret && !bringup); + return ret; +} + +/* + * Called from __cpuhp_setup_state on a recoverable failure. + * + * Note: The teardown callbacks for rollback are not allowed to fail! + */ +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + int (*teardown)(unsigned int cpu)) +{ + int cpu; + + if (!teardown) + return; + + /* Roll back the already executed steps on the other cpus */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpu >= failedcpu) + break; + + /* Did we invoke the startup call on that cpu ? */ + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +} + +/* + * Returns a free for dynamic slot assignment of the Online state. The states + * are protected by the cpuhp_slot_states mutex and an empty slot is identified + * by having no name assigned. 
+ */ +static int cpuhp_reserve_state(enum cpuhp_state state) +{ + enum cpuhp_state i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { + if (cpuhp_ap_states[i].name) + continue; + + cpuhp_ap_states[i].name = "Reserved"; + mutex_unlock(&cpuhp_state_mutex); + return i; + } + mutex_unlock(&cpuhp_state_mutex); + WARN(1, "No more dynamic states available for CPU hotplug\n"); + return -ENOSPC; +} + /** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started + * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * @state: The state to setup + * @invoke: If true, the startup function is invoked for cpus where + * cpu state >= @state + * @startup: startup callback function + * @teardown: teardown callback function * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). 
+ * Returns 0 if successful, otherwise a proper error code */ -void notify_cpu_starting(unsigned int cpu) +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) { - unsigned long val = CPU_STARTING; + int cpu, ret = 0; + int dyn_state = 0; -#ifdef CONFIG_PM_SLEEP_SMP - if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - cpu_notify(val, (void *)(long)cpu); + if (cpuhp_cb_check(state) || !name) + return -EINVAL; + + get_online_cpus(); + + /* currently assignments for the ONLINE state are possible */ + if (state == CPUHP_AP_ONLINE_DYN) { + dyn_state = 1; + ret = cpuhp_reserve_state(state); + if (ret < 0) + goto out; + state = ret; + } + + cpuhp_store_callbacks(state, name, startup, teardown); + + if (!invoke || !startup) + goto out; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, startup, true); + if (ret) { + cpuhp_rollback_install(cpu, state, teardown); + cpuhp_store_callbacks(state, NULL, NULL, NULL); + goto out; + } + } +out: + put_online_cpus(); + if (!ret && dyn_state) + return state; + return ret; } +EXPORT_SYMBOL(__cpuhp_setup_state); -#endif /* CONFIG_SMP */ +/** + * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * @state: The state to remove + * @invoke: If true, the teardown function is invoked for cpus where + * cpu state >= @state + * + * The teardown callback is currently not allowed to fail. Think + * about module removal! 
+ */ +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + get_online_cpus(); + + if (!invoke || !teardown) + goto remove; + + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +remove: + cpuhp_store_callbacks(state, NULL, NULL, NULL); + put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state); + +#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) +static ssize_t show_cpuhp_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->state); +} +static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); + +static ssize_t write_cpuhp_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int target, ret; + + ret = kstrtoint(buf, 10, &target); + if (ret) + return ret; + +#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL + if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) + return -EINVAL; +#else + if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) + return -EINVAL; +#endif + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(target); + ret = !sp->name || sp->cant_stop ? -EINVAL : 0; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + if (st->state < target) + ret = do_cpu_up(dev->id, target); + else + ret = do_cpu_down(dev->id, target); + + unlock_device_hotplug(); + return ret ? 
ret : count; +} + +static ssize_t show_cpuhp_target(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->target); +} +static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static struct attribute *cpuhp_cpu_attrs[] = { + &dev_attr_state.attr, + &dev_attr_target.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_attr_group = { + .attrs = cpuhp_cpu_attrs, + .name = "hotplug", + NULL +}; + +static ssize_t show_cpuhp_states(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t cur, res = 0; + int i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { + struct cpuhp_step *sp = cpuhp_get_step(i); + + if (sp->name) { + cur = sprintf(buf, "%3d: %s\n", i, sp->name); + buf += cur; + res += cur; + } + } + mutex_unlock(&cpuhp_state_mutex); + return res; +} +static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); + +static struct attribute *cpuhp_cpu_root_attrs[] = { + &dev_attr_states.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_root_attr_group = { + .attrs = cpuhp_cpu_root_attrs, + .name = "hotplug", + NULL +}; + +static int __init cpuhp_sysfs_init(void) +{ + int cpu, ret; + + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_cpu_root_attr_group); + if (ret) + return ret; + + for_each_possible_cpu(cpu) { + struct device *dev = get_cpu_device(cpu); + + if (!dev) + continue; + ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); + if (ret) + return ret; + } + return 0; +} +device_initcall(cpuhp_sysfs_init); +#endif /* * cpu_bit_bitmap[] is a special, "compressed" data structure that @@ -789,3 +1690,25 @@ void init_cpu_online(const struct cpumask *src) { cpumask_copy(&__cpu_online_mask, src); } + +/* + * Activate the first processor. 
+ */ +void __init boot_cpu_init(void) +{ + int cpu = smp_processor_id(); + + /* Mark the boot cpu "present", "online" etc for SMP and UP case */ + set_cpu_online(cpu, true); + set_cpu_active(cpu, true); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); +} + +/* + * Must be called _AFTER_ setting up the per_cpu areas + */ +void __init boot_cpu_state_init(void) +{ + per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; +} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 41989ab4db57..73e93e53884d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -58,11 +58,10 @@ #include <asm/uaccess.h> #include <linux/atomic.h> #include <linux/mutex.h> -#include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/wait.h> -struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, } } -void cpuset_post_attach_flush(void) +static void cpuset_post_attach(void) { flush_workqueue(cpuset_migrate_mm_wq); } @@ -2087,9 +2086,10 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, + .post_attach = cpuset_post_attach, .bind = cpuset_bind, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; /** @@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -int __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? 
*/ unsigned long flags; if (in_interrupt()) - return 1; + return true; if (node_isset(node, current->mems_allowed)) - return 1; + return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; + return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; + return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); @@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) static int cpuset_spread_node(int *rotor) { - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; + return *rotor = next_node_in(*rotor, current->mems_allowed); } int cpuset_mem_spread_node(void) @@ -2714,10 +2708,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, goto out; retval = -ENAMETOOLONG; - rcu_read_lock(); - css = task_css(tsk, cpuset_cgrp_id); - p = cgroup_path(css->cgroup, buf, PATH_MAX); - rcu_read_unlock(); + css = task_get_css(tsk, cpuset_cgrp_id); + p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); if (!p) goto out_free; seq_puts(m, p); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index e1dbf4a2c69e..90ff129c88a2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); -#ifdef CONFIG_DEBUG_RODATA if (!bp->bp_type) { kdb_printf("Software breakpoints are unavailable.\n" - " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " Boot the kernel with rodata=off\n" " OR use hw breaks: help 
bph\n"); } -#endif return 1; } return 0; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 9c418002b8c1..179ef4640964 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -18,18 +18,28 @@ struct callchain_cpus_entries { struct perf_callchain_entry *cpu_entries[0]; }; +int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; +int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; + +static inline size_t perf_callchain_entry__sizeof(void) +{ + return (sizeof(struct perf_callchain_entry) + + sizeof(__u64) * (sysctl_perf_event_max_stack + + sysctl_perf_event_max_contexts_per_stack)); +} + static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, +__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } -__weak void perf_callchain_user(struct perf_callchain_entry *entry, +__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -73,7 +83,7 @@ static int alloc_callchain_buffers(void) if (!entries) return -ENOMEM; - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, @@ -147,7 +157,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) cpu = smp_processor_id(); - return &entries->cpu_entries[cpu][*rctx]; + return (((void *)entries->cpu_entries[cpu]) + + (*rctx * perf_callchain_entry__sizeof())); } static void @@ -159,15 +170,25 @@ put_callchain_entry(int rctx) struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { - int rctx; - struct perf_callchain_entry *entry; - - int 
kernel = !event->attr.exclude_callchain_kernel; - int user = !event->attr.exclude_callchain_user; + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user; + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; if (!kernel && !user) return NULL; + return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); +} + +struct perf_callchain_entry * +get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + u32 max_stack, bool crosstask, bool add_mark) +{ + struct perf_callchain_entry *entry; + struct perf_callchain_entry_ctx ctx; + int rctx; + entry = get_callchain_entry(&rctx); if (rctx == -1) return NULL; @@ -175,11 +196,16 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!entry) goto exit_put; - entry->nr = 0; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = init_nr; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(&ctx, regs); } if (user) { @@ -191,14 +217,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) } if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) + if (crosstask) goto exit_put; - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + if (add_mark) + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + perf_callchain_user(&ctx, regs); } } @@ -207,3 +231,30 @@ exit_put: return entry; } + +/* + * Used for sysctl_perf_event_max_stack and + * sysctl_perf_event_max_contexts_per_stack. 
+ */ +int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *value = table->data; + int new_value = *value, ret; + struct ctl_table new_table = *table; + + new_table.data = &new_value; + ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + mutex_lock(&callchain_mutex); + if (atomic_read(&nr_callchain_events)) + ret = -EBUSY; + else + *value = new_value; + + mutex_unlock(&callchain_mutex); + + return ret; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index 614614821f00..85cd41878a74 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -44,6 +44,8 @@ #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> +#include <linux/namei.h> +#include <linux/parser.h> #include "internal.h" @@ -351,7 +353,7 @@ static struct srcu_struct pmus_srcu; * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ -int sysctl_perf_event_paranoid __read_mostly = 1; +int sysctl_perf_event_paranoid __read_mostly = 2; /* Minimum for 512 kiB + 1 user control page */ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ @@ -376,8 +378,11 @@ static void update_perf_cpu_limits(void) u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; - do_div(tmp, 100); - ACCESS_ONCE(perf_sample_allowed_ns) = tmp; + tmp = div_u64(tmp, 100); + if (!tmp) + tmp = 1; + + WRITE_ONCE(perf_sample_allowed_ns, tmp); } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -409,7 +414,14 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, if (ret || !write) return ret; - update_perf_cpu_limits(); + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) { + printk(KERN_WARNING + "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); + WRITE_ONCE(perf_sample_allowed_ns, 0); + } else 
{ + update_perf_cpu_limits(); + } return 0; } @@ -423,62 +435,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); +static u64 __report_avg; +static u64 __report_allowed; + static void perf_duration_warn(struct irq_work *w) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; - - local_samples_len = __this_cpu_read(running_sample_length); - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - printk_ratelimited(KERN_WARNING - "perf interrupt took too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, - sysctl_perf_event_sample_rate); + "perf: interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + __report_avg, __report_allowed, + sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; + u64 max_len = READ_ONCE(perf_sample_allowed_ns); + u64 running_len; + u64 avg_len; + u32 max; - if (allowed_ns == 0) + if (max_len == 0) return; - /* decay the counter by 1 average sample */ - local_samples_len = __this_cpu_read(running_sample_length); - local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; - local_samples_len += sample_len_ns; - __this_cpu_write(running_sample_length, local_samples_len); + /* Decay the counter by 1 average sample. */ + running_len = __this_cpu_read(running_sample_length); + running_len -= running_len/NR_ACCUMULATED_SAMPLES; + running_len += sample_len_ns; + __this_cpu_write(running_sample_length, running_len); /* - * note: this will be biased artifically low until we have - * seen NR_ACCUMULATED_SAMPLES. 
Doing it this way keeps us + * Note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - - if (avg_local_sample_len <= allowed_ns) + avg_len = running_len/NR_ACCUMULATED_SAMPLES; + if (avg_len <= max_len) return; - if (max_samples_per_tick <= 1) - return; + __report_avg = avg_len; + __report_allowed = max_len; - max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); - sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; - perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + /* + * Compute a throttle threshold 25% below the current duration. + */ + avg_len += avg_len / 4; + max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; + if (avg_len < max) + max /= (u32)avg_len; + else + max = 1; - update_perf_cpu_limits(); + WRITE_ONCE(perf_sample_allowed_ns, avg_len); + WRITE_ONCE(max_samples_per_tick, max); + + sysctl_perf_event_sample_rate = max * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { - early_printk("perf interrupt took too long (%lld > %lld), lowering " + early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, + __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } @@ -1090,6 +1108,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. 
* * Lock order: + * cred_guard_mutex * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -1910,8 +1929,13 @@ event_sched_in(struct perf_event *event, if (event->state <= PERF_EVENT_STATE_OFF) return 0; - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); + WRITE_ONCE(event->oncpu, smp_processor_id()); + /* + * Order event::oncpu write to happen before the ACTIVE state + * is visible. + */ + smp_wmb(); + WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several @@ -2343,6 +2367,112 @@ void perf_event_enable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_enable); +struct stop_event_data { + struct perf_event *event; + unsigned int restart; +}; + +static int __perf_event_stop(void *info) +{ + struct stop_event_data *sd = info; + struct perf_event *event = sd->event; + + /* if it's already INACTIVE, do nothing */ + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * There is a window with interrupts enabled before we get here, + * so we need to check again lest we try to stop another CPU's event. + */ + if (READ_ONCE(event->oncpu) != smp_processor_id()) + return -EAGAIN; + + event->pmu->stop(event, PERF_EF_UPDATE); + + /* + * May race with the actual stop (through perf_pmu_output_stop()), + * but it is only used for events with AUX ring buffer, and such + * events will refuse to restart because of rb::aux_mmap_count==0, + * see comments in perf_aux_output_begin(). + * + * Since this is happening on a event-local CPU, no trace is lost + * while restarting. 
+ */ + if (sd->restart) + event->pmu->start(event, PERF_EF_START); + + return 0; +} + +static int perf_event_restart(struct perf_event *event) +{ + struct stop_event_data sd = { + .event = event, + .restart = 1, + }; + int ret = 0; + + do { + if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) + return 0; + + /* matches smp_wmb() in event_sched_in() */ + smp_rmb(); + + /* + * We only want to restart ACTIVE events, so if the event goes + * inactive here (event->oncpu==-1), there's nothing more to do; + * fall through with ret==-ENXIO. + */ + ret = cpu_function_call(READ_ONCE(event->oncpu), + __perf_event_stop, &sd); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * In order to contain the amount of racy and tricky in the address filter + * configuration management, it is a two part process: + * + * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, + * we update the addresses of corresponding vmas in + * event::addr_filters_offs array and bump the event::addr_filters_gen; + * (p2) when an event is scheduled in (pmu::add), it calls + * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() + * if the generation has changed since the previous call. + * + * If (p1) happens while the event is active, we restart it to force (p2). + * + * (1) perf_addr_filters_apply(): adjusting filters' offsets based on + * pre-existing mappings, called once when new filters arrive via SET_FILTER + * ioctl; + * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly + * registered mapping, called for every new mmap(), with mm::mmap_sem down + * for reading; + * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process + * of exec. 
+ */ +void perf_event_addr_filters_sync(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + if (!has_addr_filter(event)) + return; + + raw_spin_lock(&ifh->lock); + if (event->addr_filters_gen != event->hw.addr_filters_gen) { + event->pmu->addr_filters_sync(event); + event->hw.addr_filters_gen = event->addr_filters_gen; + } + raw_spin_unlock(&ifh->lock); +} +EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); + static int _perf_event_refresh(struct perf_event *event, int refresh) { /* @@ -2402,14 +2532,24 @@ static void ctx_sched_out(struct perf_event_context *ctx, cpuctx->task_ctx = NULL; } - is_active ^= ctx->is_active; /* changed bits */ - + /* + * Always update time if it was set; not only when it changes. + * Otherwise we can 'forget' to update time for any but the last + * context we sched out. For example: + * + * ctx_sched_out(.event_type = EVENT_FLEXIBLE) + * ctx_sched_out(.event_type = EVENT_PINNED) + * + * would only update time for the pinned events. 
+ */ if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx); } + is_active ^= ctx->is_active; /* changed bits */ + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; @@ -3112,17 +3252,6 @@ done: return rotate; } -#ifdef CONFIG_NO_HZ_FULL -bool perf_event_can_stop_tick(void) -{ - if (atomic_read(&nr_freq_events) || - __this_cpu_read(perf_throttled_count)) - return false; - else - return true; -} -#endif - void perf_event_task_tick(void) { struct list_head *head = this_cpu_ptr(&active_ctx_list); @@ -3133,6 +3262,7 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); + tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); @@ -3192,16 +3322,6 @@ out: put_ctx(clone_ctx); } -void perf_event_exec(void) -{ - int ctxn; - - rcu_read_lock(); - for_each_task_context_nr(ctxn) - perf_event_enable_on_exec(ctxn); - rcu_read_unlock(); -} - struct perf_read_data { struct perf_event *event; bool group; @@ -3405,7 +3525,6 @@ static struct task_struct * find_lively_task_by_vpid(pid_t vpid) { struct task_struct *task; - int err; rcu_read_lock(); if (!vpid) @@ -3419,16 +3538,7 @@ find_lively_task_by_vpid(pid_t vpid) if (!task) return ERR_PTR(-ESRCH); - /* Reuse ptrace permission checks for now. 
*/ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto errout; - return task; -errout: - put_task_struct(task); - return ERR_PTR(err); - } /* @@ -3564,6 +3674,28 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } +#ifdef CONFIG_NO_HZ_FULL +static DEFINE_SPINLOCK(nr_freq_lock); +#endif + +static void unaccount_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + spin_lock(&nr_freq_lock); + if (atomic_dec_and_test(&nr_freq_events)) + tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void unaccount_freq_event(void) +{ + if (tick_nohz_full_enabled()) + unaccount_freq_event_nohz(); + else + atomic_dec(&nr_freq_events); +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -3580,7 +3712,7 @@ static void unaccount_event(struct perf_event *event) if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) - atomic_dec(&nr_freq_events); + unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); @@ -3691,6 +3823,9 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head); + static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3718,6 +3853,8 @@ static void _free_event(struct perf_event *event) } perf_event_free_bpf_prog(event); + perf_addr_filters_splice(event, NULL); + kfree(event->addr_filters_offs); if (event->destroy) event->destroy(event); @@ -3725,10 +3862,8 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } + exclusive_event_destroy(event); + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } @@ -4198,6 +4333,14 @@ static void 
__perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(ctx->pmu); + /* + * We could be throttled; unthrottle now to avoid the tick + * trying to unthrottle while we already re-started the event. + */ + if (event->hw.interrupts == MAX_INTERRUPTS) { + event->hw.interrupts = 0; + perf_log_throttle(event, 1); + } event->pmu->stop(event, PERF_EF_UPDATE); } @@ -4306,6 +4449,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg); + case PERF_EVENT_IOC_PAUSE_OUTPUT: { + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb || !rb->nr_pages) { + rcu_read_unlock(); + return -EINVAL; + } + rb_toggle_paused(rb, !!arg); + rcu_read_unlock(); + return 0; + } default: return -ENOTTY; } @@ -4622,6 +4778,8 @@ static void perf_mmap_open(struct vm_area_struct *vma) event->pmu->event_mapped(event); } +static void perf_pmu_output_stop(struct perf_event *event); + /* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). @@ -4649,10 +4807,22 @@ static void perf_mmap_close(struct vm_area_struct *vma) */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + /* + * Stop all AUX events that are writing to this buffer, + * so that we can free its AUX pages and corresponding PMU + * data. Note that after rb::aux_mmap_count dropped to zero, + * they won't start any more (see perf_aux_output_begin()). 
+ */ + perf_pmu_output_stop(event); + + /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + /* this has to be the last one */ rb_free_aux(rb); + WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + mutex_unlock(&event->mmap_mutex); } @@ -5593,9 +5763,13 @@ void perf_prepare_sample(struct perf_event_header *header, } } -void perf_event_output(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void __always_inline +__perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs, + int (*output_begin)(struct perf_output_handle *, + struct perf_event *, + unsigned int)) { struct perf_output_handle handle; struct perf_event_header header; @@ -5605,7 +5779,7 @@ void perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (output_begin(&handle, event, header.size)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -5616,6 +5790,30 @@ exit: rcu_read_unlock(); } +void +perf_event_output_forward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_forward); +} + +void +perf_event_output_backward(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin_backward); +} + +void +perf_event_output(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + __perf_event_output(event, data, regs, perf_output_begin); +} + /* * read event_id */ @@ -5661,15 +5859,18 @@ typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); static void perf_event_aux_ctx(struct perf_event_context *ctx, perf_event_aux_output_cb output, - void *data) + void *data, bool all) { struct perf_event *event; 
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->state < PERF_EVENT_STATE_INACTIVE) - continue; - if (!event_filter_match(event)) - continue; + if (!all) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + } + output(event, data); } } @@ -5680,7 +5881,7 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, { rcu_read_lock(); preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); + perf_event_aux_ctx(task_ctx, output, data, false); preempt_enable(); rcu_read_unlock(); } @@ -5710,13 +5911,13 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data); + perf_event_aux_ctx(&cpuctx->ctx, output, data, false); ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data); + perf_event_aux_ctx(ctx, output, data, false); next: put_cpu_ptr(pmu->pmu_cpu_context); } @@ -5724,6 +5925,134 @@ next: } /* + * Clear all file-based filters at exec, they'll have to be + * re-instated when/if these objects are mmapped again. 
+ */ +static void perf_event_addr_filters_exec(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + unsigned long flags; + + if (!has_addr_filter(event)) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (filter->inode) { + event->addr_filters_offs[count] = 0; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +void perf_event_exec(void) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = current->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctxn); + + perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + true); + } + rcu_read_unlock(); +} + +struct remote_output { + struct ring_buffer *rb; + int err; +}; + +static void __perf_event_output_stop(struct perf_event *event, void *data) +{ + struct perf_event *parent = event->parent; + struct remote_output *ro = data; + struct ring_buffer *rb = ro->rb; + struct stop_event_data sd = { + .event = event, + }; + + if (!has_aux(event)) + return; + + if (!parent) + parent = event; + + /* + * In case of inheritance, it will be the parent that links to the + * ring-buffer, but it will be the child that's actually using it: + */ + if (rcu_dereference(parent->rb) == rb) + ro->err = __perf_event_stop(&sd); +} + +static int __perf_pmu_output_stop(void *info) +{ + struct perf_event *event = info; + struct pmu *pmu = event->pmu; + struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct remote_output ro = { + .rb = event->rb, + }; + + rcu_read_lock(); + perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + if (cpuctx->task_ctx) + perf_event_aux_ctx(cpuctx->task_ctx, 
__perf_event_output_stop, + &ro, false); + rcu_read_unlock(); + + return ro.err; +} + +static void perf_pmu_output_stop(struct perf_event *event) +{ + struct perf_event *iter; + int err, cpu; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { + /* + * For per-CPU events, we need to make sure that neither they + * nor their children are running; for cpu==-1 events it's + * sufficient to stop the event itself if it's active, since + * it can't have children. + */ + cpu = iter->cpu; + if (cpu == -1) + cpu = READ_ONCE(iter->oncpu); + + if (cpu == -1) + continue; + + err = cpu_function_call(cpu, __perf_pmu_output_stop, event); + if (err == -EAGAIN) { + rcu_read_unlock(); + goto restart; + } + } + rcu_read_unlock(); +} + +/* * task tracking -- fork/exit * * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task @@ -6132,6 +6461,87 @@ got_name: kfree(buf); } +/* + * Whether this @filter depends on a dynamic object which is not loaded + * yet or its load addresses are not known. + */ +static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) +{ + return filter->filter && filter->inode; +} + +/* + * Check whether inode and address range match filter criteria. 
+ */ +static bool perf_addr_filter_match(struct perf_addr_filter *filter, + struct file *file, unsigned long offset, + unsigned long size) +{ + if (filter->inode != file->f_inode) + return false; + + if (filter->offset > offset + size) + return false; + + if (filter->offset + filter->size < offset) + return false; + + return true; +} + +static void __perf_addr_filters_adjust(struct perf_event *event, void *data) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct vm_area_struct *vma = data; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; + struct file *file = vma->vm_file; + struct perf_addr_filter *filter; + unsigned int restart = 0, count = 0; + + if (!has_addr_filter(event)) + return; + + if (!file) + return; + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + if (perf_addr_filter_match(filter, file, off, + vma->vm_end - vma->vm_start)) { + event->addr_filters_offs[count] = vma->vm_start; + restart++; + } + + count++; + } + + if (restart) + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + if (restart) + perf_event_restart(event); +} + +/* + * Adjust all task's events' filters to the new vma + */ +static void perf_addr_filters_adjust(struct vm_area_struct *vma) +{ + struct perf_event_context *ctx; + int ctxn; + + rcu_read_lock(); + for_each_task_context_nr(ctxn) { + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (!ctx) + continue; + + perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + } + rcu_read_unlock(); +} + void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -6163,6 +6573,7 @@ void perf_event_mmap(struct vm_area_struct *vma) /* .flags (attr_mmap2 only) */ }; + perf_addr_filters_adjust(vma); perf_event_mmap_event(&mmap_event); } @@ -6424,9 +6835,9 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { 
__this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); - tick_nohz_full_kick(); ret = 1; } } @@ -6454,10 +6865,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - if (event->overflow_handler) - event->overflow_handler(event, data, regs); - else - perf_event_output(event, data, regs); + event->overflow_handler(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -6690,7 +7098,7 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -inline void perf_swevent_put_recursion_context(int rctx) +void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); @@ -6785,7 +7193,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) kfree_rcu(hlist, rcu_head); } -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -6797,15 +7205,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) mutex_unlock(&swhash->hlist_mutex); } -static void swevent_hlist_put(struct perf_event *event) +static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; @@ -6828,14 +7236,13 @@ exit: return err; } -static int swevent_hlist_get(struct perf_event *event) +static int swevent_hlist_get(void) { - int err; - int cpu, failed_cpu; + int err, cpu, failed_cpu; get_online_cpus(); for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); + err = 
swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; @@ -6848,7 +7255,7 @@ fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } put_online_cpus(); @@ -6864,7 +7271,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); + swevent_hlist_put(); } static int perf_swevent_init(struct perf_event *event) @@ -6895,7 +7302,7 @@ static int perf_swevent_init(struct perf_event *event) if (!event->parent) { int err; - err = swevent_hlist_get(event); + err = swevent_hlist_get(); if (err) return err; @@ -6953,7 +7360,26 @@ static int perf_tp_event_match(struct perf_event *event, return 1; } -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, +void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, + struct trace_event_call *call, u64 count, + struct pt_regs *regs, struct hlist_head *head, + struct task_struct *task) +{ + struct bpf_prog *prog = call->prog; + + if (prog) { + *(struct pt_regs **)raw_data = regs; + if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + perf_swevent_put_recursion_context(rctx); + return; + } + } + perf_tp_event(call->event.type, count, raw_data, size, regs, head, + rctx, task); +} +EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); + +void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) { @@ -6965,9 +7391,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, .data = record, }; - perf_sample_data_init(&data, addr, 0); + perf_sample_data_init(&data, 0, 0); data.raw = &raw; + perf_trace_buf_update(record, event_type); + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); @@ -7045,24 +7473,6 @@ 
static inline void perf_tp_register(void) perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - char *filter_str; - int ret; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; - - filter_str = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(filter_str)) - return PTR_ERR(filter_str); - - ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); - - kfree(filter_str); - return ret; -} - static void perf_event_free_filter(struct perf_event *event) { ftrace_profile_free_filter(event); @@ -7070,6 +7480,7 @@ static void perf_event_free_filter(struct perf_event *event) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { + bool is_kprobe, is_tracepoint; struct bpf_prog *prog; if (event->attr.type != PERF_TYPE_TRACEPOINT) @@ -7078,20 +7489,31 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) - /* bpf programs can only be attached to u/kprobes */ + is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; + is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; + if (!is_kprobe && !is_tracepoint) + /* bpf programs can only be attached to u/kprobe or tracepoint */ return -EINVAL; prog = bpf_prog_get(prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->type != BPF_PROG_TYPE_KPROBE) { + if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || + (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { /* valid fd, but invalid bpf program type */ bpf_prog_put(prog); return -EINVAL; } + if (is_tracepoint) { + int off = trace_event_get_offsets(event->tp_event); + + if (prog->aux->max_ctx_offset > off) { + bpf_prog_put(prog); + return -EACCES; + } + } event->tp_event->prog = prog; return 0; @@ -7107,7 +7529,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) 
prog = event->tp_event->prog; if (prog) { event->tp_event->prog = NULL; - bpf_prog_put(prog); + bpf_prog_put_rcu(prog); } } @@ -7117,11 +7539,6 @@ static inline void perf_tp_register(void) { } -static int perf_event_set_filter(struct perf_event *event, void __user *arg) -{ - return -ENOENT; -} - static void perf_event_free_filter(struct perf_event *event) { } @@ -7150,6 +7567,387 @@ void perf_bp_event(struct perf_event *bp, void *data) #endif /* + * Allocate a new address filter + */ +static struct perf_addr_filter * +perf_addr_filter_new(struct perf_event *event, struct list_head *filters) +{ + int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu); + struct perf_addr_filter *filter; + + filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node); + if (!filter) + return NULL; + + INIT_LIST_HEAD(&filter->entry); + list_add_tail(&filter->entry, filters); + + return filter; +} + +static void free_filters_list(struct list_head *filters) +{ + struct perf_addr_filter *filter, *iter; + + list_for_each_entry_safe(filter, iter, filters, entry) { + if (filter->inode) + iput(filter->inode); + list_del(&filter->entry); + kfree(filter); + } +} + +/* + * Free existing address filters and optionally install new ones + */ +static void perf_addr_filters_splice(struct perf_event *event, + struct list_head *head) +{ + unsigned long flags; + LIST_HEAD(list); + + if (!has_addr_filter(event)) + return; + + /* don't bother with children, they don't have their own filters */ + if (event->parent) + return; + + raw_spin_lock_irqsave(&event->addr_filters.lock, flags); + + list_splice_init(&event->addr_filters.list, &list); + if (head) + list_splice(head, &event->addr_filters.list); + + raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags); + + free_filters_list(&list); +} + +/* + * Scan through mm's vmas and see if one of them matches the + * @filter; if so, adjust filter's address range. + * Called with mm::mmap_sem down for reading. 
+ */ +static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct file *file = vma->vm_file; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (!file) + continue; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + continue; + + return vma->vm_start; + } + + return 0; +} + +/* + * Update event's address range filters based on the + * task's existing mappings, if any. + */ +static void perf_event_addr_filters_apply(struct perf_event *event) +{ + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + struct task_struct *task = READ_ONCE(event->ctx->task); + struct perf_addr_filter *filter; + struct mm_struct *mm = NULL; + unsigned int count = 0; + unsigned long flags; + + /* + * We may observe TASK_TOMBSTONE, which means that the event tear-down + * will stop on the parent's child_mutex that our caller is also holding + */ + if (task == TASK_TOMBSTONE) + return; + + mm = get_task_mm(event->ctx->task); + if (!mm) + goto restart; + + down_read(&mm->mmap_sem); + + raw_spin_lock_irqsave(&ifh->lock, flags); + list_for_each_entry(filter, &ifh->list, entry) { + event->addr_filters_offs[count] = 0; + + if (perf_addr_filter_needs_mmap(filter)) + event->addr_filters_offs[count] = + perf_addr_filter_apply(filter, mm); + + count++; + } + + event->addr_filters_gen++; + raw_spin_unlock_irqrestore(&ifh->lock, flags); + + up_read(&mm->mmap_sem); + + mmput(mm); + +restart: + perf_event_restart(event); +} + +/* + * Address range filtering: limiting the data to certain + * instruction address ranges. Filters are ioctl()ed to us from + * userspace as ascii strings. 
+ * + * Filter string format: + * + * ACTION RANGE_SPEC + * where ACTION is one of the + * * "filter": limit the trace to this region + * * "start": start tracing from this address + * * "stop": stop tracing at this address/region; + * RANGE_SPEC is + * * for kernel addresses: <start address>[/<size>] + * * for object files: <start address>[/<size>]@</path/to/object/file> + * + * if <size> is not specified, the range is treated as a single address. + */ +enum { + IF_ACT_FILTER, + IF_ACT_START, + IF_ACT_STOP, + IF_SRC_FILE, + IF_SRC_KERNEL, + IF_SRC_FILEADDR, + IF_SRC_KERNELADDR, +}; + +enum { + IF_STATE_ACTION = 0, + IF_STATE_SOURCE, + IF_STATE_END, +}; + +static const match_table_t if_tokens = { + { IF_ACT_FILTER, "filter" }, + { IF_ACT_START, "start" }, + { IF_ACT_STOP, "stop" }, + { IF_SRC_FILE, "%u/%u@%s" }, + { IF_SRC_KERNEL, "%u/%u" }, + { IF_SRC_FILEADDR, "%u@%s" }, + { IF_SRC_KERNELADDR, "%u" }, +}; + +/* + * Address filter string parser + */ +static int +perf_event_parse_addr_filter(struct perf_event *event, char *fstr, + struct list_head *filters) +{ + struct perf_addr_filter *filter = NULL; + char *start, *orig, *filename = NULL; + struct path path; + substring_t args[MAX_OPT_ARGS]; + int state = IF_STATE_ACTION, token; + unsigned int kernel = 0; + int ret = -EINVAL; + + orig = fstr = kstrdup(fstr, GFP_KERNEL); + if (!fstr) + return -ENOMEM; + + while ((start = strsep(&fstr, " ,\n")) != NULL) { + ret = -EINVAL; + + if (!*start) + continue; + + /* filter definition begins */ + if (state == IF_STATE_ACTION) { + filter = perf_addr_filter_new(event, filters); + if (!filter) + goto fail; + } + + token = match_token(start, if_tokens, args); + switch (token) { + case IF_ACT_FILTER: + case IF_ACT_START: + filter->filter = 1; + + case IF_ACT_STOP: + if (state != IF_STATE_ACTION) + goto fail; + + state = IF_STATE_SOURCE; + break; + + case IF_SRC_KERNELADDR: + case IF_SRC_KERNEL: + kernel = 1; + + case IF_SRC_FILEADDR: + case IF_SRC_FILE: + if (state != 
IF_STATE_SOURCE) + goto fail; + + if (token == IF_SRC_FILE || token == IF_SRC_KERNEL) + filter->range = 1; + + *args[0].to = 0; + ret = kstrtoul(args[0].from, 0, &filter->offset); + if (ret) + goto fail; + + if (filter->range) { + *args[1].to = 0; + ret = kstrtoul(args[1].from, 0, &filter->size); + if (ret) + goto fail; + } + + if (token == IF_SRC_FILE) { + filename = match_strdup(&args[2]); + if (!filename) { + ret = -ENOMEM; + goto fail; + } + } + + state = IF_STATE_END; + break; + + default: + goto fail; + } + + /* + * Filter definition is fully parsed, validate and install it. + * Make sure that it doesn't contradict itself or the event's + * attribute. + */ + if (state == IF_STATE_END) { + if (kernel && event->attr.exclude_kernel) + goto fail; + + if (!kernel) { + if (!filename) + goto fail; + + /* look up the path and grab its inode */ + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) + goto fail_free_name; + + filter->inode = igrab(d_inode(path.dentry)); + path_put(&path); + kfree(filename); + filename = NULL; + + ret = -EINVAL; + if (!filter->inode || + !S_ISREG(filter->inode->i_mode)) + /* free_filters_list() will iput() */ + goto fail; + } + + /* ready to consume more filters */ + state = IF_STATE_ACTION; + filter = NULL; + } + } + + if (state != IF_STATE_ACTION) + goto fail; + + kfree(orig); + + return 0; + +fail_free_name: + kfree(filename); +fail: + free_filters_list(filters); + kfree(orig); + + return ret; +} + +static int +perf_event_set_addr_filter(struct perf_event *event, char *filter_str) +{ + LIST_HEAD(filters); + int ret; + + /* + * Since this is called in perf_ioctl() path, we're already holding + * ctx::mutex. 
+ */ + lockdep_assert_held(&event->ctx->mutex); + + if (WARN_ON_ONCE(event->parent)) + return -EINVAL; + + /* + * For now, we only support filtering in per-task events; doing so + * for CPU-wide events requires additional context switching trickery, + * since same object code will be mapped at different virtual + * addresses in different processes. + */ + if (!event->ctx->task) + return -EOPNOTSUPP; + + ret = perf_event_parse_addr_filter(event, filter_str, &filters); + if (ret) + return ret; + + ret = event->pmu->addr_filters_validate(&filters); + if (ret) { + free_filters_list(&filters); + return ret; + } + + /* remove existing filters, if any */ + perf_addr_filters_splice(event, &filters); + + /* install new filters */ + perf_event_for_each_child(event, perf_event_addr_filters_apply); + + return ret; +} + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret = -EINVAL; + + if ((event->attr.type != PERF_TYPE_TRACEPOINT || + !IS_ENABLED(CONFIG_EVENT_TRACING)) && + !has_addr_filter(event)) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + if (IS_ENABLED(CONFIG_EVENT_TRACING) && + event->attr.type == PERF_TYPE_TRACEPOINT) + ret = ftrace_profile_set_filter(event, event->attr.config, + filter_str); + else if (has_addr_filter(event)) + ret = perf_event_set_addr_filter(event, filter_str); + + kfree(filter_str); + return ret; +} + +/* * hrtimer based swevent callback */ @@ -7506,6 +8304,20 @@ static void free_pmu_context(struct pmu *pmu) out: mutex_unlock(&pmus_lock); } + +/* + * Let userspace know that this PMU supports address range filtering: + */ +static ssize_t nr_addr_filters_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); +} +DEVICE_ATTR_RO(nr_addr_filters); + static struct idr pmu_idr; static 
ssize_t @@ -7607,9 +8419,19 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto free_dev; + /* For PMUs with address filters, throw in an extra attribute: */ + if (pmu->nr_addr_filters) + ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters); + + if (ret) + goto del_dev; + out: return ret; +del_dev: + device_del(pmu->dev); + free_dev: put_device(pmu->dev); goto out; @@ -7649,6 +8471,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: + if (pmu->task_ctx_nr == perf_hw_context) { + static int hw_context_taken = 0; + + /* + * Other than systems with heterogeneous CPUs, it never makes + * sense for two PMUs to share perf_hw_context. PMUs which are + * uncore must use perf_invalid_context. + */ + if (WARN_ON_ONCE(hw_context_taken && + !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) + pmu->task_ctx_nr = perf_invalid_context; + + hw_context_taken = 1; + } + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; @@ -7736,6 +8573,8 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); device_del(pmu->dev); put_device(pmu->dev); free_pmu_context(pmu); @@ -7816,6 +8655,27 @@ static void account_event_cpu(struct perf_event *event, int cpu) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } +/* Freq events need the tick to stay alive (see perf_event_task_tick). 
*/ +static void account_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + /* Lock so we don't race with concurrent unaccount */ + spin_lock(&nr_freq_lock); + if (atomic_inc_return(&nr_freq_events) == 1) + tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void account_freq_event(void) +{ + if (tick_nohz_full_enabled()) + account_freq_event_nohz(); + else + atomic_inc(&nr_freq_events); +} + + static void account_event(struct perf_event *event) { bool inc = false; @@ -7831,10 +8691,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); - if (event->attr.freq) { - if (atomic_inc_return(&nr_freq_events) == 1) - tick_nohz_full_kick_all(); - } + if (event->attr.freq) + account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; @@ -7910,6 +8768,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); INIT_LIST_HEAD(&event->active_entry); + INIT_LIST_HEAD(&event->addr_filters.list); INIT_HLIST_NODE(&event->hlist_entry); @@ -7917,6 +8776,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); + raw_spin_lock_init(&event->addr_filters.lock); atomic_long_set(&event->refcount, 1); event->cpu = cpu; @@ -7951,8 +8811,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, context = parent_event->overflow_handler_context; } - event->overflow_handler = overflow_handler; - event->overflow_handler_context = context; + if (overflow_handler) { + event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; + } else if (is_write_backward(event)){ + event->overflow_handler = perf_event_output_backward; + event->overflow_handler_context = NULL; + } else { + event->overflow_handler = perf_event_output_forward; + 
event->overflow_handler_context = NULL; + } perf_event__state_init(event); @@ -7993,16 +8861,33 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) goto err_pmu; + if (has_addr_filter(event)) { + event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, + sizeof(unsigned long), + GFP_KERNEL); + if (!event->addr_filters_offs) + goto err_per_task; + + /* force hw sync on the address filters */ + event->addr_filters_gen = 1; + } + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_per_task; + goto err_addr_filters; } } + /* symmetric to unaccount_event() in _free_event() */ + account_event(event); + return event; +err_addr_filters: + kfree(event->addr_filters_offs); + err_per_task: exclusive_event_destroy(event); @@ -8182,6 +9067,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto out; /* + * Either writing ring buffer from beginning or from end. + * Mixing is not allowed. + */ + if (is_write_backward(output_event) != is_write_backward(event)) + goto out; + + /* * If both events generate aux data, they must be on the same PMU */ if (has_aux(event) && has_aux(output_event) && @@ -8347,6 +9239,24 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (task) { + err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + if (err) + goto err_cpus; + + /* + * Reuse ptrace permission checks for now. + * + * We must hold cred_guard_mutex across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). 
+ */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + goto err_cred; + } + if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; @@ -8354,7 +9264,7 @@ SYSCALL_DEFINE5(perf_event_open, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_cpus; + goto err_cred; } if (is_sampling_event(event)) { @@ -8364,8 +9274,6 @@ SYSCALL_DEFINE5(perf_event_open, } } - account_event(event); - /* * Special case software events and allow them to be part of * any hardware group. @@ -8415,11 +9323,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } - if (task) { - put_task_struct(task); - task = NULL; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -8478,6 +9381,7 @@ SYSCALL_DEFINE5(perf_event_open, f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); + event_file = NULL; goto err_context; } @@ -8516,6 +9420,11 @@ SYSCALL_DEFINE5(perf_event_open, WARN_ON_ONCE(ctx->parent_ctx); + /* + * This is the point on no return; we cannot fail hereafter. This is + * where we start modifying current state. + */ + if (move_group) { /* * See perf_event_ctx_lock() for comments on the details @@ -8587,6 +9496,11 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&gctx->mutex); mutex_unlock(&ctx->mutex); + if (task) { + mutex_unlock(&task->signal->cred_guard_mutex); + put_task_struct(task); + } + put_online_cpus(); mutex_lock(&current->perf_event_mutex); @@ -8619,6 +9533,9 @@ err_alloc: */ if (!event_file) free_event(event); +err_cred: + if (task) + mutex_unlock(&task->signal->cred_guard_mutex); err_cpus: put_online_cpus(); err_task: @@ -8662,8 +9579,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. 
*/ event->owner = TASK_TOMBSTONE; - account_event(event); - ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); @@ -8905,6 +9820,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. + * + * Can be called with cred_guard_mutex held when called from + * install_exec_creds(). */ void perf_event_exit_task(struct task_struct *child) { @@ -9397,10 +10315,29 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + /* + * This must be done before the CPU comes alive, because the + * moment we can run tasks we can encounter (software) events. + * + * Specifically, someone can have inherited events on kthreadd + * or a pre-existing worker thread that gets re-bound. + */ perf_event_init_cpu(cpu); break; case CPU_DOWN_PREPARE: + /* + * This must be done before the CPU dies because after that an + * active event might want to IPI the CPU and that'll not work + * so great for dead CPUs. + * + * XXX smp_call_function_single() return -ENXIO without a warn + * so we could possibly deal with this. + * + * This is safe against new events arriving because + * sys_perf_event_open() serializes against hotplug using + * get_online_cpus(). 
+ */ perf_event_exit_cpu(cpu); break; default: @@ -9447,6 +10384,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, return 0; } +EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2bbad9c1274c..05f9f6d626df 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,13 +11,13 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; - struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ int overwrite; /* can overwrite itself */ + int paused; /* can write into ring buffer */ atomic_t poll; /* POLL_ for wakeups */ @@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } +static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause) +{ + if (!pause && rb->nr_pages) + rb->paused = 0; + else + rb->paused = 1; +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); @@ -182,8 +190,6 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); -extern int get_callchain_buffers(void); -extern void put_callchain_buffers(void); static inline int get_recursion_context(int *recursion) { diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 1faad2cfdb9e..ae9b90dc9a5a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -102,8 +102,21 @@ out: preempt_enable(); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) +static bool __always_inline +ring_buffer_has_space(unsigned long head, unsigned long tail, + unsigned long data_size, 
unsigned int size, + bool backward) +{ + if (!backward) + return CIRC_SPACE(head, tail, data_size) >= size; + else + return CIRC_SPACE(tail, head, data_size) >= size; +} + +static int __always_inline +__perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + bool backward) { struct ring_buffer *rb; unsigned long tail, offset, head; @@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (unlikely(!rb)) goto out; - if (unlikely(!rb->nr_pages)) + if (unlikely(rb->paused)) { + if (rb->nr_pages) + local_inc(&rb->lost); goto out; + } handle->rb = rb; handle->event = event; @@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle, do { tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); - if (!rb->overwrite && - unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) - goto fail; + if (!rb->overwrite) { + if (unlikely(!ring_buffer_has_space(head, tail, + perf_data_size(rb), + size, backward))) + goto fail; + } /* * The above forms a control dependency barrier separating the @@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle, * See perf_output_put_handle(). */ - head += size; + if (!backward) + head += size; + else + head -= size; } while (local_cmpxchg(&rb->head, offset, head) != offset); + if (backward) { + offset = head; + head = (u64)(-head); + } + /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. 
@@ -203,6 +230,26 @@ out: return -ENOSPC; } +int perf_output_begin_forward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, false); +} + +int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + return __perf_output_begin(handle, event, size, true); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size) +{ + + return __perf_output_begin(handle, event, size, + unlikely(is_write_backward(event))); +} + unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { @@ -221,8 +268,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void rb_irq_work(struct irq_work *work); - static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -243,16 +288,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); - init_irq_work(&rb->irq_work, rb_irq_work); -} -static void ring_buffer_put_async(struct ring_buffer *rb) -{ - if (!atomic_dec_and_test(&rb->refcount)) - return; - - rb->rcu_head.next = (void *)rb; - irq_work_queue(&rb->irq_work); + /* + * perf_output_begin() only checks rb->paused, therefore + * rb->paused must be true if we have no pages for output. + */ + if (!rb->nr_pages) + rb->paused = 1; } /* @@ -264,6 +306,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb) * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. + * + * Call this from pmu::start(); see the comment in perf_aux_output_end() + * about its use in pmu callbacks. Both can also be called from the PMI + * handler if needed. 
*/ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) @@ -288,6 +334,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, goto err; /* + * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), + * the aux buffer is in perf_mmap_close(), about to get freed. + */ + if (!atomic_read(&rb->aux_mmap_count)) + goto err_put; + + /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ @@ -328,10 +381,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, return handle->rb->aux_priv; err_put: + /* can't be last */ rb_free_aux(rb); err: - ring_buffer_put_async(rb); + ring_buffer_put(rb); handle->event = NULL; return NULL; @@ -342,11 +396,16 @@ err: * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. + * + * Note: this has to be called from pmu::stop() callback, as the assumption + * of the AUX buffer management code is that after pmu::stop(), the AUX + * transaction must be stopped and therefore drop the AUX reference count. 
*/ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) { struct ring_buffer *rb = handle->rb; + bool wakeup = truncated; unsigned long aux_head; u64 flags = 0; @@ -375,14 +434,22 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { - perf_output_wakeup(handle); + wakeup = true; local_add(rb->aux_watermark, &rb->aux_wakeup); } + + if (wakeup) { + if (truncated) + handle->event->pending_disable = 1; + perf_output_wakeup(handle); + } + handle->event = NULL; local_set(&rb->aux_nest, 0); + /* can't be last */ rb_free_aux(rb); - ring_buffer_put_async(rb); + ring_buffer_put(rb); } /* @@ -463,6 +530,14 @@ static void __rb_free_aux(struct ring_buffer *rb) { int pg; + /* + * Should never happen, the last reference should be dropped from + * perf_mmap_close() path, which first stops aux transactions (which + * in turn are the atomic holders of aux_refcount) and then does the + * last rb_free_aux(). 
+ */ + WARN_ON_ONCE(in_atomic()); + if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; @@ -574,18 +649,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) - irq_work_queue(&rb->irq_work); -} - -static void rb_irq_work(struct irq_work *work) -{ - struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); - - if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); - - if (rb->rcu_head.next == (void *)rb) - call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC @@ -746,8 +810,10 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; - rb->page_order = ilog2(nr_pages); - rb->nr_pages = !!nr_pages; + if (nr_pages) { + rb->nr_pages = 1; + rb->page_order = ilog2(nr_pages); + } ring_buffer_init(rb, watermark, flags); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0167679182c0..b7a525ab2083 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -299,7 +299,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); if (ret <= 0) return ret; @@ -321,7 +321,7 @@ retry: copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - page_cache_release(new_page); + put_page(new_page); put_old: put_page(old_page); @@ -539,14 +539,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, * see uprobe_register(). 
*/ if (mapping->a_ops->readpage) - page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else - page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT); + page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); copy_from_page(page, offset, insn, nbytes); - page_cache_release(page); + put_page(page); return 0; } @@ -1130,7 +1130,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; @@ -1178,6 +1180,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) goto free_area; area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.fault = NULL; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) @@ -1468,7 +1471,8 @@ static void dup_xol_work(struct callback_head *work) if (current->flags & PF_EXITING) return; - if (!__create_xol_area(current->utask->dup_xol_addr)) + if (!__create_xol_area(current->utask->dup_xol_addr) && + !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } @@ -1693,14 +1697,19 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) int result; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) goto out; - result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + /* + * The NULL 'tsk' here ensures that any faults that occur here + * will not be accounted to the task. 'mm' *is* current->mm, + * but we treat this as a 'remote' access since it is + * essentially a kernel access to the memory. 
+ */ + result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (result < 0) return result; diff --git a/kernel/exit.c b/kernel/exit.c index 10e088237fed..9e6e1356e6bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,6 +53,7 @@ #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> +#include <linux/kcov.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -434,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(); + exit_oom_victim(tsk); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -655,6 +656,7 @@ void do_exit(long code) TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); + kcov_task_exit(tsk); WARN_ON(blk_needs_flush_plug(tsk)); @@ -744,7 +746,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent @@ -916,17 +918,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p) task_pid_type(p, wo->wo_type) == wo->wo_pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int +eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) + + /* + * Wait for all children (clone and not) if __WALL is set or + * if it is traced by us. 
+ */ + if (ptrace || (wo->wo_flags & __WALL)) + return 1; + + /* + * Otherwise, wait for clone children *only* if __WCLONE is set; + * otherwise, wait for non-clone children *only*. + * + * Note: a "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD, or a non-leader thread which + * we can only see if it is traced by us. + */ + if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; @@ -1298,7 +1311,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (unlikely(exit_state == EXIT_DEAD)) return 0; - ret = eligible_child(wo, p); + ret = eligible_child(wo, ptrace, p); if (!ret) return ret; @@ -1522,7 +1535,8 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, enum pid_type type; long ret; - if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..4a7ec0c6c88c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,6 +75,7 @@ #include <linux/aio.h> #include <linux/compiler.h> #include <linux/sysctl.h> +#include <linux/kcov.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -147,49 +148,57 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. 
*/ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (page) + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + 1 << THREAD_SIZE_ORDER); + return page ? page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(stack); + + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + -(1 << THREAD_SIZE_ORDER)); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_thread_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -212,9 +221,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - 
struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -222,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -331,26 +340,27 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ } -static struct task_struct *dup_task_struct(struct task_struct *orig) +static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; - int node = tsk_fork_get_node(orig); + unsigned long *stack; int err; + if (node == NUMA_NO_NODE) + node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -382,12 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); + + kcov_task_init(tsk); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: free_task_struct(tsk); return NULL; @@ -402,7 +414,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) unsigned long charge; uprobe_start_dup_mmap(); - down_write(&oldmm->mmap_sem); + if 
(down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* @@ -514,6 +529,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); +fail_uprobe_end: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: @@ -688,6 +704,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. */ @@ -695,24 +731,26 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +#ifdef CONFIG_MMU +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); +#endif /** * set_mm_exe_file - change a reference to the mm's executable file @@ -1245,7 +1283,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, int __user 
*child_tidptr, struct pid *pid, int trace, - unsigned long tls) + unsigned long tls, + int node) { int retval; struct task_struct *p; @@ -1297,7 +1336,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; retval = -ENOMEM; - p = dup_task_struct(current); + p = dup_task_struct(current, node); if (!p) goto fork_out; @@ -1459,7 +1498,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { retval = PTR_ERR(pid); - goto bad_fork_cleanup_io; + goto bad_fork_cleanup_thread; } } @@ -1483,7 +1522,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) - p->sas_ss_sp = p->sas_ss_size = 0; + sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the @@ -1621,6 +1660,8 @@ bad_fork_cancel_cgroup: bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); +bad_fork_cleanup_thread: + exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); @@ -1673,7 +1714,8 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1718,7 +1760,7 @@ long _do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace, tls); + child_tidptr, NULL, trace, tls, NUMA_NO_NODE); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. 
@@ -1884,7 +1926,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/futex.c b/kernel/futex.c index 5d6ce6413ef1..33664f70e2d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -124,16 +124,16 @@ * futex_wait(futex, val); * * waiters++; (a) - * mb(); (A) <-- paired with -. - * | - * lock(hash_bucket(futex)); | - * | - * uval = *futex; | - * | *futex = newval; - * | sys_futex(WAKE, futex); - * | futex_wake(futex); - * | - * `-------> mb(); (B) + * smp_mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); @@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key) /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon - * as full barrier (B), see the ordering comment above. + * as smp_mb(); (B), see the ordering comment above. */ smp_mb__after_atomic(); } @@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies MB (B) */ + ihold(key->shared.inode); /* implies smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies MB (B) */ + futex_get_mm(key); /* implies smp_mb(); (B) */ break; default: /* @@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key) * mm, therefore the only purpose of calling get_futex_key_refs * is because we need the barrier for the lockless waiter check. 
*/ - smp_mb(); /* explicit MB (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ } } @@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *tail; struct address_space *mapping; int err, ro = 0; @@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies MB (B) */ + get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -520,7 +520,28 @@ again: else err = 0; - lock_page(page); + /* + * The treatment of mapping from this point on is critical. The page + * lock protects many things but in this context the page lock + * stabilizes mapping, prevents inode freeing in the shared + * file-backed region case and guards against movement to swap cache. + * + * Strictly speaking the page lock is not needed in all cases being + * considered here and page lock forces unnecessarily serialization + * From this point on, mapping will be re-verified if necessary and + * page lock will be acquired only if it is unavoidable + * + * Mapping checks require the head page for any compound page so the + * head page and mapping is looked up now. For anonymous pages, it + * does not matter if the page splits in the future as the key is + * based on the address. For filesystem-backed pages, the tail is + * required as the index of the page determines the key. For + * base pages, there is no tail page and tail == page. 
+ */ + tail = page; + page = compound_head(page); + mapping = READ_ONCE(page->mapping); + /* * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or @@ -536,19 +557,31 @@ again: * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for page->mapping. */ - mapping = compound_head(page)->mapping; - if (!mapping) { - int shmem_swizzled = PageSwapCache(page); + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; unlock_page(page); put_page(page); + if (shmem_swizzled) goto again; + return -EFAULT; } /* * Private mappings are handled in a simple way. * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. @@ -566,16 +599,74 @@ again: key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; + + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. 
+ */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* + * Take a reference unless it is about to be freed. Previously + * this reference was taken by ihold under the page lock + * pinning the inode in place so i_lock was unnecessary. The + * only way for this check to fail is if the inode was + * truncated in parallel so warn for now if this happens. + * + * We are not calling into get_futex_key_refs() in file-backed + * cases, therefore a successful atomic_inc return below will + * guarantee that get_futex_key() will still imply smp_mb(); (B). + */ + if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* Should be impossible but lets be paranoid for now */ + if (WARN_ON_ONCE(inode->i_mapping != mapping)) { + err = -EFAULT; + rcu_read_unlock(); + iput(inode); + + goto out; + } + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = mapping->host; - key->shared.pgoff = basepage_index(page); + key->shared.inode = inode; + key->shared.pgoff = basepage_index(tail); + rcu_read_unlock(); } - get_futex_key_refs(key); /* implies MB (B) */ - out: - unlock_page(page); put_page(page); return err; } @@ -646,7 +737,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) int ret; pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); + ret = __get_user(*dest, from); pagefault_enable(); return ret ? 
-EFAULT : 0; @@ -1212,10 +1303,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; + } else if (curval != uval) { + /* + * If a unconditional UNLOCK_PI operation (user space did not + * try the TID->0 transition) raced with a waiter setting the + * FUTEX_WAITERS flag between get_user() and locking the hash + * bucket lock, retry the operation. + */ + if ((FUTEX_TID_MASK & curval) == uval) + ret = -EAGAIN; + else + ret = -EINVAL; + } if (ret) { raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; @@ -1442,8 +1543,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); hb_waiters_dec(hb1); - plist_add(&q->list, &hb2->chain); hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); @@ -1864,7 +1965,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies MB (A) */ + spin_lock(&hb->lock); /* implies smp_mb(); (A) */ return hb; } @@ -1927,8 +2028,12 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: - lock_ptr = q->lock_ptr; - barrier(); + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* @@ -2536,6 +2641,15 @@ retry: if (ret == -EFAULT) goto pi_faulted; /* + * A unconditional UNLOCK_PI op raced against a waiter + * setting the FUTEX_WAITERS bit. Try again. 
+ */ + if (ret == -EAGAIN) { + spin_unlock(&hb->lock); + put_futex_key(&key); + goto retry; + } + /* * wake_futex_pi has detected invalid state. Tell user * space. */ diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index c92e44855ddd..1276aabaab55 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL config GCOV_PROFILE_ALL bool "Profile entire Kernel" + depends on !COMPILE_TEST depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n diff --git a/kernel/hung_task.c b/kernel/hung_task.c index e0f90c2b57aa..d234022805dc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -185,10 +185,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); } -static unsigned long timeout_jiffies(unsigned long timeout) +static long hung_timeout_jiffies(unsigned long last_checked, + unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; + return timeout ? 
last_checked - jiffies + timeout * HZ : + MAX_SCHEDULE_TIMEOUT; } /* @@ -224,18 +226,21 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector); */ static int watchdog(void *dummy) { + unsigned long hung_last_checked = jiffies; + set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; + long t = hung_timeout_jiffies(hung_last_checked, timeout); - while (schedule_timeout_interruptible(timeout_jiffies(timeout))) - timeout = sysctl_hung_task_timeout_secs; - - if (atomic_xchg(&reset_hung_task, 0)) + if (t <= 0) { + if (!atomic_xchg(&reset_hung_task, 0)) + check_hung_uninterruptible_tasks(timeout); + hung_last_checked = jiffies; continue; - - check_hung_uninterruptible_tasks(timeout); + } + schedule_timeout_interruptible(t); } return 0; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab80164..3bbfd6a9c475 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Generic IRQ IPI support +config GENERIC_IRQ_IPI + bool + # Generic MSI interrupt support config GENERIC_MSI_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2fc9cbdf35b6..2ee42e95a3ce 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o +obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5797909f4e5b..2f9f2b0e79f2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt @@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_unmask(data); } 
+EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt @@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_eoi(data); } +EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt @@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 57bff7857e87..a15b5485b446 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,10 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; - struct irqaction *action = desc->action; + struct irqaction *action; - /* action might have become NULL since we dropped the lock */ - while (action) { + for_each_action_of_desc(desc, action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +172,6 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) } retval |= res; - action = action->next; } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c66905..09be2c903c6d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) +#define for_each_action_of_desc(desc, act) \ + for (act = desc->act; act; act = act->next) + struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); @@ -160,6 +163,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) __irq_put_desc_unlock(desc, flags, false); } 
+#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) + /* * Manipulation functions for irq_data.state */ @@ -188,6 +193,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +#undef __irqd_to_state + static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c new file mode 100644 index 000000000000..89b49f6773f0 --- /dev/null +++ b/kernel/irq/ipi.c @@ -0,0 +1,340 @@ +/* + * linux/kernel/irq/ipi.c + * + * Copyright (C) 2015 Imagination Technologies Ltd + * Author: Qais Yousef <qais.yousef@imgtec.com> + * + * This file contains driver APIs to the IPI subsystem. + */ + +#define pr_fmt(fmt) "genirq/ipi: " fmt + +#include <linux/irqdomain.h> +#include <linux/irq.h> + +/** + * irq_reserve_ipi() - Setup an IPI to destination cpumask + * @domain: IPI domain + * @dest: cpumask of cpus which can receive the IPI + * + * Allocate a virq that can be used to send IPI to any CPU in dest mask. + * + * On success it'll return linux irq number and error code on failure + */ +int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest) +{ + unsigned int nr_irqs, offset; + struct irq_data *data; + int virq, i; + + if (!domain ||!irq_domain_is_ipi(domain)) { + pr_warn("Reservation on a non IPI domain\n"); + return -EINVAL; + } + + if (!cpumask_subset(dest, cpu_possible_mask)) { + pr_warn("Reservation is not in possible_cpu_mask\n"); + return -EINVAL; + } + + nr_irqs = cpumask_weight(dest); + if (!nr_irqs) { + pr_warn("Reservation for empty destination mask\n"); + return -EINVAL; + } + + if (irq_domain_is_ipi_single(domain)) { + /* + * If the underlying implementation uses a single HW irq on + * all cpus then we only need a single Linux irq number for + * it. We have no restrictions vs. the destination mask. The + * underlying implementation can deal with holes nicely. 
+ */ + nr_irqs = 1; + offset = 0; + } else { + unsigned int next; + + /* + * The IPI requires a seperate HW irq on each CPU. We require + * that the destination mask is consecutive. If an + * implementation needs to support holes, it can reserve + * several IPI ranges. + */ + offset = cpumask_first(dest); + /* + * Find a hole and if found look for another set bit after the + * hole. For now we don't support this scenario. + */ + next = cpumask_next_zero(offset, dest); + if (next < nr_cpu_ids) + next = cpumask_next(next, dest); + if (next < nr_cpu_ids) { + pr_warn("Destination mask has holes\n"); + return -EINVAL; + } + } + + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc descs\n"); + return -ENOMEM; + } + + virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, + (void *) dest, true); + + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); + goto free_descs; + } + + for (i = 0; i < nr_irqs; i++) { + data = irq_get_irq_data(virq + i); + cpumask_copy(data->common->affinity, dest); + data->common->ipi_offset = offset; + irq_set_status_flags(virq + i, IRQ_NO_BALANCING); + } + return virq; + +free_descs: + irq_free_descs(virq, nr_irqs); + return -EBUSY; +} + +/** + * irq_destroy_ipi() - unreserve an IPI that was previously allocated + * @irq: linux irq number to be destroyed + * @dest: cpumask of cpus which should have the IPI removed + * + * The IPIs allocated with irq_reserve_ipi() are retuerned to the system + * destroying all virqs associated with them. + * + * Return 0 on success or error code on failure. + */ +int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? 
irq_data_get_affinity_mask(data) : NULL; + struct irq_domain *domain; + unsigned int nr_irqs; + + if (!irq || !data || !ipimask) + return -EINVAL; + + domain = data->domain; + if (WARN_ON(domain == NULL)) + return -EINVAL; + + if (!irq_domain_is_ipi(domain)) { + pr_warn("Trying to destroy a non IPI domain!\n"); + return -EINVAL; + } + + if (WARN_ON(!cpumask_subset(dest, ipimask))) + /* + * Must be destroying a subset of CPUs to which this IPI + * was set up to target + */ + return -EINVAL; + + if (irq_domain_is_ipi_per_cpu(domain)) { + irq = irq + cpumask_first(dest) - data->common->ipi_offset; + nr_irqs = cpumask_weight(dest); + } else { + nr_irqs = 1; + } + + irq_domain_free_irqs(irq, nr_irqs); + return 0; +} + +/** + * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu + * @irq: linux irq number + * @cpu: the target cpu + * + * When dealing with coprocessors IPI, we need to inform the coprocessor of + * the hwirq it needs to use to receive and send IPIs. + * + * Returns hwirq value on success and INVALID_HWIRQ on failure. + */ +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + + if (!data || !ipimask || cpu > nr_cpu_ids) + return INVALID_HWIRQ; + + if (!cpumask_test_cpu(cpu, ipimask)) + return INVALID_HWIRQ; + + /* + * Get the real hardware irq number if the underlying implementation + * uses a seperate irq per cpu. If the underlying implementation uses + * a single hardware irq for all cpus then the IPI send mechanism + * needs to take care of the cpu destinations. + */ + if (irq_domain_is_ipi_per_cpu(data->domain)) + data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); + + return data ? 
irqd_to_hwirq(data) : INVALID_HWIRQ; +} +EXPORT_SYMBOL_GPL(ipi_get_hwirq); + +static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, + const struct cpumask *dest, unsigned int cpu) +{ + struct cpumask *ipimask = irq_data_get_affinity_mask(data); + + if (!chip || !ipimask) + return -EINVAL; + + if (!chip->ipi_send_single && !chip->ipi_send_mask) + return -EINVAL; + + if (cpu > nr_cpu_ids) + return -EINVAL; + + if (dest) { + if (!cpumask_subset(dest, ipimask)) + return -EINVAL; + } else { + if (!cpumask_test_cpu(cpu, ipimask)) + return -EINVAL; + } + return 0; +} + +/** + * __ipi_send_single - send an IPI to a target Linux SMP CPU + * @desc: pointer to irq_desc of the IRQ + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. 
+ */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; +#endif + if (!chip->ipi_send_single) { + chip->ipi_send_mask(data, cpumask_of(cpu)); + return 0; + } + + /* FIXME: Store this information in irqdata flags */ + if (irq_domain_is_ipi_per_cpu(data->domain) && + cpu != data->common->ipi_offset) { + /* use the correct data for that cpu */ + unsigned irq = data->irq + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + } + chip->ipi_send_single(data, cpu); + return 0; +} + +/** + * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * @desc: pointer to irq_desc of the IRQ + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + unsigned int cpu; + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. 
+ */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; +#endif + if (chip->ipi_send_mask) { + chip->ipi_send_mask(data, dest); + return 0; + } + + if (irq_domain_is_ipi_per_cpu(data->domain)) { + unsigned int base = data->irq; + + for_each_cpu(cpu, dest) { + unsigned irq = base + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + chip->ipi_send_single(data, cpu); + } + } else { + for_each_cpu(cpu, dest) + chip->ipi_send_single(data, cpu); + } + return 0; +} + +/** + * ipi_send_single - Send an IPI to a single CPU + * @virq: linux irq number from irq_reserve_ipi() + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_single(unsigned int virq, unsigned int cpu) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; + + return __ipi_send_single(desc, cpu); +} +EXPORT_SYMBOL_GPL(ipi_send_single); + +/** + * ipi_send_mask - Send an IPI to target CPU(s) + * @virq: linux irq number from irq_reserve_ipi() + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_mask(unsigned int virq, const struct cpumask *dest) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? 
irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; + + return __ipi_send_mask(desc, dest); +} +EXPORT_SYMBOL_GPL(ipi_send_mask); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0409da0bcc33..8731e1c5d1e7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) @@ -578,7 +595,8 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } -int irq_set_percpu_devid(unsigned int irq) +int irq_set_percpu_devid_partition(unsigned int irq, + const struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); @@ -593,10 +611,33 @@ int irq_set_percpu_devid(unsigned int irq) if (!desc->percpu_enabled) return -ENOMEM; + if (affinity) + desc->percpu_affinity = affinity; + else + desc->percpu_affinity = cpu_possible_mask; + irq_set_percpu_devid_flags(irq); return 0; } +int irq_set_percpu_devid(unsigned int irq) +{ + return irq_set_percpu_devid_partition(irq, NULL); +} + +int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) 
+{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !desc->percpu_enabled) + return -EINVAL; + + if (affinity) + cpumask_copy(affinity, desc->percpu_affinity); + + return 0; +} + void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3e56d2f03e24..8798b6c9e945 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; -static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -141,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. 
- */ - WARN_ON(domain->revmap_tree.height); + WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); @@ -245,14 +238,15 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_find_matching_fwnode() - Locates a domain for a given fwnode - * @fwnode: FW descriptor of the interrupt controller + * irq_find_matching_fwspec() - Locates a domain for a given fwspec + * @fwspec: FW specifier for an interrupt * @bus_token: domain-specific data */ -struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, +struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; + struct fwnode_handle *fwnode = fwspec->fwnode; int rc; /* We might want to match the legacy controller last since @@ -266,7 +260,9 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { - if (h->ops->match) + if (h->ops->select && fwspec->param_count) + rc = h->ops->select(h, fwspec, bus_token); + else if (h->ops->match) rc = h->ops->match(h, to_of_node(fwnode), bus_token); else rc = ((fwnode != NULL) && (h->fwnode == fwnode) && @@ -281,7 +277,7 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, mutex_unlock(&irq_domain_mutex); return found; } -EXPORT_SYMBOL_GPL(irq_find_matching_fwnode); +EXPORT_SYMBOL_GPL(irq_find_matching_fwspec); /** * irq_set_default_host() - Set a "default" irq domain @@ -576,11 +572,9 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) int virq; if (fwspec->fwnode) { - domain = irq_find_matching_fwnode(fwspec->fwnode, - DOMAIN_BUS_WIRED); + domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED); if (!domain) - domain = irq_find_matching_fwnode(fwspec->fwnode, - DOMAIN_BUS_ANY); + domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY); } else { 
domain = irq_default_domain; } @@ -840,8 +834,8 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -static int irq_domain_alloc_descs(int virq, unsigned int cnt, - irq_hw_number_t hwirq, int node) +int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, + int node) { unsigned int hint; @@ -895,6 +889,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, return domain; } +EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); static void irq_domain_insert_irq(int virq) { @@ -1045,6 +1040,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return 0; } +EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain @@ -1078,6 +1074,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent @@ -1098,6 +1095,7 @@ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, } irq_domain_free_irqs_parent(domain, virq, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common); /** * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent @@ -1275,6 +1273,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, nr_irqs, arg); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain @@ -1292,6 +1291,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, irq_domain_free_irqs_recursive(domain->parent, irq_base, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 841187239adc..ef0bc02c3a70 100644 --- 
a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq) */ void irq_set_thread_affinity(struct irq_desc *desc) { - struct irqaction *action = desc->action; + struct irqaction *action; - while (action) { + for_each_action_of_desc(desc, action) if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } } #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id) return; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action; action; action = action->next) { + for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); @@ -1324,8 +1322,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + pr_warn("irq %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } *old_ptr = new; @@ -1409,7 +1407,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) int retval; struct irq_desc *desc = irq_to_desc(irq); - if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); @@ -1609,6 +1607,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out @@ -1699,9 +1700,13 @@ EXPORT_SYMBOL(request_threaded_irq); int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; int ret; + if (irq == 
IRQ_NOTCONNECTED) + return -ENOTCONN; + + desc = irq_to_desc(irq); if (!desc) return -EINVAL; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd5d6d0..4e1b94726818 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { + for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32144175458d..5707f97a3e6a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * desc->lock here. See synchronize_irq(). */ raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { + for_each_action_of_desc(desc, action) { printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); if (action->thread_fn) printk(KERN_CONT " threaded [<%p>] %pf", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); - action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..4b353e0be121 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { + int v, v1; + STATIC_KEY_CHECK_USE(); - if (atomic_inc_not_zero(&key->enabled)) - return; + + /* + * Careful if we get concurrent static_key_slow_inc() calls; + * later calls must wait for the first one to _finish_ the + * jump_label_update() process. At the same time, however, + * the jump_label_update() call below wants to see + * static_key_enabled(&key) for jumps to be updated properly. 
+ * + * So give a special meaning to negative key->enabled: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. + */ + for (v = atomic_read(&key->enabled); v > 0; v = v1) { + v1 = atomic_cmpxchg(&key->enabled, v, v + 1); + if (likely(v1 == v)) + return; + } jump_label_lock(); - if (atomic_inc_return(&key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); jump_label_update(key); + atomic_set(&key->enabled, 1); + } else { + atomic_inc(&key->enabled); + } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + /* + * The negative count check is valid even when a negative + * key->enabled is in use by static_key_slow_inc(); a + * __static_key_slow_dec() before the first static_key_slow_inc() + * returns is unbalanced, because all other static_key_slow_inc() + * instances block while the update is in progress. + */ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5c5987f10819..fafd1a3ef0da 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -38,6 +38,7 @@ * during the second link stage. 
*/ extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; extern const u8 kallsyms_names[] __weak; /* @@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak; extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); +extern const unsigned long kallsyms_relative_base +__attribute__((weak, section(".rodata"))); + extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; @@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos) return name - kallsyms_names; } +static unsigned long kallsyms_sym_address(int idx) +{ + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + return kallsyms_addresses[idx]; + + /* values are unsigned offsets if --absolute-percpu is not in effect */ + if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + + /* ...otherwise, positive offsets are absolute values */ + if (kallsyms_offsets[idx] >= 0) + return kallsyms_offsets[idx]; + + /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ + return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; +} + /* Lookup the address for this symbol. Returns 0 if not found. 
*/ unsigned long kallsyms_lookup_name(const char *name) { @@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name) off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) - return kallsyms_addresses[i]; + return kallsyms_sym_address(i); } return module_kallsyms_lookup_name(name); } @@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; } @@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long i, low, high, mid; /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + BUG_ON(!kallsyms_addresses); + else + BUG_ON(!kallsyms_offsets); /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; @@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr, while (high - low > 1) { mid = low + (high - low) / 2; - if (kallsyms_addresses[mid] <= addr) + if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; @@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr, * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ - while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; - symbol_start = kallsyms_addresses[low]; + symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. 
*/ for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > symbol_start) { - symbol_end = kallsyms_addresses[i]; + if (kallsyms_sym_address(i) > symbol_start) { + symbol_end = kallsyms_sym_address(i); break; } } @@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) unsigned off = iter->nameoff; iter->module_name[0] = '\0'; - iter->value = kallsyms_addresses[iter->pos]; + iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); diff --git a/kernel/kcov.c b/kernel/kcov.c new file mode 100644 index 000000000000..8d44b3fea9d0 --- /dev/null +++ b/kernel/kcov.c @@ -0,0 +1,279 @@ +#define pr_fmt(fmt) "kcov: " fmt + +#define DISABLE_BRANCH_PROFILING +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/kcov.h> + +/* + * kcov descriptor (one per opened debugfs file). + * State transitions of the descriptor: + * - initial state after open() + * - then there must be a single ioctl(KCOV_INIT_TRACE) call + * - then, mmap() call (several calls are allowed but not useful) + * - then, repeated enable/disable for a task (only one task a time allowed) + */ +struct kcov { + /* + * Reference counter. We keep one for: + * - opened file descriptor + * - task with enabled coverage (we can't unwire it from another task) + */ + atomic_t refcount; + /* The lock protects mode, size, area and t. */ + spinlock_t lock; + enum kcov_mode mode; + /* Size of arena (in long's for KCOV_MODE_TRACE). */ + unsigned size; + /* Coverage buffer shared with user space. */ + void *area; + /* Task for which we collect coverage, or NULL. */ + struct task_struct *t; +}; + +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. 
+ */ +void notrace __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + enum kcov_mode mode; + + t = current; + /* + * We are interested in code coverage as a function of a syscall inputs, + * so we ignore code executed in interrupts. + */ + if (!t || in_interrupt()) + return; + mode = READ_ONCE(t->kcov_mode); + if (mode == KCOV_MODE_TRACE) { + unsigned long *area; + unsigned long pos; + + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + area = t->kcov_area; + /* The first word is number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = _RET_IP_; + WRITE_ONCE(area[0], pos); + } + } +} +EXPORT_SYMBOL(__sanitizer_cov_trace_pc); + +static void kcov_get(struct kcov *kcov) +{ + atomic_inc(&kcov->refcount); +} + +static void kcov_put(struct kcov *kcov) +{ + if (atomic_dec_and_test(&kcov->refcount)) { + vfree(kcov->area); + kfree(kcov); + } +} + +void kcov_task_init(struct task_struct *t) +{ + t->kcov_mode = KCOV_MODE_DISABLED; + t->kcov_size = 0; + t->kcov_area = NULL; + t->kcov = NULL; +} + +void kcov_task_exit(struct task_struct *t) +{ + struct kcov *kcov; + + kcov = t->kcov; + if (kcov == NULL) + return; + spin_lock(&kcov->lock); + if (WARN_ON(kcov->t != t)) { + spin_unlock(&kcov->lock); + return; + } + /* Just to not leave dangling references behind. 
*/ + kcov_task_init(t); + kcov->t = NULL; + spin_unlock(&kcov->lock); + kcov_put(kcov); +} + +static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) +{ + int res = 0; + void *area; + struct kcov *kcov = vma->vm_file->private_data; + unsigned long size, off; + struct page *page; + + area = vmalloc_user(vma->vm_end - vma->vm_start); + if (!area) + return -ENOMEM; + + spin_lock(&kcov->lock); + size = kcov->size * sizeof(unsigned long); + if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + vma->vm_end - vma->vm_start != size) { + res = -EINVAL; + goto exit; + } + if (!kcov->area) { + kcov->area = area; + vma->vm_flags |= VM_DONTEXPAND; + spin_unlock(&kcov->lock); + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); + } + return 0; + } +exit: + spin_unlock(&kcov->lock); + vfree(area); + return res; +} + +static int kcov_open(struct inode *inode, struct file *filep) +{ + struct kcov *kcov; + + kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); + if (!kcov) + return -ENOMEM; + atomic_set(&kcov->refcount, 1); + spin_lock_init(&kcov->lock); + filep->private_data = kcov; + return nonseekable_open(inode, filep); +} + +static int kcov_close(struct inode *inode, struct file *filep) +{ + kcov_put(filep->private_data); + return 0; +} + +static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, + unsigned long arg) +{ + struct task_struct *t; + unsigned long size, unused; + + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + */ + if (kcov->mode != KCOV_MODE_DISABLED) + return -EBUSY; + /* + * Size must be at least 2 to hold current position and one PC. + * Later we allocate size * sizeof(unsigned long) memory, + * that must not overflow. 
+ */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->size = size; + kcov->mode = KCOV_MODE_TRACE; + return 0; + case KCOV_ENABLE: + /* + * Enable coverage for the current task. + * At this point user must have been enabled trace mode, + * and mmapped the file. Coverage collection is disabled only + * at task exit or voluntary by KCOV_DISABLE. After that it can + * be enabled for another task. + */ + unused = arg; + if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || + kcov->area == NULL) + return -EINVAL; + if (kcov->t != NULL) + return -EBUSY; + t = current; + /* Cache in task struct for performance. */ + t->kcov_size = kcov->size; + t->kcov_area = kcov->area; + /* See comment in __sanitizer_cov_trace_pc(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, kcov->mode); + t->kcov = kcov; + kcov->t = t; + /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + kcov_get(kcov); + return 0; + case KCOV_DISABLE: + /* Disable coverage for the current task. */ + unused = arg; + if (unused != 0 || current->kcov != kcov) + return -EINVAL; + t = current; + if (WARN_ON(kcov->t != t)) + return -EINVAL; + kcov_task_init(t); + kcov->t = NULL; + kcov_put(kcov); + return 0; + default: + return -ENOTTY; + } +} + +static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kcov *kcov; + int res; + + kcov = filep->private_data; + spin_lock(&kcov->lock); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock(&kcov->lock); + return res; +} + +static const struct file_operations kcov_fops = { + .open = kcov_open, + .unlocked_ioctl = kcov_ioctl, + .mmap = kcov_mmap, + .release = kcov_close, +}; + +static int __init kcov_init(void) +{ + /* + * The kcov debugfs file won't ever get removed and thus, + * there is no need to protect it against removal races. The + * use of debugfs_create_file_unsafe() is actually safe here. 
+ */ + if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { + pr_err("failed to create kcov in debugfs\n"); + return -ENOMEM; + } + return 0; +} + +device_initcall(kcov_init); diff --git a/kernel/kexec.c b/kernel/kexec.c index ee70aef5cd81..4384672d3245 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -103,6 +103,65 @@ out_free_image: return ret; } +static int do_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, unsigned long flags) +{ + struct kimage **dest_image, *image; + unsigned long i; + int ret; + + if (flags & KEXEC_ON_CRASH) { + dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } else { + dest_image = &kexec_image; + } + + if (nr_segments == 0) { + /* Uninstall image */ + kimage_free(xchg(dest_image, NULL)); + return 0; + } + if (flags & KEXEC_ON_CRASH) { + /* + * Loading another kernel to switch to if this one + * crashes. Free any current crash dump kernel before + * we corrupt it. + */ + kimage_free(xchg(&kexec_crash_image, NULL)); + } + + ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); + if (ret) + return ret; + + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < nr_segments; i++) { + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* Install the new kernel and uninstall the old */ + image = xchg(dest_image, image); + +out: + if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + + kimage_free(image); + return ret; +} + /* * Exec Kernel system call: for obvious reasons only root may call it. 
* @@ -127,7 +186,6 @@ out_free_image: SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - struct kimage **dest_image, *image; int result; /* We only trust the superuser with rebooting the system. */ @@ -152,9 +210,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; - image = NULL; - result = 0; - /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -166,53 +221,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (!mutex_trylock(&kexec_mutex)) return -EBUSY; - dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) - dest_image = &kexec_crash_image; - if (nr_segments > 0) { - unsigned long i; - - if (flags & KEXEC_ON_CRASH) { - /* - * Loading another kernel to switch to if this one - * crashes. Free any current crash dump kernel before - * we corrupt it. - */ - - kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - crash_map_reserved_pages(); - } else { - /* Loading another kernel to reboot into. 
*/ - - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - } - if (result) - goto out; - - if (flags & KEXEC_PRESERVE_CONTEXT) - image->preserve_context = 1; - result = machine_kexec_prepare(image); - if (result) - goto out; - - for (i = 0; i < nr_segments; i++) { - result = kimage_load_segment(image, &image->segment[i]); - if (result) - goto out; - } - kimage_terminate(image); - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); - } - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); + result = do_kexec_load(entry, nr_segments, segments, flags); -out: mutex_unlock(&kexec_mutex); - kimage_free(image); return result; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8dc659144869..56b3ed0927b0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -66,13 +66,15 @@ struct resource crashk_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; struct resource crashk_low_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; int kexec_should_crash(struct task_struct *p) @@ -891,6 +893,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. 
*/ + printk_nmi_flush_on_panic(); __crash_kexec(regs); /* @@ -951,7 +954,6 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - crash_map_reserved_pages(); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) @@ -959,13 +961,12 @@ int crash_shrink_memory(unsigned long new_size) ram_res->start = end; ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; crashk_res.end = end - 1; insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); unlock: mutex_unlock(&kexec_mutex); @@ -1408,11 +1409,14 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -1445,8 +1449,8 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_X86 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); #endif -#ifdef CONFIG_HUGETLBFS - VMCOREINFO_SYMBOL(free_huge_page); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); #endif arch_crash_save_vmcoreinfo(); @@ -1547,13 +1551,14 @@ int kernel_kexec(void) } /* - * Add and remove page tables for crashkernel memory + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. 
* * Provide an empty default implementation here -- architecture * code may override this */ -void __weak crash_map_reserved_pages(void) +void __weak arch_kexec_protect_crashkres(void) {} -void __weak crash_unmap_reserved_pages(void) +void __weak arch_kexec_unprotect_crashkres(void) {} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 007b791f676d..503bc2d348e5 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -18,6 +18,7 @@ #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/fs.h> #include <crypto/hash.h> #include <crypto/sha.h> #include <linux/syscalls.h> @@ -33,65 +34,6 @@ size_t __weak kexec_purgatory_size = 0; static int kexec_calculate_store_digests(struct kimage *image); -static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) -{ - struct fd f = fdget(fd); - int ret; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -EBADF; - - ret = vfs_getattr(&f.file->f_path, &stat); - if (ret) - goto out; - - if (stat.size > INT_MAX) { - ret = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. 
*/ - if (stat.size == 0) { - ret = -EINVAL; - goto out; - } - - *buf = vmalloc(stat.size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(*buf); - ret = bytes; - goto out; - } - - if (bytes == 0) - break; - pos += bytes; - } - - if (pos != stat.size) { - ret = -EBADF; - vfree(*buf); - goto out; - } - - *buf_len = pos; -out: - fdput(f); - return ret; -} - /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) @@ -182,16 +124,17 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, { int ret = 0; void *ldata; + loff_t size; - ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, - &image->kernel_buf_len); + ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, + &size, INT_MAX, READING_KEXEC_IMAGE); if (ret) return ret; + image->kernel_buf_len = size; /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); - if (ret) goto out; @@ -206,10 +149,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, - &image->initrd_buf_len); + ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, + &size, INT_MAX, + READING_KEXEC_INITRAMFS); if (ret) goto out; + image->initrd_buf_len = size; } if (cmdline_len) { @@ -329,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) + if (flags & KEXEC_FILE_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } if (flags & KEXEC_FILE_UNLOAD) goto 
exchange; @@ -379,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: + if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); return ret; @@ -524,10 +475,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, /* Walk the RAM ranges and allocate a suitable range for the buffer */ if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res("Crash kernel", - IORESOURCE_MEM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); + ret = walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); else ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a02812743a7e..b5c30d9f46c5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -47,12 +47,12 @@ * of times) */ -#include <linux/latencytop.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> +#include <linux/latencytop.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/list.h> @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } + +int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} device_initcall(init_lstats_procfs); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc2c85c064c1..5c2bc1052691 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -28,6 +28,8 @@ #include <linux/list.h> #include <linux/kallsyms.h> 
#include <linux/livepatch.h> +#include <linux/elf.h> +#include <linux/moduleloader.h> #include <asm/cacheflush.h> /** @@ -99,12 +101,12 @@ static void klp_find_object_module(struct klp_object *obj) /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. The patches are removed by - * a going module handler instead. + * klp_module_going() instead. */ mod = find_module(obj->name); /* - * Do not mess work of the module coming and going notifiers. - * Note that the patch might still be needed before the going handler + * Do not mess work of klp_module_coming() and klp_module_going(). + * Note that the patch might still be needed before klp_module_going() * is called. Module functions can be called even in the GOING state * until mod->exit() finishes. This is especially important for * patches that modify semantic of the functions. @@ -190,8 +192,8 @@ static int klp_find_object_symbol(const char *objname, const char *name, if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); else if (args.count > 1 && sympos == 0) { - pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", - args.count, name, objname); + pr_err("unresolvable ambiguity for symbol '%s' in object '%s'\n", + name, objname); } else if (sympos != args.count && sympos > 0) { pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", sympos, name, objname ? objname : "vmlinux"); @@ -204,75 +206,109 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -/* - * external symbols are located outside the parent object (where the parent - * object is either vmlinux or the kmod being patched). 
- */ -static int klp_find_external_symbol(struct module *pmod, const char *name, - unsigned long *addr) +static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) { - const struct kernel_symbol *sym; - - /* first, check if it's an exported symbol */ - preempt_disable(); - sym = find_symbol(name, NULL, NULL, true, true); - if (sym) { - *addr = sym->value; - preempt_enable(); - return 0; - } - preempt_enable(); + int i, cnt, vmlinux, ret; + char objname[MODULE_NAME_LEN]; + char symname[KSYM_NAME_LEN]; + char *strtab = pmod->core_kallsyms.strtab; + Elf_Rela *relas; + Elf_Sym *sym; + unsigned long sympos, addr; /* - * Check if it's in another .o within the patch module. This also - * checks that the external symbol is unique. + * Since the field widths for objname and symname in the sscanf() + * call are hard-coded and correspond to MODULE_NAME_LEN and + * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN + * and KSYM_NAME_LEN have the values we expect them to have. + * + * Because the value of MODULE_NAME_LEN can differ among architectures, + * we use the smallest/strictest upper bound possible (56, based on + * the current definition of MODULE_NAME_LEN) to prevent overflows. 
*/ - return klp_find_object_symbol(pmod->name, name, 0, addr); + BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); + + relas = (Elf_Rela *) relasec->sh_addr; + /* For each rela in this klp relocation section */ + for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { + sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); + if (sym->st_shndx != SHN_LIVEPATCH) { + pr_err("symbol %s is not marked as a livepatch symbol", + strtab + sym->st_name); + return -EINVAL; + } + + /* Format: .klp.sym.objname.symname,sympos */ + cnt = sscanf(strtab + sym->st_name, + ".klp.sym.%55[^.].%127[^,],%lu", + objname, symname, &sympos); + if (cnt != 3) { + pr_err("symbol %s has an incorrectly formatted name", + strtab + sym->st_name); + return -EINVAL; + } + + /* klp_find_object_symbol() treats a NULL objname as vmlinux */ + vmlinux = !strcmp(objname, "vmlinux"); + ret = klp_find_object_symbol(vmlinux ? NULL : objname, + symname, sympos, &addr); + if (ret) + return ret; + + sym->st_value = addr; + } + + return 0; } static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { - int ret = 0; - unsigned long val; - struct klp_reloc *reloc; + int i, cnt, ret = 0; + const char *objname, *secname; + char sec_objname[MODULE_NAME_LEN]; + Elf_Shdr *sec; if (WARN_ON(!klp_is_object_loaded(obj))) return -EINVAL; - if (WARN_ON(!obj->relocs)) - return -EINVAL; + objname = klp_is_module(obj) ? 
obj->name : "vmlinux"; module_disable_ro(pmod); + /* For each klp relocation section */ + for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) { + sec = pmod->klp_info->sechdrs + i; + secname = pmod->klp_info->secstrings + sec->sh_name; + if (!(sec->sh_flags & SHF_RELA_LIVEPATCH)) + continue; - for (reloc = obj->relocs; reloc->name; reloc++) { - /* discover the address of the referenced symbol */ - if (reloc->external) { - if (reloc->sympos > 0) { - pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", - reloc->name); - ret = -EINVAL; - goto out; - } - ret = klp_find_external_symbol(pmod, reloc->name, &val); - } else - ret = klp_find_object_symbol(obj->name, - reloc->name, - reloc->sympos, - &val); + /* + * Format: .klp.rela.sec_objname.section_name + * See comment in klp_resolve_symbols() for an explanation + * of the selected field width value. + */ + cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); + if (cnt != 1) { + pr_err("section %s has an incorrectly formatted name", + secname); + ret = -EINVAL; + break; + } + + if (strcmp(objname, sec_objname)) + continue; + + ret = klp_resolve_symbols(sec, pmod); if (ret) - goto out; + break; - ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, - val + reloc->addend); - if (ret) { - pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", - reloc->name, val, ret); - goto out; - } + ret = apply_relocate_add(pmod->klp_info->sechdrs, + pmod->core_kallsyms.strtab, + pmod->klp_info->symndx, i, pmod); + if (ret) + break; } -out: module_enable_ro(pmod); return ret; } @@ -298,6 +334,19 @@ unlock: rcu_read_unlock(); } +/* + * Convert a function address into the appropriate ftrace location. + * + * Usually this is just the address of the function, but on some architectures + * it's more complicated so allow them to provide a custom behaviour. 
+ */ +#ifndef klp_get_ftrace_location +static unsigned long klp_get_ftrace_location(unsigned long faddr) +{ + return faddr; +} +#endif + static void klp_disable_func(struct klp_func *func) { struct klp_ops *ops; @@ -312,8 +361,14 @@ static void klp_disable_func(struct klp_func *func) return; if (list_is_singular(&ops->func_stack)) { + unsigned long ftrace_loc; + + ftrace_loc = klp_get_ftrace_location(func->old_addr); + if (WARN_ON(!ftrace_loc)) + return; + WARN_ON(unregister_ftrace_function(&ops->fops)); - WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); + WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0)); list_del_rcu(&func->stack_node); list_del(&ops->node); @@ -338,6 +393,15 @@ static int klp_enable_func(struct klp_func *func) ops = klp_find_ops(func->old_addr); if (!ops) { + unsigned long ftrace_loc; + + ftrace_loc = klp_get_ftrace_location(func->old_addr); + if (!ftrace_loc) { + pr_err("failed to find location for function '%s'\n", + func->old_name); + return -EINVAL; + } + ops = kzalloc(sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -352,7 +416,7 @@ static int klp_enable_func(struct klp_func *func) INIT_LIST_HEAD(&ops->func_stack); list_add_rcu(&func->stack_node, &ops->func_stack); - ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0); + ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0); if (ret) { pr_err("failed to set ftrace filter for function '%s' (%d)\n", func->old_name, ret); @@ -363,7 +427,7 @@ static int klp_enable_func(struct klp_func *func) if (ret) { pr_err("failed to register ftrace handler for function '%s' (%d)\n", func->old_name, ret); - ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0); goto err; } @@ -683,6 +747,9 @@ static void klp_free_patch(struct klp_patch *patch) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { + if (!func->old_name || !func->new_func) + return -EINVAL; + 
INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; @@ -703,11 +770,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; - if (obj->relocs) { - ret = klp_write_object_relocations(patch->mod, obj); - if (ret) - return ret; - } + ret = klp_write_object_relocations(patch->mod, obj); + if (ret) + return ret; klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, @@ -842,12 +907,18 @@ int klp_register_patch(struct klp_patch *patch) { int ret; - if (!klp_initialized()) - return -ENODEV; - if (!patch || !patch->mod) return -EINVAL; + if (!is_livepatch_module(patch->mod)) { + pr_err("module %s is not marked as a livepatch module", + patch->mod->name); + return -EINVAL; + } + + if (!klp_initialized()) + return -ENODEV; + /* * A reference is taken on the patch module to prevent it from being * unloaded. Right now, we don't allow patch modules to unload since @@ -866,103 +937,108 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); -static int klp_module_notify_coming(struct klp_patch *patch, - struct klp_object *obj) +int klp_module_coming(struct module *mod) { - struct module *pmod = patch->mod; - struct module *mod = obj->mod; int ret; + struct klp_patch *patch; + struct klp_object *obj; - ret = klp_init_object_loaded(patch, obj); - if (ret) { - pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; - } + if (WARN_ON(mod->state != MODULE_STATE_COMING)) + return -EINVAL; - if (patch->state == KLP_DISABLED) - return 0; + mutex_lock(&klp_mutex); + /* + * Each module has to know that klp_module_coming() + * has been called. We never know what module will + * get patched by a new patch. 
+ */ + mod->klp_alive = true; - pr_notice("applying patch '%s' to loading module '%s'\n", - pmod->name, mod->name); + list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_object(patch, obj) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; - ret = klp_enable_object(obj); - if (ret) - pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; -} + obj->mod = mod; -static void klp_module_notify_going(struct klp_patch *patch, - struct klp_object *obj) -{ - struct module *pmod = patch->mod; - struct module *mod = obj->mod; + ret = klp_init_object_loaded(patch, obj); + if (ret) { + pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } + + if (patch->state == KLP_DISABLED) + break; + + pr_notice("applying patch '%s' to loading module '%s'\n", + patch->mod->name, obj->mod->name); + + ret = klp_enable_object(obj); + if (ret) { + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } - if (patch->state == KLP_DISABLED) - goto disabled; + break; + } + } - pr_notice("reverting patch '%s' on unloading module '%s'\n", - pmod->name, mod->name); + mutex_unlock(&klp_mutex); - klp_disable_object(obj); + return 0; -disabled: +err: + /* + * If a patch is unsuccessfully applied, return + * error to the module loader. 
+ */ + pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", + patch->mod->name, obj->mod->name, obj->mod->name); + mod->klp_alive = false; klp_free_object_loaded(obj); + mutex_unlock(&klp_mutex); + + return ret; } -static int klp_module_notify(struct notifier_block *nb, unsigned long action, - void *data) +void klp_module_going(struct module *mod) { - int ret; - struct module *mod = data; struct klp_patch *patch; struct klp_object *obj; - if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) - return 0; + if (WARN_ON(mod->state != MODULE_STATE_GOING && + mod->state != MODULE_STATE_COMING)) + return; mutex_lock(&klp_mutex); - /* - * Each module has to know that the notifier has been called. - * We never know what module will get patched by a new patch. + * Each module has to know that klp_module_going() + * has been called. We never know what module will + * get patched by a new patch. */ - if (action == MODULE_STATE_COMING) - mod->klp_alive = true; - else /* MODULE_STATE_GOING */ - mod->klp_alive = false; + mod->klp_alive = false; list_for_each_entry(patch, &klp_patches, list) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (action == MODULE_STATE_COMING) { - obj->mod = mod; - ret = klp_module_notify_coming(patch, obj); - if (ret) { - obj->mod = NULL; - pr_warn("patch '%s' is in an inconsistent state!\n", - patch->mod->name); - } - } else /* MODULE_STATE_GOING */ - klp_module_notify_going(patch, obj); + if (patch->state != KLP_DISABLED) { + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_disable_object(obj); + } + klp_free_object_loaded(obj); break; } } mutex_unlock(&klp_mutex); - - return 0; } -static struct notifier_block klp_module_nb = { - .notifier_call = klp_module_notify, - .priority = INT_MIN+1, /* called late but before ftrace notifier */ -}; - static int __init klp_init(void) { int ret; @@ -973,21 
+1049,11 @@ static int __init klp_init(void) return -EINVAL; } - ret = register_module_notifier(&klp_module_nb); - if (ret) - return ret; - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); - if (!klp_root_kobj) { - ret = -ENOMEM; - goto unregister; - } + if (!klp_root_kobj) + return -ENOMEM; return 0; - -unregister: - unregister_module_notifier(&klp_module_nb); - return ret; } module_init(klp_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8e96f6cc2a4a..31322a4275cd 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,6 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 716547fdb873..81f1a7107c0e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,6 +45,7 @@ #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/kmemcheck.h> +#include <linux/random.h> #include <asm/sections.h> @@ -123,8 +124,6 @@ static inline int debug_locks_off_graph_unlock(void) return ret; } -static int lockdep_initialized; - unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; @@ -150,8 +149,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], - cpu_lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats); static inline u64 lockstat_clock(void) { @@ -434,19 +432,6 @@ unsigned int max_lockdep_depth; #ifdef CONFIG_DEBUG_LOCKDEP /* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). 
- */ -static int lockdep_init_error; -static const char *lock_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - -/* * Various lockdep statistics: */ DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); @@ -669,20 +654,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) struct hlist_head *hash_head; struct lock_class *class; -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - lock_init_error = lock->name; - save_stack_trace(&lockdep_init_trace); - } -#endif - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); printk(KERN_ERR @@ -738,7 +709,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) * yet. Otherwise we look it up. We cache the result in the lock object * itself, so actual lookup of the hash should be once per lock object. 
*/ -static inline struct lock_class * +static struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; @@ -2011,6 +1982,130 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) } /* + * Returns the index of the first held_lock of the current chain + */ +static inline int get_first_held_lock(struct task_struct *curr, + struct held_lock *hlock) +{ + int i; + struct held_lock *hlock_curr; + + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock->irq_context) + break; + + } + + return ++i; +} + +#ifdef CONFIG_DEBUG_LOCKDEP +/* + * Returns the next chain_key iteration + */ +static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +{ + u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + + printk(" class_idx:%d -> chain_key:%016Lx", + class_idx, + (unsigned long long)new_chain_key); + return new_chain_key; +} + +static void +print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) +{ + struct held_lock *hlock; + u64 chain_key = 0; + int depth = curr->lockdep_depth; + int i; + + printk("depth: %u\n", depth + 1); + for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { + hlock = curr->held_locks + i; + chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + + print_lock(hlock); + } + + print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_lock(hlock_next); +} + +static void print_chain_keys_chain(struct lock_chain *chain) +{ + int i; + u64 chain_key = 0; + int class_id; + + printk("depth: %u\n", chain->depth); + for (i = 0; i < chain->depth; i++) { + class_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(class_id + 1, chain_key); + + print_lock_name(lock_classes + class_id); + printk("\n"); + } +} + +static void print_collision(struct task_struct *curr, + struct held_lock *hlock_next, + 
struct lock_chain *chain) +{ + printk("\n"); + printk("======================\n"); + printk("[chain_key collision ]\n"); + print_kernel_ident(); + printk("----------------------\n"); + printk("%s/%d: ", current->comm, task_pid_nr(current)); + printk("Hash chain already cached but the contents don't match!\n"); + + printk("Held locks:"); + print_chain_keys_held_locks(curr, hlock_next); + + printk("Locks in cached chain:"); + print_chain_keys_chain(chain); + + printk("\nstack backtrace:\n"); + dump_stack(); +} +#endif + +/* + * Checks whether the chain and the current held locks are consistent + * in depth and also in content. If they are not it most likely means + * that there was a collision during the calculation of the chain_key. + * Returns: 0 not passed, 1 passed + */ +static int check_no_collision(struct task_struct *curr, + struct held_lock *hlock, + struct lock_chain *chain) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + int i, j, id; + + i = get_first_held_lock(curr, hlock); + + if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) { + print_collision(curr, hlock, chain); + return 0; + } + + for (j = 0; j < chain->depth - 1; j++, i++) { + id = curr->held_locks[i].class_idx - 1; + + if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { + print_collision(curr, hlock, chain); + return 0; + } + } +#endif + return 1; +} + +/* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is * validated. If the key is already hashed, return 0. 
@@ -2023,7 +2118,6 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr; int i, j; /* @@ -2041,6 +2135,9 @@ static inline int lookup_chain_cache(struct task_struct *curr, if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); + if (!check_no_collision(curr, hlock, chain)) + return 0; + if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -2078,23 +2175,39 @@ cache_hit: chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock->irq_context) - break; - } - i++; + i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; + + BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks)); + BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks)); + BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes)); + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; - nr_chain_hlocks += chain->depth; for (j = 0; j < chain->depth - 1; j++, i++) { int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; } + + if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) + nr_chain_hlocks += chain->depth; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * Important for check_no_collision(). 
+ */ + if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + if (debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); + dump_stack(); + return 0; + } +#endif + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -2172,7 +2285,7 @@ static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; + unsigned int i; u64 chain_key = 0; for (i = 0; i < curr->lockdep_depth; i++) { @@ -2189,17 +2302,16 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } - id = hlock->class_idx - 1; /* * Whoops ran out of static storage again? */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -2843,6 +2955,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) return 1; } +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 2 * !!task->hardirq_context + !!task->softirq_context; +} + static int separate_irq_context(struct task_struct *curr, struct held_lock *hlock) { @@ -2851,8 +2968,6 @@ static int separate_irq_context(struct task_struct *curr, /* * Keep track of points where we cross into an interrupt context: */ - hlock->irq_context = 2*(curr->hardirq_context ? 
1 : 0) + - curr->softirq_context; if (depth) { struct held_lock *prev_hlock; @@ -2884,6 +2999,11 @@ static inline int mark_irqflags(struct task_struct *curr, return 1; } +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 0; +} + static inline int separate_irq_context(struct task_struct *curr, struct held_lock *hlock) { @@ -3077,7 +3197,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; - unsigned int depth, id; + unsigned int depth; int chain_head = 0; int class_idx; u64 chain_key; @@ -3152,6 +3272,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->acquire_ip = ip; hlock->instance = lock; hlock->nest_lock = nest_lock; + hlock->irq_context = task_irq_context(curr); hlock->trylock = trylock; hlock->read = read; hlock->check = check; @@ -3180,11 +3301,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * The 'key ID' is what is the most compact key value to drive * the hash, not class->key. */ - id = class - lock_classes; /* * Whoops, we did it again.. ran straight out of our static allocation. 
*/ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; @@ -3202,7 +3322,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = 0; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, class_idx); if (nest_lock && !__lock_is_held(nest_lock)) return print_lock_nested_lock_not_held(curr, hlock, ip); @@ -3466,7 +3586,35 @@ static int __lock_is_held(struct lockdep_map *lock) return 0; } -static void __lock_pin_lock(struct lockdep_map *lock) +static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) +{ + struct pin_cookie cookie = NIL_COOKIE; + struct task_struct *curr = current; + int i; + + if (unlikely(!debug_locks)) + return cookie; + + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) { + /* + * Grab 16bits of randomness; this is sufficient to not + * be guessable and still allows some pin nesting in + * our u32 pin_count. 
+ */ + cookie.val = 1 + (prandom_u32() >> 16); + hlock->pin_count += cookie.val; + return cookie; + } + } + + WARN(1, "pinning an unheld lock\n"); + return cookie; +} + +static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3478,7 +3626,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - hlock->pin_count++; + hlock->pin_count += cookie.val; return; } } @@ -3486,7 +3634,7 @@ static void __lock_pin_lock(struct lockdep_map *lock) WARN(1, "pinning an unheld lock\n"); } -static void __lock_unpin_lock(struct lockdep_map *lock) +static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { struct task_struct *curr = current; int i; @@ -3501,7 +3649,11 @@ static void __lock_unpin_lock(struct lockdep_map *lock) if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n")) return; - hlock->pin_count--; + hlock->pin_count -= cookie.val; + + if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n")) + hlock->pin_count = 0; + return; } } @@ -3632,24 +3784,44 @@ int lock_is_held(struct lockdep_map *lock) } EXPORT_SYMBOL_GPL(lock_is_held); -void lock_pin_lock(struct lockdep_map *lock) +struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { + struct pin_cookie cookie = NIL_COOKIE; unsigned long flags; if (unlikely(current->lockdep_recursion)) - return; + return cookie; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - __lock_pin_lock(lock); + cookie = __lock_pin_lock(lock); current->lockdep_recursion = 0; raw_local_irq_restore(flags); + + return cookie; } EXPORT_SYMBOL_GPL(lock_pin_lock); -void lock_unpin_lock(struct lockdep_map *lock) +void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + 
current->lockdep_recursion = 1; + __lock_repin_lock(lock, cookie); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_repin_lock); + +void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) { unsigned long flags; @@ -3660,7 +3832,7 @@ void lock_unpin_lock(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - __lock_unpin_lock(lock); + __lock_unpin_lock(lock, cookie); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -4013,28 +4185,6 @@ out_restore: raw_local_irq_restore(flags); } -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_HLIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_HLIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); @@ -4061,14 +4211,6 @@ void __init lockdep_info(void) printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif } static void diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index dbb61a302548..a0f61effad25 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v) int i; if (v == SEQ_START_TOKEN) { + if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS) + 
seq_printf(m, "(buggered) "); seq_printf(m, "all lock chains:\n"); return 0; } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b2..f8c5af52a131 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -75,12 +75,7 @@ struct lock_stress_stats { long n_lock_acquired; }; -#if defined(MODULE) -#define LOCKTORTURE_RUNNABLE_INIT 1 -#else -#define LOCKTORTURE_RUNNABLE_INIT 0 -#endif -int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); @@ -394,12 +389,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) if (!rt_task(current)) { /* - * (1) Boost priority once every ~50k operations. When the + * Boost priority once every ~50k operations. When the * task tries to take the lock, the rtmutex it will account * for the new priority, and do any corresponding pi-dance. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * factor))) { + if (trsp && !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { policy = SCHED_FIFO; param.sched_priority = MAX_RT_PRIO - 1; } else /* common case, do nothing */ @@ -748,6 +743,15 @@ static void lock_torture_cleanup(void) if (torture_cleanup_begin()) return; + /* + * Indicates early cleanup, meaning that the test has not run, + * such as when passing bogus args when loading the module. As + * such, only perform the underlying torture-specific cleanups, + * and avoid anything related to locktorture. 
+ */ + if (!cxt.lwsa) + goto end; + if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, @@ -776,6 +780,7 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); +end: torture_cleanup_end(); } @@ -870,6 +875,7 @@ static int __init lock_torture_init(void) VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; kfree(cxt.lwsa); + cxt.lwsa = NULL; goto unwind; } @@ -878,6 +884,7 @@ static int __init lock_torture_init(void) cxt.lrsa[i].n_lock_acquired = 0; } } + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5b9102a47ea5..c835270f0c2f 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg_acquire(lock, node); + /* + * We rely on the full barrier with global transitivity implied by the + * below xchg() to order the initialization stores above against any + * observation of @node. And to provide the ACQUIRE ordering associated + * with a LOCK primitive. + */ + prev = xchg(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. 
Threads diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0551c219c40e..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -486,9 +486,6 @@ 
__ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) if (!hold_ctx) return 0; - if (unlikely(ctx == hold_ctx)) - return -EALREADY; - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { #ifdef CONFIG_DEBUG_MUTEXES @@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; int ret; + if (use_ww_ctx) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) + return -EALREADY; + } + preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); @@ -534,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -581,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -602,7 +605,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); @@ -716,6 +719,7 @@ static inline void __mutex_unlock_common_slowpath(struct mutex *lock, int nested) { unsigned long flags; + WAKE_Q(wake_q); /* * As a performance measurement, release the lock before doing other @@ -743,11 +747,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) struct mutex_waiter, list); 
debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); + wake_q_add(&wake_q, waiter->task); } spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } /* diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@ do { spin_lock(lock); (void)(flags); } while (0) #define spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..bec0b647f9cc 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); /* * This is the fast-path for down_read/up_read. If it succeeds we rely diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 393d1874b9e0..5fc8c311b8fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -267,6 +267,66 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif +/* + * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before + * issuing an _unordered_ store to set _Q_LOCKED_VAL. + * + * This means that the store can be delayed, but no later than the + * store-release from the unlock. This means that simply observing + * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. 
+ * + * There are two paths that can issue the unordered store: + * + * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 + * + * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 + * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 + * + * However, in both cases we have other !0 state we've set before to queue + * ourseves: + * + * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our + * load is constrained by that ACQUIRE to not pass before that, and thus must + * observe the store. + * + * For (2) we have a more intersting scenario. We enqueue ourselves using + * xchg_tail(), which ends up being a RELEASE. This in itself is not + * sufficient, however that is followed by an smp_cond_acquire() on the same + * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and + * guarantees we must observe that store. + * + * Therefore both cases have other !0 state that is observable before the + * unordered locked byte store comes through. This means we can use that to + * wait for the lock store, and then wait for an unlock. + */ +#ifndef queued_spin_unlock_wait +void queued_spin_unlock_wait(struct qspinlock *lock) +{ + u32 val; + + for (;;) { + val = atomic_read(&lock->val); + + if (!val) /* not locked, we're done */ + goto done; + + if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ + break; + + /* not locked, but pending, wait until we observe the lock */ + cpu_relax(); + } + + /* any unlock is good */ + while (atomic_read(&lock->val) & _Q_LOCKED_MASK) + cpu_relax(); + +done: + smp_rmb(); /* CTRL + RMB -> ACQUIRE */ +} +EXPORT_SYMBOL(queued_spin_unlock_wait); +#endif + #endif /* _GEN_PV_LOCK_SLOWPATH */ /** @@ -358,8 +418,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. 
*/ - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) - cpu_relax(); + smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -435,7 +494,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_load_acquire() call. As the next PV queue head hasn't been + * smp_cond_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -466,7 +525,7 @@ locked: break; } /* - * The smp_load_acquire() call above has provided the necessary + * The smp_cond_acquire() call above has provided the necessary * acquire semantics required for locking. At most two * iterations of this loop may be ran. */ diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 87bb235c3448..21ede57f68b3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -55,6 +55,11 @@ struct pv_node { }; /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enter the PV slowpath before * being queued. 
By allowing one lock stealing attempt here when the pending @@ -65,9 +70,11 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; + int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; } /* @@ -138,11 +145,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #endif /* _Q_PENDING_BITS == 8 */ /* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - -/* * Lock and MCS node addresses hash table for fast lookup * * Hashing is done on a per-cacheline basis to minimize the need to access @@ -398,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; + /* + * Tracking # of slowpath locking operations + */ + qstat_inc(qstat_pv_lock_slowpath, true); + for (;; waitcnt++) { /* * Set correct vCPU state to be used by queue node wait-early diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 640dcecdd1df..22e025309845 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,6 +22,7 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick @@ -45,6 +46,7 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, + qstat_pv_lock_slowpath, qstat_pv_lock_stealing, 
qstat_pv_spurious_wakeup, qstat_pv_wait_again, @@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", @@ -133,10 +136,12 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, } if (counter == qstat_pv_hash_hops) { - u64 frac; + u64 frac = 0; - frac = 100ULL * do_div(stat, kicks); - frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + if (kicks) { + frac = 100ULL * do_div(stat, kicks); + frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + } /* * Return a X.XX decimal number @@ -186,8 +191,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, for (i = 0 ; i < qstat_num; i++) WRITE_ONCE(ptr[i], 0); - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); } return count; } @@ -209,10 +212,8 @@ static int __init init_qspinlock_stat(void) struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); int i; - if (!d_qstat) { - pr_warn("Could not create 'qlockstat' debugfs directory\n"); - return 0; - } + if (!d_qstat) + goto out; /* * Create the debugfs files @@ -222,12 +223,20 @@ static int __init init_qspinlock_stat(void) * performance. 
*/ for (i = 0; i < qstat_num; i++) - debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat); + if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat)) + goto fail_undo; + + if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat)) + goto fail_undo; - debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat); return 0; +fail_undo: + debugfs_remove_recursive(d_qstat); +out: + pr_warn("Could not create 'qlockstat' debugfs entries\n"); + return -ENOMEM; } fs_initcall(init_qspinlock_stat); @@ -279,19 +288,6 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -/* - * PV unfair trylock count tracking function - */ -static inline int qstat_spin_steal_lock(struct qspinlock *lock) -{ - int ret = pv_queued_spin_steal_lock(lock); - - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; -} -#undef queued_spin_trylock -#define queued_spin_trylock(l) qstat_spin_steal_lock(l) - #else /* CONFIG_QUEUED_LOCK_STAT */ static inline void qstat_inc(enum qlock_stats stat, bool cond) { } diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 3a5048572065..1591f6b3539f 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem) /* * get a write lock on the semaphore */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +int __sched __down_write_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; struct task_struct *tsk; unsigned long flags; + int ret = 0; raw_spin_lock_irqsave(&sem->wait_lock, flags); @@ -215,21 +216,33 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) */ if (sem->count == 0) break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if 
(signal_pending_state(state, current)) { + ret = -EINTR; + goto out; + } + set_task_state(tsk, state); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ sem->count = -1; +out: list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; } void __sched __down_write(struct rw_semaphore *sem) { - __down_write_nested(sem, 0); + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..09e30c6225e5 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) /* * Wait until we successfully acquire the write lock */ -__visible -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); /* wait until we successfully acquire the lock */ - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); while (true) { if (rwsem_try_write_lock(count, sem)) break; @@ -486,21 +487,48 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* Block until there are no active lockers. 
*/ do { + if (signal_pending_state(state, current)) + goto out_nolock; + schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + else + __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); - return sem; + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_write_failed); +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 205be0ce34de..2e853ad93a3a 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -55,6 +55,25 @@ void __sched down_write(struct rw_semaphore *sem) EXPORT_SYMBOL(down_write); /* + * lock for writing + */ +int __sched down_write_killable(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable); + +/* * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct 
rw_semaphore *sem) @@ -154,6 +173,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable_nested); + void up_read_non_owner(struct rw_semaphore *sem) { __up_read(sem); diff --git a/kernel/memremap.c b/kernel/memremap.c index 6cf54615a9c4..017532193fb1 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) } #endif +#ifndef arch_memremap_wb +static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +{ + return (__force void *)ioremap_cache(offset, size); +} +#endif + static void *try_ram_remap(resource_size_t offset, size_t size) { unsigned long pfn = PHYS_PFN(offset); @@ -34,20 +41,22 @@ static void *try_ram_remap(resource_size_t offset, size_t size) /* In the simple case just return the existing linear address */ if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); - return NULL; /* fallback to ioremap_cache */ + return NULL; /* fallback to arch_memremap_wb */ } /** * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: either MEMREMAP_WB or MEMREMAP_WT + * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. + * annotation is not applicable. 
In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. * - * MEMREMAP_WB - matches the default mapping for "System RAM" on + * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return @@ -56,13 +65,21 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * MEMREMAP_WT - establish a mapping whereby writes either bypass the * cache or are written through to memory and never exist in a * cache-dirty state with respect to program visibility. Attempts to - * map "System RAM" with this mapping type will fail. + * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. */ void *memremap(resource_size_t offset, size_t size, unsigned long flags) { - int is_ram = region_intersects(offset, size, "System RAM"); + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); void *addr = NULL; + if (!flags) + return NULL; + if (is_ram == REGION_MIXED) { WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", &offset, (unsigned long) size); @@ -71,35 +88,35 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { - flags &= ~MEMREMAP_WB; /* * MEMREMAP_WB is special in that it can be satisifed * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where - * the requested range is potentially in "System RAM" + * the requested range is potentially in System RAM. 
*/ if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size); if (!addr) - addr = ioremap_cache(offset, size); + addr = arch_memremap_wb(offset, size); } /* - * If we don't have a mapping yet and more request flags are - * pending then we will be attempting to establish a new virtual + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual * address mapping. Enforce that this mapping is not aliasing - * "System RAM" + * System RAM. */ - if (!addr && is_ram == REGION_INTERSECTS && flags) { + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", &offset, (unsigned long) size); return NULL; } - if (!addr && (flags & MEMREMAP_WT)) { - flags &= ~MEMREMAP_WT; + if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); - } + + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); return addr; } @@ -279,7 +296,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; - is_ram = region_intersects(align_start, align_size, "System RAM"); + is_ram = region_intersects(align_start, align_size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -389,7 +407,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) /* * 'memmap_start' is the virtual address for the first "struct * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple * pointer arithmetic, so we can perform this to_vmem_altmap() * conversion without concern for the initialization state of * the struct page fields. 
@@ -398,7 +416,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) struct dev_pagemap *pgmap; /* - * Uncoditionally retrieve a dev_pagemap associated with the + * Unconditionally retrieve a dev_pagemap associated with the * given physical address, this is only for use in the * arch_{add|remove}_memory() for setting up and tearing down * the memmap. diff --git a/kernel/module.c b/kernel/module.c index 794ebe8e878d..5f71aa63ed2a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -53,6 +53,7 @@ #include <asm/sections.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> +#include <linux/livepatch.h> #include <linux/async.h> #include <linux/percpu.h> #include <linux/kmemleak.h> @@ -984,6 +985,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); ftrace_release_mod(mod); async_synchronize_full(); @@ -1971,6 +1973,83 @@ static void module_enable_nx(const struct module *mod) { } static void module_disable_nx(const struct module *mod) { } #endif +#ifdef CONFIG_LIVEPATCH +/* + * Persist Elf information about a module. Copy the Elf header, + * section header table, section string table, and symtab section + * index from info to mod->klp_info. 
+ */ +static int copy_module_elf(struct module *mod, struct load_info *info) +{ + unsigned int size, symndx; + int ret; + + size = sizeof(*mod->klp_info); + mod->klp_info = kmalloc(size, GFP_KERNEL); + if (mod->klp_info == NULL) + return -ENOMEM; + + /* Elf header */ + size = sizeof(mod->klp_info->hdr); + memcpy(&mod->klp_info->hdr, info->hdr, size); + + /* Elf section header table */ + size = sizeof(*info->sechdrs) * info->hdr->e_shnum; + mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL); + if (mod->klp_info->sechdrs == NULL) { + ret = -ENOMEM; + goto free_info; + } + memcpy(mod->klp_info->sechdrs, info->sechdrs, size); + + /* Elf section name string table */ + size = info->sechdrs[info->hdr->e_shstrndx].sh_size; + mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL); + if (mod->klp_info->secstrings == NULL) { + ret = -ENOMEM; + goto free_sechdrs; + } + memcpy(mod->klp_info->secstrings, info->secstrings, size); + + /* Elf symbol section index */ + symndx = info->index.sym; + mod->klp_info->symndx = symndx; + + /* + * For livepatch modules, core_kallsyms.symtab is a complete + * copy of the original symbol table. Adjust sh_addr to point + * to core_kallsyms.symtab since the copy of the symtab in module + * init memory is freed at the end of do_init_module(). 
+ */ + mod->klp_info->sechdrs[symndx].sh_addr = \ + (unsigned long) mod->core_kallsyms.symtab; + + return 0; + +free_sechdrs: + kfree(mod->klp_info->sechdrs); +free_info: + kfree(mod->klp_info); + return ret; +} + +static void free_module_elf(struct module *mod) +{ + kfree(mod->klp_info->sechdrs); + kfree(mod->klp_info->secstrings); + kfree(mod->klp_info); +} +#else /* !CONFIG_LIVEPATCH */ +static int copy_module_elf(struct module *mod, struct load_info *info) +{ + return 0; +} + +static void free_module_elf(struct module *mod) +{ +} +#endif /* CONFIG_LIVEPATCH */ + void __weak module_memfree(void *module_region) { vfree(module_region); @@ -2009,6 +2088,9 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); + if (is_livepatch_module(mod)) + free_module_elf(mod); + /* Now we can delete it from the lists */ mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ @@ -2124,6 +2206,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) (long)sym[i].st_value); break; + case SHN_LIVEPATCH: + /* Livepatch symbols are resolved by livepatch */ + break; + case SHN_UNDEF: ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ @@ -2172,6 +2258,10 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) continue; + /* Livepatch relocation sections are applied by livepatch */ + if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) + continue; + if (info->sechdrs[i].sh_type == SHT_REL) err = apply_relocate(info->sechdrs, info->strtab, info->index.sym, i, mod); @@ -2467,7 +2557,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. 
*/ for (ndst = i = 0; i < nsrc; i++) { - if (i == 0 || + if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; @@ -2526,7 +2616,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { - if (i == 0 || + if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { dst[ndst] = src[i]; @@ -2665,6 +2755,26 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l return 0; } +#ifdef CONFIG_LIVEPATCH +static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +{ + mod->klp = get_modinfo(info, "livepatch") ? true : false; + + return 0; +} +#else /* !CONFIG_LIVEPATCH */ +static int find_livepatch_modinfo(struct module *mod, struct load_info *info) +{ + if (get_modinfo(info, "livepatch")) { + pr_err("%s: module is marked as livepatch module, but livepatch support is disabled", + mod->name); + return -ENOEXEC; + } + + return 0; +} +#endif /* CONFIG_LIVEPATCH */ + /* Sets info->hdr and info->len. */ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -2675,7 +2785,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_module_from_file(NULL); + err = security_kernel_read_file(NULL, READING_MODULE); if (err) return err; @@ -2693,63 +2803,6 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return 0; } -/* Sets info->hdr and info->len. 
*/ -static int copy_module_from_fd(int fd, struct load_info *info) -{ - struct fd f = fdget(fd); - int err; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -ENOEXEC; - - err = security_kernel_module_from_file(f.file); - if (err) - goto out; - - err = vfs_getattr(&f.file->f_path, &stat); - if (err) - goto out; - - if (stat.size > INT_MAX) { - err = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - err = -EINVAL; - goto out; - } - - info->hdr = vmalloc(stat.size); - if (!info->hdr) { - err = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(info->hdr); - err = bytes; - goto out; - } - if (bytes == 0) - break; - pos += bytes; - } - info->len = pos; - -out: - fdput(f); - return err; -} - static void free_copy(struct load_info *info) { vfree(info->hdr); @@ -2876,6 +2929,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) "is unknown, you have been warned.\n", mod->name); } + err = find_livepatch_modinfo(mod, info); + if (err) + return err; + /* Set up license info based on the info section */ set_license(mod, get_modinfo(info, "license")); @@ -3315,6 +3372,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); @@ -3392,9 +3450,6 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); - ftrace_module_enable(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); return 0; out: @@ -3402,6 +3457,20 @@ out: return err; } +static int prepare_coming_module(struct module *mod) +{ + int err; + + ftrace_module_enable(mod); + err = klp_module_coming(mod); + if (err) + return err; + + 
blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; +} + static int unknown_module_param_cb(char *param, char *val, const char *modname, void *arg) { @@ -3516,13 +3585,17 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto ddebug_cleanup; + err = prepare_coming_module(mod); + if (err) + goto bug_cleanup; + /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); - goto bug_cleanup; + goto coming_cleanup; } else if (after_dashes) { pr_warn("%s: parameters '%s' after `--' ignored\n", mod->name, after_dashes); @@ -3531,7 +3604,13 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto bug_cleanup; + goto coming_cleanup; + + if (is_livepatch_module(mod)) { + err = copy_module_elf(mod, info); + if (err < 0) + goto sysfs_cleanup; + } /* Get rid of temporary copy. 
*/ free_copy(info); @@ -3541,15 +3620,18 @@ static int load_module(struct load_info *info, const char __user *uargs, return do_init_module(mod); + sysfs_cleanup: + mod_sysfs_teardown(mod); + coming_cleanup: + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + klp_module_going(mod); bug_cleanup: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - /* we can't deallocate the module until we clear memory protection */ module_disable_ro(mod); module_disable_nx(mod); @@ -3611,8 +3693,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { - int err; struct load_info info = { }; + loff_t size; + void *hdr; + int err; err = may_init_module(); if (err) @@ -3624,9 +3708,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = copy_module_from_fd(fd, &info); + err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX, + READING_MODULE); if (err) return err; + info.hdr = hdr; + info.len = size; return load_module(&info, uargs, flags); } diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6528a79d998d..937c844bee4a 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -11,10 +11,17 @@ #include <linux/kernel.h> #include <linux/errno.h> -#include <keys/system_keyring.h> +#include <linux/string.h> +#include <linux/verification.h> #include <crypto/public_key.h> #include "module-internal.h" +enum pkey_id_type { + PKEY_ID_PGP, /* OpenPGP generated key ID */ + PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ + PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ +}; + /* * Module signature information block. 
* @@ -73,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -EBADMSG; } - return system_verify_data(mod, modlen, mod + modlen, sig_len, - VERIFYING_MODULE_SIGNATURE); + return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + NULL, VERIFYING_MODULE_SIGNATURE, + NULL, NULL); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 49746c81ad8d..782102e59eed 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -25,6 +25,7 @@ #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> +#include <linux/cgroup.h> static struct kmem_cache *nsproxy_cachep; @@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_NET .net_ns = &init_net, #endif +#ifdef CONFIG_CGROUPS + .cgroup_ns = &init_cgroup_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } + new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns, + tsk->nsproxy->cgroup_ns); + if (IS_ERR(new_nsp->cgroup_ns)) { + err = PTR_ERR(new_nsp->cgroup_ns); + goto out_cgroup; + } + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: + put_cgroup_ns(new_nsp->cgroup_ns); +out_cgroup: if (new_nsp->pid_ns_for_children) put_pid_ns(new_nsp->pid_ns_for_children); out_pid: @@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *new_ns; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET)))) { + CLONE_NEWPID | CLONE_NEWNET | + CLONE_NEWCGROUP)))) { get_nsproxy(old_ns); return 0; } @@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); 
kmem_cache_free(nsproxy_cachep, ns); } @@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); diff --git a/kernel/padata.c b/kernel/padata.c index b38bea9c466a..993278895ccc 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -607,33 +607,6 @@ out_replace: } /** - * padata_set_cpumasks - Set both parallel and serial cpumasks. The first - * one is used by parallel workers and the second one - * by the wokers doing serialization. - * - * @pinst: padata instance - * @pcpumask: the cpumask to use for parallel workers - * @cbcpumask: the cpumsak to use for serial workers - */ -int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int err; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); - - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; - -} -EXPORT_SYMBOL(padata_set_cpumasks); - -/** * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value * equivalent to @cpumask. 
* @@ -674,6 +647,43 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); +/** + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +int padata_start(struct padata_instance *pinst) +{ + int err = 0; + + mutex_lock(&pinst->lock); + + if (pinst->flags & PADATA_INVALID) + err = -EINVAL; + + __padata_start(pinst); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_start); + +/** + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + mutex_lock(&pinst->lock); + __padata_stop(pinst); + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +#ifdef CONFIG_HOTPLUG_CPU + static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd; @@ -694,42 +704,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_add_cpu - add a cpu to one or both(parallel and serial) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to add - * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. 
- * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ - -int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_set_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_add_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_add_cpu); - static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd = NULL; @@ -789,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) } EXPORT_SYMBOL(padata_remove_cpu); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err =-EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - -#ifdef CONFIG_HOTPLUG_CPU - static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) { return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || @@ -1091,7 +1028,6 @@ err_free_inst: err: return NULL; } -EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance diff --git a/kernel/panic.c b/kernel/panic.c index d96469de72dc..8aa74497cc5a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,7 @@ #include <linux/init.h> #include 
<linux/nmi.h> #include <linux/console.h> +#include <linux/bug.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -72,6 +73,26 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. + */ +void nmi_panic(struct pt_regs *regs, const char *msg) +{ + int old_cpu, cpu; + + cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); + + if (old_cpu == PANIC_CPU_INVALID) + panic("%s", msg); + else if (old_cpu != cpu) + nmi_panic_self_stop(regs); +} +EXPORT_SYMBOL(nmi_panic); + /** * panic - halt the system * @fmt: The text string to print @@ -139,8 +160,10 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) + if (!crash_kexec_post_notifiers) { + printk_nmi_flush_on_panic(); __crash_kexec(NULL); + } /* * Note smp_send_stop is the usual smp shutdown function, which @@ -155,6 +178,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + /* Call flush even twice. 
It tries harder with a single online CPU */ + printk_nmi_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* @@ -449,20 +474,25 @@ void oops_exit(void) kmsg_dump(KMSG_DUMP_OOPS); } -#ifdef WANT_WARN_ON_SLOWPATH -struct slowpath_args { +struct warn_args { const char *fmt; va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, - unsigned taint, struct slowpath_args *args) +void __warn(const char *file, int line, void *caller, unsigned taint, + struct pt_regs *regs, struct warn_args *args) { disable_trace_on_warning(); pr_warn("------------[ cut here ]------------\n"); - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", - raw_smp_processor_id(), current->pid, file, line, caller); + + if (file) + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", + raw_smp_processor_id(), current->pid, file, line, + caller); + else + pr_warn("WARNING: CPU: %d PID: %d at %pS\n", + raw_smp_processor_id(), current->pid, caller); if (args) vprintk(args->fmt, args->args); @@ -479,20 +509,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller, } print_modules(); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); + print_oops_end_marker(); + /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); } +#ifdef WANT_WARN_ON_SLOWPATH void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, &args); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, + &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); @@ -500,20 +537,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_fmt_taint(const char *file, int line, unsigned taint, const char *fmt, ...) 
{ - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - taint, &args); + __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, NULL); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/kernel/pid.c b/kernel/pid.c index 4d73a834c7e6..f66162f2359b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -311,7 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); - if (IS_ERR_VALUE(nr)) { + if (nr < 0) { retval = nr; goto out_free; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..fca9254280ee 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode) pm_message_t msg; int error; + pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; @@ -1158,6 +1159,22 @@ static int __init kaslr_nohibernate_setup(char *str) return nohibernate_setup(str); } +static int __init page_poison_nohibernate_setup(char *str) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + /* + * The zeroing option for page poison skips the checks on alloc. + * since hibernation doesn't save free pages there's no way to + * guarantee the pages will still be zeroed. 
+ */ + if (!strcmp(str, "on")) { + pr_info("Disabling hibernation due to page poisoning\n"); + return nohibernate_setup(str); + } +#endif + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1166,3 +1183,4 @@ __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); __setup("kaslr", kaslr_nohibernate_setup); +__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..0c2ee9761d57 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only) unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); @@ -148,6 +146,18 @@ int freeze_processes(void) if (!error && !oom_killer_disable()) error = -EBUSY; + /* + * There is a hard to fix race between oom_reaper kernel thread + * and oom_killer_disable. oom_reaper calls exit_oom_victim + * before the victim reaches exit_mm so try to freeze all the tasks + * again and catch such a left over task. + */ + if (!error) { + pr_info("Double checking all user space processes after OOM killer disable... 
"); + error = try_to_freeze_tasks(true); + pr_cont("\n"); + } + if (error) thaw_processes(); return error; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133c13e2..5b70d64b871e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,7 @@ static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); return 1; @@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: late suspend of devices failed\n"); + pr_err("PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + pr_err("PM: noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); @@ -473,8 +473,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for suspend to idle," - "please choose none/freezer/devices/platform.\n"); + pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); return -EAGAIN; } #endif diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 12cd989dadf6..160e1006640d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -37,6 +37,14 @@ #define HIBERNATE_SIG "S1SUSPEND" /* + * When reading an {un,}compressed image, we may restore pages in place, + * in which case some architectures need these pages cleaning before they + * can be executed. 
We don't know which pages these may be, so clean the lot. + */ +static bool clean_pages_on_read; +static bool clean_pages_on_decompress; + +/* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. @@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio) if (bio_data_dir(bio) == WRITE) put_page(page); + else if (clean_pages_on_read) + flush_icache_range((unsigned long)page_address(page), + (unsigned long)page_address(page) + PAGE_SIZE); if (bio->bi_error && !hb->error) hb->error = bio->bi_error; @@ -1049,6 +1060,7 @@ static int load_image(struct swap_map_handle *handle, hib_init_batch(&hb); + clean_pages_on_read = true; printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; @@ -1124,6 +1136,10 @@ static int lzo_decompress_threadfn(void *data) d->unc_len = LZO_UNC_SIZE; d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, d->unc, &d->unc_len); + if (clean_pages_on_decompress) + flush_icache_range((unsigned long)d->unc, + (unsigned long)d->unc + d->unc_len); + atomic_set(&d->stop, 1); wake_up(&d->done); } @@ -1189,6 +1205,8 @@ static int load_image_lzo(struct swap_map_handle *handle, } memset(crc, 0, offsetof(struct crc_data, go)); + clean_pages_on_decompress = true; + /* * Start the decompression threads. 
*/ diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 85405bdcf2b3..abb0042a427b 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,2 +1,3 @@ obj-y = printk.o +obj-$(CONFIG_PRINTK_NMI) += nmi.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h new file mode 100644 index 000000000000..7fd2838fa417 --- /dev/null +++ b/kernel/printk/internal.h @@ -0,0 +1,57 @@ +/* + * internal.h - printk internal definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#include <linux/percpu.h> + +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); + +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); + +#ifdef CONFIG_PRINTK_NMI + +extern raw_spinlock_t logbuf_lock; + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it temporary stores the strings into a per-CPU buffer. + * The alternative implementation is chosen transparently + * via per-CPU variable. 
+ */ +DECLARE_PER_CPU(printk_func_t, printk_func); +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +extern atomic_t nmi_message_lost; +static inline int get_nmi_message_lost(void) +{ + return atomic_xchg(&nmi_message_lost, 0); +} + +#else /* CONFIG_PRINTK_NMI */ + +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +static inline int get_nmi_message_lost(void) +{ + return 0; +} + +#endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c new file mode 100644 index 000000000000..b69eb8a2876f --- /dev/null +++ b/kernel/printk/nmi.c @@ -0,0 +1,260 @@ +/* + * nmi.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/preempt.h> +#include <linux/spinlock.h> +#include <linux/debug_locks.h> +#include <linux/smp.h> +#include <linux/cpumask.h> +#include <linux/irq_work.h> +#include <linux/printk.h> + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. 
+ * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_nmi_irq_ready; +atomic_t nmi_message_lost; + +#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - sizeof(struct irq_work)) + +struct nmi_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[NMI_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + if (len >= sizeof(s->buffer)) { + atomic_inc(&nmi_message_lost); + return 0; + } + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. + * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_nmi_irq_ready) { + /* Make sure that IRQ work is really initialized. 
*/ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. + */ + if (in_nmi()) + printk_deferred("%.*s", (end - start) + 1, buf); + else + printk("%.*s", (end - start) + 1, buf); + +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + unsigned long flags; + size_t len, size; + int i, last_i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. + */ + if (i && i >= len) + pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", + i, len); + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + + size = min(len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. 
*/ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + print_nmi_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + print_nmi_seq_line(s, last_i, size - 1); + pr_cont("\n"); + } + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_nmi_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +/** + * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_nmi_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_nmi_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. + */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_nmi_flush(); +} + +void __init printk_nmi_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } + + /* Make sure that IRQ works are initialized before enabling. 
*/ + smp_wmb(); + printk_nmi_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. */ + printk_nmi_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c963ba534a78..60cdf6386763 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #include "console_cmdline.h" #include "braille.h" +#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -244,7 +245,7 @@ __packed __aligned(4) * within the scheduler's rq lock. It must be released before calling * console_unlock() or anything else that might wake up a process. */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); +DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); @@ -367,16 +368,20 @@ static int logbuf_has_space(u32 msg_size, bool empty) static int log_make_free_space(u32 msg_size) { - while (log_first_seq < log_next_seq) { - if (logbuf_has_space(msg_size, false)) - return 0; + while (log_first_seq < log_next_seq && + !logbuf_has_space(msg_size, false)) { /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } + if (clear_seq < log_first_seq) { + clear_seq = log_first_seq; + clear_idx = log_first_idx; + } + /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, true)) + if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) return 0; return -ENOMEM; @@ -854,6 +859,7 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); + VMCOREINFO_SYMBOL(clear_idx); VMCOREINFO_SYMBOL(log_next_idx); /* * Export struct printk_log size and field offsets. 
User space tools can @@ -1216,12 +1222,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u32 idx; enum log_flags prev; - if (clear_seq < log_first_seq) { - /* messages are gone, move to first available one */ - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. @@ -1483,58 +1483,6 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if (con->flags & CON_ANYTIME) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. - */ -static inline int can_use_console(unsigned int cpu) -{ - return cpu_online(cpu) || have_callable_console(); -} - -/* - * Try to get console ownership to actually show the kernel - * messages from a 'printk'. Return true (and with the - * console_lock held, and 'console_locked' set) if it - * is successful, false otherwise. - */ -static int console_trylock_for_printk(void) -{ - unsigned int cpu = smp_processor_id(); - - if (!console_trylock()) - return 0; - /* - * If we can't use the console, we need to release the console - * semaphore by hand to avoid flushing the buffer. We need to hold the - * console semaphore in order to do this test safely. 
- */ - if (!can_use_console(cpu)) { - console_locked = 0; - up_console_sem(); - return 0; - } - return 1; -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1669,6 +1617,7 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; + int nmi_message_lost; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static unsigned int logbuf_cpu = UINT_MAX; @@ -1681,7 +1630,6 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - /* This stops the holder of console_sem just where we want him */ local_irq_save(flags); this_cpu = smp_processor_id(); @@ -1705,6 +1653,7 @@ asmlinkage int vprintk_emit(int facility, int level, } lockdep_off(); + /* This stops the holder of console_sem just where we want him */ raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; @@ -1719,6 +1668,15 @@ asmlinkage int vprintk_emit(int facility, int level, strlen(recursion_msg)); } + nmi_message_lost = get_nmi_message_lost(); + if (unlikely(nmi_message_lost)) { + text_len = scnprintf(textbuf, sizeof(textbuf), + "BAD LUCK: lost %d message(s) from NMI context!", + nmi_message_lost); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, textbuf, text_len); + } + /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. @@ -1810,20 +1768,12 @@ asmlinkage int vprintk_emit(int facility, int level, if (!in_sched) { lockdep_off(); /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console - */ - preempt_disable(); - - /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. 
*/ - if (console_trylock_for_printk()) + if (console_trylock()) console_unlock(); - preempt_enable(); lockdep_on(); } @@ -1868,14 +1818,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/* - * This allows printk to be diverted to another function per cpu. - * This is useful for calling printk functions from within NMI - * without worrying about race conditions that can lock up the - * box. - */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; - /** * printk - print a kernel message * @fmt: format string @@ -1899,21 +1841,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; @@ -2174,7 +2106,20 @@ int console_trylock(void) return 0; } console_locked = 1; - console_may_schedule = 0; + /* + * When PREEMPT_COUNT disabled we can't reliably detect if it's + * safe to schedule (e.g. calling printk while holding a spin_lock), + * because preempt_disable()/preempt_enable() are just barriers there + * and preempt_count() is always 0. + * + * RCU read sections have a separate preemption counter when + * PREEMPT_RCU enabled thus we must take extra care and check + * rcu_preempt_depth(), otherwise RCU read sections modify + * preempt_count(). + */ + console_may_schedule = !oops_in_progress && + preemptible() && + !rcu_preempt_depth(); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2184,6 +2129,34 @@ int is_console_locked(void) return console_locked; } +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. 
Requires console_sem. + */ +static int have_callable_console(void) +{ + struct console *con; + + for_each_console(con) + if ((con->flags & CON_ENABLED) && + (con->flags & CON_ANYTIME)) + return 1; + + return 0; +} + +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. + */ +static inline int can_use_console(void) +{ + return cpu_online(raw_smp_processor_id()) || have_callable_console(); +} + static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2254,9 +2227,21 @@ void console_unlock(void) do_cond_resched = console_may_schedule; console_may_schedule = 0; +again: + /* + * We released the console_sem lock, so we need to recheck if + * cpu is online and (if not) is there at least one CON_ANYTIME + * console. + */ + if (!can_use_console()) { + console_locked = 0; + up_console_sem(); + return; + } + /* flush buffered message fragment immediately to console */ console_cont_flush(text, sizeof(text)); -again: + for (;;) { struct printk_log *msg; size_t ext_len = 0; diff --git a/kernel/profile.c b/kernel/profile.c index 99513e1160e5..c2199e9901c9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -44,7 +44,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_var_t prof_cpu_mask; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); @@ -59,6 +59,7 @@ int profile_setup(char *str) if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS + force_schedstat_enabled(); prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; @@ -201,7 +202,7 @@ int profile_event_unregister(enum profile_type 
type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending * profile hits. read_profile() IPI's all cpus to request them diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2341efe7fe02..d49bfa1e53e6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,12 +73,11 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); - child->ptrace = 0; child->parent = child->real_parent; list_del_init(&child->ptrace_entry); spin_lock(&child->sighand->siglock); - + child->ptrace = 0; /* * Clear all pending traps and TRAPPING. TRAPPING should be * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. @@ -681,7 +680,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, break; #ifdef CONFIG_COMPAT - if (unlikely(is_compat_task())) { + if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); if (copy_siginfo_to_user32(uinfo, &info) || diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 61a16569ffbf..18dfc485225c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,6 +1,11 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. 
+KCOV_INSTRUMENT := n + obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c new file mode 100644 index 000000000000..3cee0d8393ed --- /dev/null +++ b/kernel/rcu/rcuperf.c @@ -0,0 +1,655 @@ +/* + * Read-Copy Update module-based performance-test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. 
McKenney <paulmck@us.ibm.com>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+#include <linux/vmalloc.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
+
+/*
+ * Message helpers.  Every message is prefixed with perf_type followed
+ * by PERF_FLAG.  The VERBOSE_ variants print only when the "verbose"
+ * module parameter is set.  PERFOUT_STRING() pastes its argument into
+ * the format string, so it must be given a string literal.
+ */
+#define PERF_FLAG "-perf:"
+#define PERFOUT_STRING(s) \
+	pr_alert("%s" PERF_FLAG s "\n", perf_type)
+#define VERBOSE_PERFOUT_STRING(s) \
+	do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0)
+#define VERBOSE_PERFOUT_ERRSTRING(s) \
+	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+
+torture_param(bool, gp_exp, true, "Use expedited GP wait primitives");
+torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nwriters, -1, "Number of RCU updater threads");
+torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
+torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+
+static char *perf_type = "rcu";
+module_param(perf_type, charp, 0444);
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
+
+/* Thread counts actually used, derived from nreaders/nwriters at init. */
+static int nrealreaders;
+static int nrealwriters;
+static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *shutdown_task;
+
+/* Per-writer duration arrays and index of the last entry each one filled. */
+static u64 **writer_durations;
+static int *writer_n_durations;
+static atomic_t n_rcu_perf_reader_started;
+static atomic_t n_rcu_perf_writer_started;
+static atomic_t n_rcu_perf_writer_finished;
+static wait_queue_head_t shutdown_wq;
+/* Timestamps (t_) and batch counts (b_) recorded by the writers. */
+static u64 t_rcu_perf_writer_started;
+static u64 t_rcu_perf_writer_finished;
+static unsigned long b_rcu_perf_writer_started;
+static unsigned long b_rcu_perf_writer_finished;
+
+/*
+ * Most recent writer activity, for debugging.  Each state has its own
+ * value so that "waiting in sync()" and "idle" can be told apart.
+ */
+static int rcu_perf_writer_state;
+#define RTWS_INIT 0
+#define RTWS_EXP_SYNC 1
+#define RTWS_SYNC 2
+#define RTWS_IDLE 3
+#define RTWS_STOPPING 4
+
+/*
+ * MAX_MEAS is the capacity of each writer's duration array; a writer
+ * counts as finished once it has passed MIN_MEAS measurements.
+ */
+#define MAX_MEAS 10000
+#define MIN_MEAS 100
+
+#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE)
+#define RCUPERF_RUNNABLE_INIT 1
+#else
+#define RCUPERF_RUNNABLE_INIT 0
+#endif
+static int perf_runnable = RCUPERF_RUNNABLE_INIT;
+module_param(perf_runnable, int, 0444);
+MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_perf_ops {
+	int ptype;		/* Flavor identifier, e.g. RCU_FLAVOR. */
+	void (*init)(void);	/* Optional; run by rcu_perf_init(). */
+	void (*cleanup)(void);	/* Optional; run by rcu_perf_cleanup(). */
+	int (*readlock)(void);	/* Returns cookie for ->readunlock(). */
+	void (*readunlock)(int idx);
+	unsigned long (*started)(void);
+	unsigned long (*completed)(void);	/* Sampled if !gp_exp. */
+	unsigned long (*exp_completed)(void);	/* Sampled if gp_exp. */
+	void (*sync)(void);	/* Writer's wait if !gp_exp. */
+	void (*exp_sync)(void);	/* Writer's wait if gp_exp. */
+	const char *name;	/* Compared against perf_type. */
+};
+
+/* Flavor selected by rcu_perf_init() from the perf_type parameter. */
+static struct rcu_perf_ops *cur_ops;
+
+/*
+ * RCU flavor: read side is rcu_read_lock()/rcu_read_unlock().
+ */
+
+/* Enter an RCU read-side critical section; the cookie is always zero. */
+static int rcu_perf_read_lock(void) __acquires(RCU)
+{
+	rcu_read_lock();
+	return 0;
+}
+
+/* Leave an RCU read-side critical section; the cookie is ignored. */
+static void rcu_perf_read_unlock(int idx) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+/* Stand-in batch counter that always reports zero. */
+static unsigned long __maybe_unused rcu_no_completed(void)
+{
+	return 0;
+}
+
+/* Shared ->init() hook: nothing to set up. */
+static void rcu_sync_perf_init(void)
+{
+}
+
+static struct rcu_perf_ops rcu_ops = {
+	.name = "rcu",
+	.ptype = RCU_FLAVOR,
+	.init = rcu_sync_perf_init,
+	/* Read side. */
+	.readlock = rcu_perf_read_lock,
+	.readunlock = rcu_perf_read_unlock,
+	/* Batch counters. */
+	.started = rcu_batches_started,
+	.completed = rcu_batches_completed,
+	.exp_completed = rcu_exp_batches_completed,
+	/* Grace-period waits. */
+	.sync = synchronize_rcu,
+	.exp_sync = synchronize_rcu_expedited,
+};
+
+/*
+ * RCU-bh flavor: read side is rcu_read_lock_bh()/rcu_read_unlock_bh().
+ */
+
+/* Enter an RCU-bh read-side critical section; the cookie is always zero. */
+static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH)
+{
+	rcu_read_lock_bh();
+	return 0;
+}
+
+/* Leave an RCU-bh read-side critical section; the cookie is ignored. */
+static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH)
+{
+	rcu_read_unlock_bh();
+}
+
+static struct rcu_perf_ops rcu_bh_ops = {
+	.name = "rcu_bh",
+	.ptype = RCU_BH_FLAVOR,
+	.init = rcu_sync_perf_init,
+	/* Read side. */
+	.readlock = rcu_bh_perf_read_lock,
+	.readunlock = rcu_bh_perf_read_unlock,
+	/* Batch counters; the expedited one is the RCU-sched counter. */
+	.started = rcu_batches_started_bh,
+	.completed = rcu_batches_completed_bh,
+	.exp_completed = rcu_exp_batches_completed_sched,
+	/* Grace-period waits. */
+	.sync = synchronize_rcu_bh,
+	.exp_sync = synchronize_rcu_bh_expedited,
+};
+
+/*
+ * Definitions for srcu perf testing.
+ */
+
+/* The srcu_struct under test, reached through srcu_ctlp. */
+DEFINE_STATIC_SRCU(srcu_ctl_perf);
+static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf;
+
+/* Enter an SRCU read-side critical section; returns the SRCU index. */
+static int srcu_perf_read_lock(void) __acquires(srcu_ctlp)
+{
+	return srcu_read_lock(srcu_ctlp);
+}
+
+/* Leave the SRCU read-side critical section identified by idx. */
+static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp)
+{
+	srcu_read_unlock(srcu_ctlp, idx);
+}
+
+/* Batch counter of the srcu_struct under test. */
+static unsigned long srcu_perf_completed(void)
+{
+	return srcu_batches_completed(srcu_ctlp);
+}
+
+/* Normal grace-period wait on the srcu_struct under test. */
+static void srcu_perf_synchronize(void)
+{
+	synchronize_srcu(srcu_ctlp);
+}
+
+/* Expedited grace-period wait on the srcu_struct under test. */
+static void srcu_perf_synchronize_expedited(void)
+{
+	synchronize_srcu_expedited(srcu_ctlp);
+}
+
+static struct rcu_perf_ops srcu_ops = {
+	.name = "srcu",
+	.ptype = SRCU_FLAVOR,
+	.init = rcu_sync_perf_init,
+	/* Read side. */
+	.readlock = srcu_perf_read_lock,
+	.readunlock = srcu_perf_read_unlock,
+	/* Batch counters; one counter serves both modes, no ->started. */
+	.started = NULL,
+	.completed = srcu_perf_completed,
+	.exp_completed = srcu_perf_completed,
+	/* Grace-period waits. */
+	.sync = srcu_perf_synchronize,
+	.exp_sync = srcu_perf_synchronize_expedited,
+};
+
+/*
+ * RCU-sched flavor: read side is a preempt-disabled region.
+ */
+
+/* Begin a preempt-disabled region; the cookie is always zero. */
+static int sched_perf_read_lock(void)
+{
+	preempt_disable();
+	return 0;
+}
+
+/* End the preempt-disabled region; the cookie is ignored. */
+static void sched_perf_read_unlock(int idx)
+{
+	preempt_enable();
+}
+
+static struct rcu_perf_ops sched_ops = {
+	.name = "sched",
+	.ptype = RCU_SCHED_FLAVOR,
+	.init = rcu_sync_perf_init,
+	/* Read side. */
+	.readlock = sched_perf_read_lock,
+	.readunlock = sched_perf_read_unlock,
+	/* Batch counters. */
+	.started = rcu_batches_started_sched,
+	.completed = rcu_batches_completed_sched,
+	.exp_completed = rcu_exp_batches_completed_sched,
+	/* Grace-period waits. */
+	.sync = synchronize_sched,
+	.exp_sync = synchronize_sched_expedited,
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks perf testing.
+ */
+
+/* The tasks read-side hooks do nothing; the cookie is always zero. */
+static int tasks_perf_read_lock(void)
+{
+	return 0;
+}
+
+static void tasks_perf_read_unlock(int idx)
+{
+}
+
+/*
+ * rcu_perf_writer() calls ->exp_completed() whenever gp_exp is set
+ * (the default), so it must not be left NULL.  The tasks flavor has no
+ * batch counters, so all three counter hooks report zero.
+ */
+static struct rcu_perf_ops tasks_ops = {
+	.ptype = RCU_TASKS_FLAVOR,
+	.init = rcu_sync_perf_init,
+	.readlock = tasks_perf_read_lock,
+	.readunlock = tasks_perf_read_unlock,
+	.started = rcu_no_completed,
+	.completed = rcu_no_completed,
+	.exp_completed = rcu_no_completed,
+	.sync = synchronize_rcu_tasks,
+	.exp_sync = synchronize_rcu_tasks,
+	.name = "tasks"
+};
+
+#define RCUPERF_TASKS_OPS &tasks_ops,
+
+/* True if the tasks flavor is the one currently selected. */
+static bool __maybe_unused torturing_tasks(void)
+{
+	return cur_ops == &tasks_ops;
+}
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+
+#define RCUPERF_TASKS_OPS
+
+/* Without CONFIG_TASKS_RCU the tasks flavor can never be selected. */
+static bool __maybe_unused torturing_tasks(void)
+{
+	return false;
+}
+
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
+/*
+ * If performance tests complete, wait for shutdown to commence.
+ *
+ * Returns immediately while fewer than nrealwriters writers have
+ * finished; after that, sleeps one jiffy at a time until
+ * torture_must_stop() reports true.
+ */
+static void rcu_perf_wait_shutdown(void)
+{
+	cond_resched_rcu_qs();
+	if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters)
+		return;
+	while (!torture_must_stop())
+		schedule_timeout_uninterruptible(1);
+}
+
+/*
+ * RCU perf reader kthread.  Repeatedly does empty RCU read-side
+ * critical section, minimizing update-side interference.
+ *
+ * @arg carries the reader's index, which selects the CPU it is bound
+ * to (index modulo nr_cpu_ids).  Always returns 0.
+ */
+static int
+rcu_perf_reader(void *arg)
+{
+	unsigned long flags;
+	int idx;
+	long me = (long)arg;
+
+	VERBOSE_PERFOUT_STRING("rcu_perf_reader task started");
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+	/* rcu_perf_init() waits for this count to reach nrealreaders. */
+	atomic_inc(&n_rcu_perf_reader_started);
+
+	do {
+		local_irq_save(flags);
+		idx = cur_ops->readlock();
+		cur_ops->readunlock(idx);
+		local_irq_restore(flags);
+		rcu_perf_wait_shutdown();
+	} while (!torture_must_stop());
+	torture_kthread_stopping("rcu_perf_reader");
+	return 0;
+}
+
+/*
+ * RCU perf writer kthread.  Repeatedly does a grace period.
+ */
+static int
+rcu_perf_writer(void *arg)
+{
+	int i = 0;
+	int i_max = 0;
+	long me = (long)arg;
+	struct sched_param sp;
+	bool started = false, done = false, alldone = false;
+	u64 t;
+	u64 *wdp;
+	u64 *wdpp = writer_durations[me];
+
+	VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
+	WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
+	WARN_ON(rcu_gp_is_normal() && gp_exp);
+	WARN_ON(!wdpp);
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	sp.sched_priority = 1;
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+
+	if (holdoff)
+		schedule_timeout_uninterruptible(holdoff * HZ);
+
+	/* The last writer to start records the start time and batch count. */
+	t = ktime_get_mono_fast_ns();
+	if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
+		t_rcu_perf_writer_started = t;
+		if (gp_exp) {
+			b_rcu_perf_writer_started =
+				cur_ops->exp_completed() / 2;
+		} else {
+			b_rcu_perf_writer_started =
+				cur_ops->completed();
+		}
+	}
+
+	do {
+		/* Time one grace-period wait into slot i. */
+		wdp = &wdpp[i];
+		*wdp = ktime_get_mono_fast_ns();
+		if (gp_exp) {
+			rcu_perf_writer_state = RTWS_EXP_SYNC;
+			cur_ops->exp_sync();
+		} else {
+			rcu_perf_writer_state = RTWS_SYNC;
+			cur_ops->sync();
+		}
+		rcu_perf_writer_state = RTWS_IDLE;
+		t = ktime_get_mono_fast_ns();
+		*wdp = t - *wdp;
+		i_max = i;
+		if (!started &&
+		    atomic_read(&n_rcu_perf_writer_started) >= nrealwriters)
+			started = true;
+		if (!done && i >= MIN_MEAS) {
+			done = true;
+			sp.sched_priority = 0;
+			sched_setscheduler_nocheck(current,
+						   SCHED_NORMAL, &sp);
+			pr_alert("%s" PERF_FLAG
+				 "rcu_perf_writer %ld has %d measurements\n",
+				 perf_type, me, MIN_MEAS);
+			/* The last writer to finish records the end stats. */
+			if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
+			    nrealwriters) {
+				schedule_timeout_interruptible(10);
+				rcu_ftrace_dump(DUMP_ALL);
+				PERFOUT_STRING("Test complete");
+				t_rcu_perf_writer_finished = t;
+				if (gp_exp) {
+					b_rcu_perf_writer_finished =
+						cur_ops->exp_completed() / 2;
+				} else {
+					b_rcu_perf_writer_finished =
+						cur_ops->completed();
+				}
+				if (shutdown) {
+					smp_mb(); /* Assign before wake. */
+					wake_up(&shutdown_wq);
+				}
+			}
+		}
+		if (done && !alldone &&
+		    atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters)
+			alldone = true;
+		/*
+		 * Advance to the next slot only once all writers have
+		 * started and not all have finished, and never past the
+		 * end of the MAX_MEAS-entry array.
+		 */
+		if (started && !alldone && i < MAX_MEAS - 1)
+			i++;
+		rcu_perf_wait_shutdown();
+	} while (!torture_must_stop());
+	rcu_perf_writer_state = RTWS_STOPPING;
+	writer_n_durations[me] = i_max;
+	torture_kthread_stopping("rcu_perf_writer");
+	return 0;
+}
+
+/* Print the effective test parameters, labelled with @tag. */
+static inline void
+rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag)
+{
+	pr_alert("%s" PERF_FLAG
+		 "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
+		 perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
+}
+
+/*
+ * Stop all kthreads, print the collected statistics, and free the
+ * bookkeeping arrays.  Used as the module exit handler, as the unwind
+ * path of rcu_perf_init(), and by the shutdown kthread.
+ */
+static void
+rcu_perf_cleanup(void)
+{
+	int i;
+	int j;
+	int ngps = 0;
+	u64 *wdp;
+	u64 *wdpp;
+
+	if (torture_cleanup_begin())
+		return;
+
+	if (reader_tasks) {
+		for (i = 0; i < nrealreaders; i++)
+			torture_stop_kthread(rcu_perf_reader,
+					     reader_tasks[i]);
+		kfree(reader_tasks);
+	}
+
+	if (writer_tasks) {
+		for (i = 0; i < nrealwriters; i++) {
+			torture_stop_kthread(rcu_perf_writer,
+					     writer_tasks[i]);
+			if (!writer_n_durations)
+				continue;
+			j = writer_n_durations[i];
+			pr_alert("%s%s writer %d gps: %d\n",
+				 perf_type, PERF_FLAG, i, j);
+			ngps += j;
+		}
+		pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+			 perf_type, PERF_FLAG,
+			 t_rcu_perf_writer_started, t_rcu_perf_writer_finished,
+			 t_rcu_perf_writer_finished -
+			 t_rcu_perf_writer_started,
+			 ngps,
+			 b_rcu_perf_writer_finished -
+			 b_rcu_perf_writer_started);
+		for (i = 0; i < nrealwriters; i++) {
+			if (!writer_durations)
+				break;
+			if (!writer_n_durations)
+				continue;
+			wdpp = writer_durations[i];
+			if (!wdpp)
+				continue;
+			for (j = 0; j <= writer_n_durations[i]; j++) {
+				wdp = &wdpp[j];
+				pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+					 perf_type, PERF_FLAG,
+					 i, j, *wdp);
+				/* Pause every 100 lines of output. */
+				if (j % 100 == 0)
+					schedule_timeout_uninterruptible(1);
+			}
+			kfree(writer_durations[i]);
+		}
+		kfree(writer_tasks);
+	}
+	/*
+	 * Free these outside the writer_tasks check so they are not
+	 * leaked when only the writer_tasks allocation failed.
+	 */
+	kfree(writer_durations);
+	kfree(writer_n_durations);
+
+	/* Do flavor-specific cleanup operations. */
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
+
+	torture_cleanup_end();
+}
+
+/*
+ * Return the number if non-negative.  If -1, the number of CPUs.
+ * If less than -1, that much less than the number of CPUs, but
+ * at least one.
+ */
+static int compute_real(int n)
+{
+	int nr;
+
+	if (n >= 0) {
+		nr = n;
+	} else {
+		nr = num_online_cpus() + 1 + n;
+		if (nr <= 0)
+			nr = 1;
+	}
+	return nr;
+}
+
+/*
+ * RCU perf shutdown kthread.  Just waits to be awakened, then shuts
+ * down system.
+ */
+static int
+rcu_perf_shutdown(void *arg)
+{
+	do {
+		wait_event(shutdown_wq,
+			   atomic_read(&n_rcu_perf_writer_finished) >=
+			   nrealwriters);
+	} while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
+	smp_mb(); /* Wake before output. */
+	rcu_perf_cleanup();
+	kernel_power_off();
+	return -EINVAL;
+}
+
+/*
+ * Module init: select the flavor named by perf_type, then create the
+ * optional shutdown kthread, the readers, and the writers.  Returns 0
+ * on success, -EBUSY if torture_init_begin() refuses, or a negative
+ * errno after undoing everything through rcu_perf_cleanup().
+ */
+static int __init
+rcu_perf_init(void)
+{
+	long i;
+	int firsterr = 0;
+	static struct rcu_perf_ops *perf_ops[] = {
+		&rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+		RCUPERF_TASKS_OPS
+	};
+
+	if (!torture_init_begin(perf_type, verbose, &perf_runnable))
+		return -EBUSY;
+
+	/* Process args and tell the world that the perf'er is on the job. */
+	for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
+		cur_ops = perf_ops[i];
+		if (strcmp(perf_type, cur_ops->name) == 0)
+			break;
+	}
+	if (i == ARRAY_SIZE(perf_ops)) {
+		pr_alert("rcu-perf: invalid perf type: \"%s\"\n",
+			 perf_type);
+		/* List the valid types on one line via pr_cont(). */
+		pr_alert("rcu-perf types:");
+		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
+			pr_cont(" %s", perf_ops[i]->name);
+		pr_cont("\n");
+		firsterr = -EINVAL;
+		goto unwind;
+	}
+	if (cur_ops->init)
+		cur_ops->init();
+
+	nrealwriters = compute_real(nwriters);
+	nrealreaders = compute_real(nreaders);
+	atomic_set(&n_rcu_perf_reader_started, 0);
+	atomic_set(&n_rcu_perf_writer_started, 0);
+	atomic_set(&n_rcu_perf_writer_finished, 0);
+	rcu_perf_print_module_parms(cur_ops, "Start of test");
+
+	/* Start up the kthreads. */
+
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(rcu_perf_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+	reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (reader_tasks == NULL) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealreaders; i++) {
+		firsterr = torture_create_kthread(rcu_perf_reader, (void *)i,
+						  reader_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+	/* Wait for every reader to be running before starting writers. */
+	while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders)
+		schedule_timeout_uninterruptible(1);
+	writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]),
+			       GFP_KERNEL);
+	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
+				   GFP_KERNEL);
+	writer_n_durations =
+		kcalloc(nrealwriters, sizeof(*writer_n_durations),
+			GFP_KERNEL);
+	if (!writer_tasks || !writer_durations || !writer_n_durations) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealwriters; i++) {
+		writer_durations[i] =
+			kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
+				GFP_KERNEL);
+		if (!writer_durations[i]) {
+			/* Report the failure instead of returning 0. */
+			VERBOSE_PERFOUT_ERRSTRING("out of memory");
+			firsterr = -ENOMEM;
+			goto unwind;
+		}
+		firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
+						  writer_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	rcu_perf_cleanup();
+	return firsterr;
+}
+
+module_init(rcu_perf_init);
+module_exit(rcu_perf_cleanup);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d2988d047d66..084a28a732eb 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current;
 static unsigned long rcu_torture_current_version;
 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
 static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
-
rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -918,7 +916,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { - bool can_expedite = !rcu_gp_is_expedited(); + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; @@ -932,12 +930,14 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Testing of dynamic grace-period expediting diabled.\n", - torture_type); + if (!can_expedite) { + pr_alert("%s" TORTURE_FLAG + " GP expediting controlled from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Disabled dynamic grace-period expediting.\n", + torture_type); + } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) @@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg) return 0; } -static void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. 
The @@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg) if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, started, completed); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); completed = completed - started; @@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void) rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); - rcutorture_trace_dump(); + rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; } @@ -1489,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the folloiwng ->call(). */ + local_irq_disable(); /* Just to test no-irq call_rcu(). 
*/ cur_ops->call(&rcu, rcu_torture_barrier_cbf); + local_irq_enable(); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -1596,7 +1587,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self, { long cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: (void)rcutorture_booster_init(cpu); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index e492a5253e0f..196f0302e2f4 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -23,7 +23,7 @@ */ #include <linux/kthread.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -122,18 +122,7 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutiny_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - -module_init(rcutiny_trace_init); -module_exit(rcutiny_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutiny_trace_init); static void check_cpu_stall(struct rcu_ctrlblk *rcp) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd4131f7a..c7f1bc4f817c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,13 +102,14 @@ struct rcu_state sname##_state = { \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ + .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ + .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ } RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *const rcu_state_p; -static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); /* Dump rcu_node combining tree at boot to verify correct setup. 
*/ @@ -371,6 +372,21 @@ void rcu_all_qs(void) rcu_momentary_dyntick_idle(); local_irq_restore(flags); } + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { + /* + * Yes, we just checked a per-CPU variable with preemption + * enabled, so we might be migrated to some other CPU at + * this point. That is OK because in that case, the + * migration will supply the needed quiescent state. + * We might end up needlessly disabling preemption and + * invoking rcu_sched_qs() on the destination CPU, but + * the probability and cost are both quite low, so this + * should not be a problem in practice. + */ + preempt_disable(); + rcu_sched_qs(); + preempt_enable(); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -386,9 +402,11 @@ module_param(qlowmark, long, 0444); static ulong jiffies_till_first_fqs = ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; +static bool rcu_kick_kthreads; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +module_param(rcu_kick_kthreads, bool, 0644); /* * How long the grace period must be before we start recruiting @@ -461,6 +479,28 @@ unsigned long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Return the number of RCU expedited batches completed thus far for + * debug & stats. Odd numbers mean that a batch is in progress, even + * numbers mean idle. The value returned will thus be roughly double + * the cumulative batches since boot. + */ +unsigned long rcu_exp_batches_completed(void) +{ + return rcu_state_p->expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); + +/* + * Return the number of RCU-sched expedited batches completed thus far + * for debug & stats. Similar to rcu_exp_batches_completed(). 
+ */ +unsigned long rcu_exp_batches_completed_sched(void) +{ + return rcu_sched_state.expedited_sequence; +} +EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); + +/* * Force a quiescent state. */ void rcu_force_quiescent_state(void) @@ -638,7 +678,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user) idle_task(smp_processor_id()); trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -800,7 +840,7 @@ static void rcu_eqs_exit_common(long long oldval, int user) trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ORIG); + rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ @@ -1083,13 +1123,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - return 1; - } else { if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) WRITE_ONCE(rdp->gpwrap, true); - return 0; + return 1; } + return 0; } /* @@ -1173,15 +1212,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, smp_mb(); /* ->cond_resched_completed before *rcrmp. */ WRITE_ONCE(*rcrmp, READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Enable beating. */ - } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - /* Time to beat on that CPU again! */ - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. 
*/ } + /* And if it has been a really long time, kick the CPU as well. */ + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || + ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + return 0; } @@ -1225,8 +1265,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); - if (rsp->gp_kthread) + if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); + wake_up_process(rsp->gp_kthread); + } } } @@ -1246,7 +1288,26 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) if (rnp->qsmask & (1UL << cpu)) dump_cpu_task(rnp->grplo + cpu); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(struct rcu_state *rsp) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rsp->jiffies_kick_kthreads); + if (time_after(jiffies, j) && rsp->gp_kthread) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rsp->gp_kthread); + WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); } } @@ -1261,17 +1322,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* Only let one CPU complain about others per time interval. 
*/ raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * OK, time to rat on our buddy... @@ -1292,7 +1358,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) ndetected++; } } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } print_cpu_stall_info_end(); @@ -1334,6 +1400,11 @@ static void print_cpu_stall(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); long totqlen = 0; + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(rsp); + if (rcu_cpu_stall_suppress) + return; + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.txt for info on how to debug @@ -1357,7 +1428,7 @@ static void print_cpu_stall(struct rcu_state *rsp) if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * Attempt to revive the RCU machinery by forcing a context switch. 
@@ -1378,8 +1449,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress(rsp)) return; + rcu_stall_kick_kthreads(rsp); j = jiffies; /* @@ -1595,7 +1668,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } unlock_out: if (rnp != rnp_root) - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); out: if (c_out != NULL) *c_out = c; @@ -1614,7 +1687,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_gp(rnp, rdp, c, @@ -1635,7 +1707,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - wake_up(&rsp->gp_wq); + swake_up(&rsp->gp_wq); } /* @@ -1815,7 +1887,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } needwake = __note_gp_changes(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1840,7 +1912,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1850,7 +1922,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * Grace period already in progress, don't start another. * Not supposed to be able to happen. 
*/ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } @@ -1859,7 +1931,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Record GP times before starting GP, hence smp_store_release(). */ smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Apply per-leaf buffered online and offline operations to the @@ -1873,7 +1945,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); continue; } @@ -1907,7 +1979,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -1938,7 +2010,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -1996,7 +2068,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } } @@ -2010,6 +2082,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + struct swait_queue_head *sq; WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -2025,7 +2098,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. 
*/ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Propagate new ->completed value to rcu_node structures so @@ -2046,7 +2119,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); - raw_spin_unlock_irq(&rnp->lock); + sq = rcu_nocb_gp_get(rnp); + raw_spin_unlock_irq_rcu_node(rnp); + rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); @@ -2068,7 +2143,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -2092,7 +2167,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; @@ -2116,13 +2191,16 @@ static int __noreturn rcu_gp_kthread(void *arg) } ret = 0; for (;;) { - if (!ret) + if (!ret) { rsp->jiffies_force_qs = jiffies + j; + WRITE_ONCE(rsp->jiffies_kick_kthreads, + jiffies + 3 * j); + } trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2143,6 +2221,15 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqsend")); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); + ret = 0; /* Force full wait till next FQS. 
*/ + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } else { /* Deal with stray signal. */ cond_resched_rcu_qs(); @@ -2151,14 +2238,12 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; + ret = 1; /* Keep old FQS timing. */ + j = jiffies; + if (time_after(jiffies, rsp->jiffies_force_qs)) + j = 1; + else + j = rsp->jiffies_force_qs - j; } } @@ -2234,19 +2319,21 @@ static bool rcu_start_gp(struct rcu_state *rsp) } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, which - * is released before return. + * Report a full set of quiescent states to the specified rcu_state data + * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period + * kthread if another grace period is required. Whether we wake + * the grace-period kthread or it awakens itself for the next round + * of quiescent-state forcing, that kthread will clean up after the + * just-completed grace period. Note that the caller must hold rnp->lock, + * which is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - rcu_gp_kthread_wake(rsp); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. 
*/ } /* @@ -2275,7 +2362,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * Our bit has already been cleared, or the * relevant grace period is already over, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ @@ -2287,7 +2374,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rnp->grpmask; @@ -2297,7 +2384,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -2329,7 +2416,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } @@ -2346,19 +2433,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* Report up the rest of the hierarchy, tracking current ->gpnum. */ gps = rnp->gpnum; mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } /* * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. 
This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! + * structure. This must be called from the specified CPU. */ static void rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) @@ -2383,14 +2465,14 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = 0; + rdp->core_needs_qs = false; /* * This GP can't end until cpu checks in, so all of our @@ -2599,36 +2681,15 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } } /* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. - */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. 
*/ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also @@ -2859,7 +2920,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } } @@ -2895,12 +2956,12 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - rcu_gp_kthread_wake(rsp); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2925,7 +2986,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. 
*/ needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3016,7 +3077,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3399,8 +3460,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + unsigned long s; + smp_mb(); /* Caller's modifications seen first by other CPUs. */ - return rcu_seq_snap(&rsp->expedited_sequence); + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) { @@ -3436,14 +3501,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; /* No new CPUs, nothing to do. */ } /* Update this node's mask, track old value for propagation. */ oldmask = rnp->expmaskinit; rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* If was already nonzero, nothing to propagate. 
*/ if (oldmask) @@ -3458,7 +3523,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); if (done) break; mask = rnp_up->grpmask; @@ -3481,7 +3546,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -3492,7 +3557,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -3508,8 +3573,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the root rcu_node's exp_funnel_mutex and the - * specified rcu_node structure's ->lock. + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -3522,19 +3587,19 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); break; } if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). 
*/ - wake_up(&rsp->expedited_wq); + swake_up(&rsp->expedited_wq); } break; } mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); @@ -3546,7 +3611,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -3559,8 +3624,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the root - * rcu_node's exp_funnel_mutex. + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -3569,7 +3634,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } rnp->expmask &= ~mask; @@ -3578,7 +3643,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified rcu_data (CPU). - * Caller must hold the root rcu_node's exp_funnel_mutex. 
*/ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake) @@ -3587,15 +3651,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp, - atomic_long_t *stat, unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { - if (rnp) - mutex_unlock(&rnp->exp_funnel_mutex); - else if (rdp) - mutex_unlock(&rdp->exp_funnel_mutex); + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ atomic_long_inc(stat); @@ -3605,59 +3665,65 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, } /* - * Funnel-lock acquisition for expedited grace periods. Returns a - * pointer to the root rcu_node structure, or NULL if some other - * task did the expedited grace period for us. + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. */ -static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp0; - struct rcu_node *rnp1 = NULL; + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. 
*/ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + !mutex_is_locked(&rsp->exp_mutex) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; /* - * First try directly acquiring the root lock in order to reduce - * latency in the common case where expedited grace periods are - * rare. We check mutex_is_locked() to avoid pathological levels of - * memory contention on ->exp_funnel_mutex in the heavy-load case. + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. */ - rnp0 = rcu_get_root(rsp); - if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { - if (mutex_trylock(&rnp0->exp_funnel_mutex)) { - if (sync_exp_work_done(rsp, rnp0, NULL, - &rdp->expedited_workdone0, s)) - return NULL; - return rnp0; + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); } - - /* - * Each pass through the following loop works its way - * up the rcu_node tree, returning if others have done the - * work or otherwise falls through holding the root rnp's - * ->exp_funnel_mutex. 
The mapping from CPU to rcu_node structure - * can be inexact, as it is just promoting locality and is not - * strictly needed for correctness. - */ - if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) - return NULL; - mutex_lock(&rdp->exp_funnel_mutex); - rnp0 = rdp->mynode; - for (; rnp0 != NULL; rnp0 = rnp0->parent) { - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone2, s)) - return NULL; - mutex_lock(&rnp0->exp_funnel_mutex); - if (rnp1) - mutex_unlock(&rnp1->exp_funnel_mutex); - else - mutex_unlock(&rdp->exp_funnel_mutex); - rnp1 = rnp0; + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; } - if (sync_exp_work_done(rsp, rnp1, rdp, - &rdp->expedited_workdone3, s)) - return NULL; - return rnp1; + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; } /* Invoked on each online non-idle CPU for expedited quiescent state. */ @@ -3672,6 +3738,11 @@ static void sync_sched_exp_handler(void *data) if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); resched_cpu(smp_processor_id()); } @@ -3730,7 +3801,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, */ if (rcu_preempt_has_tasks(rnp)) rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. 
*/ mask = 1; @@ -3747,7 +3818,7 @@ retry_ipi: raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); if (cpu_online(cpu) && (rnp->expmask & mask)) @@ -3756,7 +3827,7 @@ retry_ipi: } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3780,7 +3851,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = wait_event_interruptible_timeout( + ret = swait_event_timeout( rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); @@ -3788,7 +3859,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. 
*/ - wait_event(rsp->expedited_wq, + swait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root)); return; } @@ -3796,7 +3867,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->name); ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - ndetected = rcu_print_task_exp_stall(rnp); + ndetected += rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; @@ -3806,7 +3877,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, - "O."[cpu_online(cpu)], + "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); } @@ -3815,7 +3886,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rsp->expedited_sequence, rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (!ndetected) { + if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) @@ -3841,6 +3912,41 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } } +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. 
+ */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3860,7 +3966,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) void synchronize_sched_expedited(void) { unsigned long s; - struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; /* If only one CPU, this is automatically a grace period. */ @@ -3875,17 +3980,14 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); + /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp->exp_funnel_mutex); + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -4163,7 +4265,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. 
*/ } } @@ -4185,9 +4287,8 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; - mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4215,7 +4316,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4236,7 +4337,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void rcu_prepare_cpu(int cpu) @@ -4247,6 +4348,46 @@ static void rcu_prepare_cpu(int cpu) rcu_init_percpu_data(cpu, rsp); } +#ifdef CONFIG_HOTPLUG_CPU +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. 
*/ + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +void rcu_report_dead(unsigned int cpu) +{ + struct rcu_state *rsp; + + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_idle_cpu(cpu, rsp); +} +#endif + /* * Handle CPU online/offline notification events. */ @@ -4278,17 +4419,6 @@ int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; - case CPU_DYING_IDLE: - /* QS for any half-done expedited RCU-sched GP. */ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dying_idle_cpu(cpu, rsp); - } - break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: @@ -4358,7 +4488,7 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); } rcu_spawn_nocb_kthreads(); @@ -4414,10 +4544,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; - static const char * const exp[] = RCU_EXP_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. 
*/ @@ -4449,8 +4577,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, + raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); + lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, @@ -4476,14 +4604,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); - mutex_init(&rnp->exp_funnel_mutex); - lockdep_set_class_and_name(&rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + init_waitqueue_head(&rnp->exp_wq[0]); + init_waitqueue_head(&rnp->exp_wq[1]); + init_waitqueue_head(&rnp->exp_wq[2]); + init_waitqueue_head(&rnp->exp_wq[3]); + spin_lock_init(&rnp->exp_lock); } } - init_waitqueue_head(&rsp->gp_wq); - init_waitqueue_head(&rsp->expedited_wq); + init_swait_queue_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 83360b4f4352..e3959f5e6ddf 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/swait.h> #include <linux/stop_machine.h> /* @@ -69,7 +70,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -78,7 +78,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -# define RCU_EXP_NAME_INIT { 
"rcu_node_exp_0", "rcu_node_exp_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -88,7 +87,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -99,7 +97,6 @@ # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -149,8 +146,9 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - raw_spinlock_t lock; /* Root rcu_node's lock protects some */ - /* rcu_state fields as well as following. */ + raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ + /* some rcu_state fields as well as */ + /* following. */ unsigned long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ @@ -243,14 +241,16 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; + struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; /* Counts of upcoming no-CB GP requests. 
*/ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; - struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; + spinlock_t exp_lock ____cacheline_internodealigned_in_smp; + unsigned long exp_seq_rq; + wait_queue_head_t exp_wq[4]; } ____cacheline_internodealigned_in_smp; /* @@ -385,11 +385,9 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - struct mutex exp_funnel_mutex; - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ + atomic_long_t exp_workdone1; /* # done by others #1. */ + atomic_long_t exp_workdone2; /* # done by others #2. */ + atomic_long_t exp_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -399,7 +397,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -478,7 +476,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ @@ -503,14 +501,18 @@ struct rcu_state { /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + struct mutex exp_mutex; /* Serialize expedited GP. 
*/ + struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ - wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ + unsigned long jiffies_kick_kthreads; /* Time at which to kick */ + /* kthreads, if configured. */ unsigned long n_force_qs; /* Number of calls to */ /* force_quiescent_state(). */ unsigned long n_force_qs_lh; /* ~Number of calls leaving */ @@ -621,7 +623,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); @@ -680,7 +683,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_PPC */ /* - * Wrappers for the rcu_node::lock acquire. + * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe * different lock values, this in turn means that an UNLOCK of one level @@ -689,29 +692,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) * * In order to restore full ordering between tree levels, augment the regular * lock acquire functions with smp_mb__after_unlock_lock(). 
+ * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. */ static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) { - raw_spin_lock(&rnp->lock); + raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } +static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); +} + static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) { - raw_spin_lock_irq(&rnp->lock); + raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&(rnp)->lock, flags); \ - smp_mb__after_unlock_lock(); \ +static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ } while (0) static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) { - bool locked = raw_spin_trylock(&rnp->lock); + bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); if (locked) smp_mb__after_unlock_lock(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9467a8b7e756..ff1cd4e1188d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); /* rrupts 
remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Unboost if we were boosted. */ @@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -722,18 +722,22 @@ static void sync_rcu_exp_handler(void *info) * synchronize_rcu_expedited - Brute-force RCU grace period * * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. 
On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a great improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then use a single synchronize_rcu() + * instead. */ void synchronize_rcu_expedited(void) { - struct rcu_node *rnp; - struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; unsigned long s; @@ -744,23 +748,14 @@ void synchronize_rcu_expedited(void) } s = rcu_exp_gp_seq_snap(rsp); - - rnp_unlock = exp_funnel_lock(rsp, s); - if (rnp_unlock == NULL) + if (exp_funnel_lock(rsp, s)) return; /* Someone else did our work for us. */ - rcu_exp_gp_seq_start(rsp); - /* Initialize the rcu_node tree in preparation for the wait. */ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); - synchronize_sched_expedited_wait(rsp); - - /* Clean up and exit. */ - rcu_exp_gp_seq_end(rsp); - mutex_unlock(&rnp_unlock->exp_funnel_mutex); + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); @@ -807,7 +802,6 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. @@ -991,7 +985,7 @@ static int rcu_boost(struct rcu_node *rnp) * might exit their RCU read-side critical sections on their own.
*/ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } @@ -1028,7 +1022,7 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ @@ -1088,7 +1082,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } if (rnp->exp_tasks != NULL || @@ -1098,13 +1092,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) ULONG_CMP_GE(jiffies, rnp->boost_time))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); t = rnp->boost_kthread_task; if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1172,7 +1166,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return PTR_ERR(t); raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. 
*/ @@ -1308,7 +1302,7 @@ static void rcu_prepare_kthreads(int cpu) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void invoke_rcu_callbacks_kthread(void) @@ -1559,7 +1553,7 @@ static void rcu_prepare_for_idle(void) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1811,9 +1805,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + swake_up_all(sq); } /* @@ -1829,10 +1823,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1857,7 +1856,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. 
*/ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - wake_up(&rdp_leader->nocb_wq); + swake_up(&rdp_leader->nocb_wq); } } @@ -2059,7 +2058,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2069,7 +2068,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - wait_event_interruptible( + swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2097,7 +2096,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); - wait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { @@ -2172,7 +2171,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - wake_up(&rdp->nocb_wq); + swake_up(&rdp->nocb_wq); } } @@ -2193,7 +2192,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); - wait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! 
*/ @@ -2352,7 +2351,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } @@ -2502,7 +2501,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2510,6 +2509,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 1088e64f01ad..86782f9a4604 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v) int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + unsigned long s1 = 0, s2 = 0, s3 = 0; for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->expedited_workdone0); - s1 += atomic_long_read(&rdp->expedited_workdone1); - s2 += atomic_long_read(&rdp->expedited_workdone2); - s3 += atomic_long_read(&rdp->expedited_workdone3); + s1 += atomic_long_read(&rdp->exp_workdone1); + s2 += atomic_long_read(&rdp->exp_workdone2); + s3 += atomic_long_read(&rdp->exp_workdone3); } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, + seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); diff --git a/kernel/rcu/update.c 
b/kernel/rcu/update.c index 76b94e19430b..3e888cd5a594 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,7 @@ static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); #endif /* #ifndef CONFIG_TINY_RCU */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +#ifdef CONFIG_DEBUG_LOCK_ALLOC /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? * @@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void) return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); + return lockdep_opinion || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif @@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void) { return READ_ONCE(rcu_normal); } +EXPORT_SYMBOL_GPL(rcu_gp_is_normal); static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); @@ -379,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head) debug_object_free(head, &rcuhead_debug_descr); } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu(). - */ -static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_is_static_object(void *addr) { - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker. 
- */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return 0; - default: - return 1; - } + return true; } /** @@ -439,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_activate = rcuhead_fixup_activate, + .is_static_object = rcuhead_is_static_object, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/relay.c b/kernel/relay.c index 074994bcfa9b..04d7cf3ef8cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -614,6 +614,7 @@ free_bufs: kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); + kfree(chan); return NULL; } EXPORT_SYMBOL_GPL(relay_open); diff --git a/kernel/resource.c b/kernel/resource.c index 3669d1bfc425..9b5f04404152 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v) { struct resource *root = m->private; struct resource *r = v, *p; + unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; int depth; for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; + + if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) { + start = r->start; + end = r->end; + } else { + start = end = 0; + } + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", - width, (unsigned long long) r->start, - width, (unsigned long long) r->end, + width, start, + width, end, r->name ? 
r->name : "<BAD>"); return 0; } @@ -233,9 +242,9 @@ static struct resource * __request_resource(struct resource *root, struct resour } } -static int __release_resource(struct resource *old) +static int __release_resource(struct resource *old, bool release_child) { - struct resource *tmp, **p; + struct resource *tmp, **p, *chd; p = &old->parent->child; for (;;) { @@ -243,7 +252,17 @@ static int __release_resource(struct resource *old) if (!tmp) break; if (tmp == old) { - *p = tmp->sibling; + if (release_child || !(tmp->child)) { + *p = tmp->sibling; + } else { + for (chd = tmp->child;; chd = chd->sibling) { + chd->parent = tmp->parent; + if (!(chd->sibling)) + break; + } + *p = tmp->child; + chd->sibling = tmp->sibling; + } old->parent = NULL; return 0; } @@ -325,7 +344,7 @@ int release_resource(struct resource *old) int retval; write_lock(&resource_lock); - retval = __release_resource(old); + retval = __release_resource(old, true); write_unlock(&resource_lock); return retval; } @@ -333,13 +352,13 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem reosurce exists with-in [res->start.res->end) - * the caller must specify res->start, res->end, res->flags and "name". - * If found, returns 0, res is overwritten, if not found, returns -1. - * This walks through whole tree and not just first level children - * until and unless first_level_children_only is true. + * Finds the lowest iomem resource existing within [res->start.res->end). + * The caller must specify res->start, res->end, res->flags, and optionally + * desc. If found, returns 0, res is overwritten, if not found, returns -1. + * This function walks the whole tree and not just first level children until + * and unless first_level_children_only is true. 
*/ -static int find_next_iomem_res(struct resource *res, char *name, +static int find_next_iomem_res(struct resource *res, unsigned long desc, bool first_level_children_only) { resource_size_t start, end; @@ -358,9 +377,9 @@ static int find_next_iomem_res(struct resource *res, char *name, read_lock(&resource_lock); for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if (p->flags != res->flags) + if ((p->flags & res->flags) != res->flags) continue; - if (name && strcmp(p->name, name)) + if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; if (p->start > end) { p = NULL; @@ -385,15 +404,18 @@ static int find_next_iomem_res(struct resource *res, char *name, * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and - * name are valid candidates. + * desc are valid candidates. * - * @name: name of resource - * @flags: resource flags + * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check. + * @flags: I/O resource flags * @start: start addr * @end: end addr + * + * NOTE: For a new descriptor search, define a new IORES_DESC in + * <linux/ioport.h> and set it in 'desc' of a target resource entry. 
*/ -int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, - void *arg, int (*func)(u64, u64, void *)) +int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, + u64 end, void *arg, int (*func)(u64, u64, void *)) { struct resource res; u64 orig_end; @@ -403,23 +425,27 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, res.end = end; res.flags = flags; orig_end = res.end; + while ((res.start < res.end) && - (!find_next_iomem_res(&res, name, false))) { + (!find_next_iomem_res(&res, desc, false))) { + ret = (*func)(res.start, res.end, arg); if (ret) break; + res.start = res.end + 1; res.end = orig_end; } + return ret; } /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". This function deals with - * full ranges and not pfn. If resources are not pfn aligned, dealing - * with pfn can truncate ranges. + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * Now, this function is only for System RAM, it deals with full ranges and + * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate + * ranges. 
*/ int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) @@ -430,10 +456,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.start = start; res.end = end; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, "System RAM", true))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -446,9 +472,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * It is to be used only for System RAM. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -460,10 +486,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, "System RAM", true) >= 0)) { + (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -484,7 +510,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) } /* * This generic page_is_ram() returns true if specified address is - * registered as "System RAM" in iomem_resource list. 
+ * registered as System RAM in iomem_resource list. */ int __weak page_is_ram(unsigned long pfn) { @@ -496,30 +522,34 @@ EXPORT_SYMBOL_GPL(page_is_ram); * region_intersects() - determine intersection of region with known resources * @start: region start address * @size: size of region - * @name: name of resource (in iomem_resource) + * @flags: flags of resource (in iomem_resource) + * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE * * Check if the specified region partially overlaps or fully eclipses a - * resource identified by @name. Return REGION_DISJOINT if the region - * does not overlap @name, return REGION_MIXED if the region overlaps - * @type and another resource, and return REGION_INTERSECTS if the - * region overlaps @type and no other defined resource. Note, that - * REGION_INTERSECTS is also returned in the case when the specified - * region overlaps RAM and undefined memory holes. + * resource identified by @flags and @desc (optional with IORES_DESC_NONE). + * Return REGION_DISJOINT if the region does not overlap @flags/@desc, + * return REGION_MIXED if the region overlaps @flags/@desc and another + * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc + * and no other defined resource. Note that REGION_INTERSECTS is also + * returned in the case when the specified region overlaps RAM and undefined + * memory holes. * * region_intersect() is used by memory remapping functions to ensure * the user is not remapping RAM and is a vast speed up over walking * through the resource table page by page. 
*/ -int region_intersects(resource_size_t start, size_t size, const char *name) +int region_intersects(resource_size_t start, size_t size, unsigned long flags, + unsigned long desc) { - unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; resource_size_t end = start + size - 1; int type = 0; int other = 0; struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + bool is_type = (((p->flags & flags) == flags) && + ((desc == IORES_DESC_NONE) || + (desc == p->desc))); if (start >= p->start && start <= p->end) is_type ? type++ : other++; @@ -538,6 +568,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name) return REGION_DISJOINT; } +EXPORT_SYMBOL_GPL(region_intersects); void __weak arch_remove_reservations(struct resource *avail) { @@ -667,7 +698,7 @@ static int reallocate_resource(struct resource *root, struct resource *old, old->start = new.start; old->end = new.end; } else { - __release_resource(old); + __release_resource(old, true); *old = new; conflict = __request_resource(root, old); BUG_ON(conflict); @@ -813,6 +844,9 @@ static struct resource * __insert_resource(struct resource *parent, struct resou * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { @@ -830,6 +864,9 @@ struct resource *insert_resource_conflict(struct resource *parent, struct resour * @new: new resource to insert * * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. 
*/ int insert_resource(struct resource *parent, struct resource *new) { @@ -838,6 +875,7 @@ int insert_resource(struct resource *parent, struct resource *new) conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } +EXPORT_SYMBOL_GPL(insert_resource); /** * insert_resource_expand_to_fit - Insert a resource into the resource tree @@ -873,6 +911,32 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } +/** + * remove_resource - Remove a resource in the resource tree + * @old: resource to remove + * + * Returns 0 on success, -EINVAL if the resource is not valid. + * + * This function removes a resource previously inserted by insert_resource() + * or insert_resource_conflict(), and moves the children (if any) up to + * where they were before. insert_resource() and insert_resource_conflict() + * insert a new resource, and move any conflicting resources down to the + * children of the new resource. + * + * insert_resource(), insert_resource_conflict() and remove_resource() are + * intended for producers of resources, such as FW modules and bus drivers. 
+ */ +int remove_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old, false); + write_unlock(&resource_lock); + return retval; +} +EXPORT_SYMBOL_GPL(remove_resource); + static int __adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { @@ -948,6 +1012,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->start = start; res->end = end; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; while (1) { @@ -982,6 +1047,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->start = conflict->end + 1; next_res->end = end; next_res->flags = IORESOURCE_BUSY; + next_res->desc = IORES_DESC_NONE; } } else { res->start = conflict->end + 1; @@ -1071,14 +1137,16 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = resource_type(parent); - res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); for (;;) { struct resource *conflict; + res->flags = resource_type(parent) | resource_ext_type(parent); + res->flags |= IORESOURCE_BUSY | flags; + res->desc = parent->desc; + conflict = __request_resource(parent, res); if (!conflict) break; @@ -1238,6 +1306,7 @@ int release_mem_region_adjustable(struct resource *parent, new_res->start = end + 1; new_res->end = res->end; new_res->flags = res->flags; + new_res->desc = res->desc; new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; @@ -1413,6 +1482,7 @@ static int __init reserve_setup(char *str) res->start = io_start; res->end = io_start + io_num - 1; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; res->child = NULL; if (request_resource(res->start >= 0x10000 ? 
&iomem_resource : &ioport_resource, res) == 0) reserved = x+1; diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..5e59b832ae2b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond @@ -13,9 +17,11 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index bc54e84675da..e85a725e5c34 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -61,6 +61,7 @@ #include <linux/static_key.h> #include <linux/workqueue.h> #include <linux/compiler.h> +#include <linux/tick.h> /* * Scheduler clock - returns current time in nanosec units. 
@@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void) { if (!sched_clock_stable()) static_key_slow_inc(&__sched_clock_stable); + + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } void set_sched_clock_stable(void) @@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work) /* XXX worry about clock continuity */ if (sched_clock_stable()) static_key_slow_dec(&__sched_clock_stable); + + tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); @@ -313,6 +318,7 @@ u64 sched_clock_cpu(int cpu) return clock; } +EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { @@ -358,39 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -/* - * As outlined at the top, provides a fast, high resolution, nanosecond - * time source that is monotonic per cpu argument and has bounded drift - * between cpus. - * - * ######################### BIG FAT WARNING ########################## - * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # - * # go backwards !! # - * #################################################################### - */ -u64 cpu_clock(int cpu) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(cpu); - - return sched_clock(); -} - -/* - * Similar to cpu_clock() for the current cpu. Time will only be observed - * to be monotonic if care is taken to only compare timestampt taken on the - * same CPU. - * - * See cpu_clock(). 
- */ -u64 local_clock(void) -{ - if (!sched_clock_stable()) - return sched_clock_cpu(raw_smp_processor_id()); - - return sched_clock(); -} - #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -405,22 +378,8 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -u64 cpu_clock(int cpu) -{ - return sched_clock(); -} - -u64 local_clock(void) -{ - return sched_clock(); -} - #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -EXPORT_SYMBOL_GPL(cpu_clock); -EXPORT_SYMBOL_GPL(local_clock); - /* * Running clock - returns the time that has elapsed while a guest has been * running. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 41f6b2215aa8..51d7105f529a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -33,7 +33,7 @@ #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> -#include <asm/mmu_context.h> +#include <linux/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/completion.h> @@ -67,14 +67,13 @@ #include <linux/pagemap.h> #include <linux/hrtimer.h> #include <linux/tick.h> -#include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> -#include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/frame.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -125,138 +124,6 @@ const_debug unsigned int sysctl_sched_features = #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static const char * const sched_feat_names[] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true 
STATIC_KEY_INIT_TRUE -#define jump_label_key__false STATIC_KEY_INIT_FALSE - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - static_key_disable(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - static_key_enable(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static int sched_feat_set(char *cmp) -{ - int i; - int neg = 0; - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } - } - - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - struct inode *inode; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - /* Ensure the static_key remains in a consistent state */ - inode = file_inode(filp); - inode_lock(inode); - i = sched_feat_set(cmp); - inode_unlock(inode); - if (i == __SCHED_FEAT_NR) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} 
-late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ - /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. @@ -303,6 +170,71 @@ static struct rq *this_rq_lock(void) return rq; } +/* + * __task_rq_lock - lock the rq @p resides on. + */ +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, rf->flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + rf->cookie = lockdep_pin_lock(&rq->lock); + return rq; + } + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. 
@@ -382,29 +314,6 @@ void hrtick_start(struct rq *rq, u64 delay) } } -static int -hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - hrtick_clear(cpu_rq(cpu)); - return NOTIFY_OK; - } - - return NOTIFY_DONE; -} - -static __init void init_hrtick(void) -{ - hotcpu_notifier(hotplug_hrtick, 0); -} #else /* * Called to set the hrtick timer state. @@ -421,10 +330,6 @@ void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL_PINNED); } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) @@ -448,24 +353,24 @@ static inline void hrtick_clear(struct rq *rq) static inline void init_rq_hrtick(struct rq *rq) { } - -static inline void init_hrtick(void) -{ -} #endif /* CONFIG_SCHED_HRTICK */ /* * cmpxchg based fetch_or, macro so it works for different integer types */ -#define fetch_or(ptr, val) \ -({ typeof(*(ptr)) __old, __val = *(ptr); \ - for (;;) { \ - __old = cmpxchg((ptr), __val, __val | (val)); \ - if (__old == __val) \ - break; \ - __val = __old; \ - } \ - __old; \ +#define fetch_or(ptr, mask) \ + ({ \ + typeof(ptr) _ptr = (ptr); \ + typeof(mask) _mask = (mask); \ + typeof(*_ptr) _old, _val = *_ptr; \ + \ + for (;;) { \ + _old = cmpxchg(_ptr, _val, _val | _mask); \ + if (_old == _val) \ + break; \ + _val = _old; \ + } \ + _old; \ }) #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) @@ -529,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * wakeup due to that. * * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_list(). + * barrier implied by the wakeup in wake_up_q(). 
*/ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; @@ -628,7 +533,10 @@ int get_nohz_timer_target(void) rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { + if (cpu == i) + continue; + + if (!idle_cpu(i) && is_housekeeping_cpu(i)) { cpu = i; goto unlock; } @@ -716,31 +624,39 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL -bool sched_can_stop_tick(void) +bool sched_can_stop_tick(struct rq *rq) { + int fifo_nr_running; + + /* Deadline tasks, even if single, need the tick */ + if (rq->dl.dl_nr_running) + return false; + /* - * FIFO realtime policy runs the highest priority task. Other runnable - * tasks are of a lower priority. The scheduler tick does nothing. + * If there are more than one RR tasks, we need the tick to effect the + * actual RR behaviour. */ - if (current->policy == SCHED_FIFO) - return true; + if (rq->rt.rr_nr_running) { + if (rq->rt.rr_nr_running == 1) + return true; + else + return false; + } /* - * Round-robin realtime tasks time slice with other tasks at the same - * realtime priority. Is this task the only one at this priority? + * If there's no RR tasks, but FIFO tasks, we can skip the tick, no + * forced preemption between FIFO tasks. */ - if (current->policy == SCHED_RR) { - struct sched_rt_entity *rt_se = &current->rt; - - return list_is_singular(&rt_se->run_list); - } + fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; + if (fifo_nr_running) + return true; /* - * More than one running task need preemption. - * nr_running update is assumed to be visible - * after IPI is sent from wakers. + * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; + * if there's more than one we need the tick for involuntary + * preemption.
*/ - if (this_rq()->nr_running > 1) + if (rq->nr_running > 1) return false; return true; @@ -1206,12 +1122,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check) { - unsigned long flags; - struct rq *rq; + const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; + struct rq_flags rf; + struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); + + if (p->flags & PF_KTHREAD) { + /* + * Kernel threads are allowed on online && !active CPUs + */ + cpu_valid_mask = cpu_online_mask; + } /* * Must re-check here, to close a race against __kthread_bind(), @@ -1225,22 +1149,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { + if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ret = -EINVAL; goto out; } do_set_cpus_allowed(p, new_mask); + if (p->flags & PF_KTHREAD) { + /* + * For kernel threads that do indeed end up on online && + * !active we want to ensure they are strict per-cpu threads. + */ + WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && + !cpumask_intersects(new_mask, cpu_active_mask) && + p->nr_cpus_allowed != 1); + } + /* Can the task run on the task's current CPU? If so, we're done */ if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. 
*/ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; @@ -1249,12 +1183,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = move_queued_task(rq, p, dest_cpu); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } out: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ret; } @@ -1438,8 +1372,8 @@ out: */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { - unsigned long flags; int running, queued; + struct rq_flags rf; unsigned long ncsw; struct rq *rq; @@ -1474,14 +1408,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * lock now, to be *sure*. If we're wrong, we'll * just go back and repeat. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * If it changed from the expected state, bail out now. @@ -1555,6 +1489,25 @@ EXPORT_SYMBOL_GPL(kick_process); /* * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * + * A few notes on cpu_active vs cpu_online: + * + * - cpu_active must be a subset of cpu_online + * + * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, + * see __set_cpus_allowed_ptr(). At this point the newly online + * cpu isn't yet part of the sched domains, and balancing will not + * see it. + * + * - on cpu-down we clear cpu_active() to mask the sched domains and + * avoid the load balancer to place new tasks on the to be removed + * cpu. 
Existing tasks will remain running there and will be taken + * off. + * + * This means that fallback selection must not select !active CPUs. + * And can assume that any active CPU must be online. Conversely + * select_task_rq() below may allow selection of !active CPUs in order + * to satisfy the above rules. */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -1573,8 +1526,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; if (!cpu_active(dest_cpu)) continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) @@ -1585,9 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) continue; - if (!cpu_active(dest_cpu)) + if (!cpu_online(dest_cpu)) continue; goto out; } @@ -1636,8 +1587,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(tsk_cpus_allowed(p)); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1725,8 +1678,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl /* * Mark the task runnable and perform wakeup-preemption. 
*/ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1738,9 +1691,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (rq->idle_stamp) { @@ -1758,17 +1711,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) } static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct pin_cookie cookie) { + int en_flags = ENQUEUE_WAKEUP; + lockdep_assert_held(&rq->lock); #ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; + + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_activate(rq, p, en_flags); + ttwu_do_wakeup(rq, p, wake_flags, cookie); } /* @@ -1779,17 +1738,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) */ static int ttwu_remote(struct task_struct *p, int wake_flags) { + struct rq_flags rf; struct rq *rq; int ret = 0; - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags); + ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); ret = 1; } - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); return ret; } @@ -1799,6 +1759,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = 
llist_del_all(&rq->wake_list); + struct pin_cookie cookie; struct task_struct *p; unsigned long flags; @@ -1806,15 +1767,21 @@ void sched_ttwu_pending(void) return; raw_spin_lock_irqsave(&rq->lock, flags); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); while (llist) { + int wake_flags = 0; + p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); + + if (p->sched_remote_wakeup) + wake_flags = WF_MIGRATED; + + ttwu_do_activate(rq, p, wake_flags, cookie); } - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1856,10 +1823,12 @@ void scheduler_ipi(void) irq_exit(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_send_reschedule(cpu); @@ -1898,22 +1867,23 @@ bool cpus_share_cache(int this_cpu, int that_cpu) } #endif /* CONFIG_SMP */ -static void ttwu_queue(struct task_struct *p, int cpu) +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + struct pin_cookie cookie; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); + ttwu_queue_remote(p, cpu, wake_flags); return; } #endif raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - lockdep_unpin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); + ttwu_do_activate(rq, p, wake_flags, cookie); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); } @@ -2082,9 +2052,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load 
= !!task_contributes_to_load(p); p->state = TASK_WAKING; - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; @@ -2092,9 +2059,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); + ttwu_queue(p, cpu, wake_flags); stat: - ttwu_stat(p, cpu, wake_flags); + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2109,7 +2077,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p) +static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) { struct rq *rq = task_rq(p); @@ -2126,11 +2094,11 @@ static void try_to_wake_up_local(struct task_struct *p) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); } if (!(p->state & TASK_NORMAL)) @@ -2141,8 +2109,9 @@ static void try_to_wake_up_local(struct task_struct *p) if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); - ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); + ttwu_do_wakeup(rq, p, 0, cookie); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2184,7 +2153,6 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_throttled = 0; - dl_se->dl_new = 1; dl_se->dl_yielded = 0; } @@ -2211,6 +2179,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2219,6 +2188,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2282,6 +2255,83 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +#ifdef CONFIG_SCHEDSTATS + +DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; + +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) 
+{ + int ret = 0; + if (!str) + goto out; + + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. + */ + if (!strcmp(str, "enable")) { + __sched_schedstats = true; + ret = 1; + } else if (!strcmp(str, "disable")) { + __sched_schedstats = false; + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ + /* * fork()/clone()-time setup: */ @@ -2433,7 +2483,8 @@ static int dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; int cpus, err = -1; - if (new_bw == p->dl.dl_bw) + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; /* @@ -2472,12 +2523,12 @@ extern void init_dl_bw(struct dl_bw *dl_b); */ void wake_up_new_task(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; - raw_spin_lock_irqsave(&p->pi_lock, flags); /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -2486,8 +2537,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif + rq = __task_rq_lock(p, &rf); + post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2498,12 +2550,12 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); p->sched_class->task_woken(rq, p); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2763,9 +2815,9 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. 
*/ -static inline struct rq * +static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, struct pin_cookie cookie) { struct mm_struct *mm, *oldmm; @@ -2785,7 +2837,7 @@ context_switch(struct rq *rq, struct task_struct *prev, atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@ -2797,7 +2849,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2919,7 +2971,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat); */ unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long flags; + struct rq_flags rf; struct rq *rq; u64 ns; @@ -2939,7 +2991,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) return p->se.sum_exec_runtime; #endif - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Must be ->curr _and_ ->on_rq. 
If dequeued, we would * project cycles that may never be accounted to this @@ -2950,7 +3002,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) p->sched_class->update_curr(rq); } ns = p->se.sum_exec_runtime; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return ns; } @@ -2970,7 +3022,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); + cpu_load_update_active(rq); calc_global_load_tick(rq); raw_spin_unlock(&rq->lock); @@ -3011,19 +3063,23 @@ u64 scheduler_tick_max_deferment(void) } #endif -notrace unsigned long get_parent_ip(unsigned long addr) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) +/* + * If the value passed in is equal to the current preempt count + * then we just disabled preemption. Start timing the latency. + */ +static inline void preempt_latency_start(int val) { - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); } - return addr; } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - void preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -3041,17 +3097,21 @@ void preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + preempt_latency_start(val); } EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); +/* + * If the value passed in equals to the current preempt count + * then we just enabled preemption. 
Stop timing the latency. + */ +static inline void preempt_latency_stop(int val) +{ + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); +} + void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT @@ -3068,13 +3128,15 @@ void preempt_count_sub(int val) return; #endif - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + preempt_latency_stop(val); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); +#else +static inline void preempt_latency_start(int val) { } +static inline void preempt_latency_stop(int val) { } #endif /* @@ -3109,7 +3171,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { @@ -3127,7 +3190,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { const struct sched_class *class = &fair_sched_class; struct task_struct *p; @@ -3138,20 +3201,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev) */ if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev); + p = fair_sched_class.pick_next_task(rq, prev, cookie); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev); + p = idle_sched_class.pick_next_task(rq, prev, cookie); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev); + p = 
class->pick_next_task(rq, prev, cookie); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3205,6 +3268,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + struct pin_cookie cookie; struct rq *rq; int cpu; @@ -3238,7 +3302,7 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - lockdep_pin_lock(&rq->lock); + cookie = lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3258,9 +3322,9 @@ static void __sched notrace __schedule(bool preempt) if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; - to_wakeup = wq_worker_sleeping(prev, cpu); + to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup); + try_to_wake_up_local(to_wakeup, cookie); } } switch_count = &prev->nvcsw; @@ -3269,7 +3333,7 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev); + next = pick_next_task(rq, prev, cookie); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -3280,15 +3344,15 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the rq */ - cpu = cpu_of(rq); + rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock_irq(&rq->lock); } balance_callback(rq); } +STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ static inline void sched_submit_work(struct task_struct *tsk) { @@ -3349,8 +3413,23 @@ void __sched schedule_preempt_disabled(void) static void __sched notrace preempt_schedule_common(void) { do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * 
NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. + */ preempt_disable_notrace(); + preempt_latency_start(1); __schedule(true); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); /* @@ -3402,7 +3481,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) return; do { + /* + * Because the function tracer can trace preempt_count_sub() + * and it also uses preempt_enable/disable_notrace(), if + * NEED_RESCHED is set, the preempt_enable_notrace() called + * by the function tracer will call this function again and + * cause infinite recursion. + * + * Preemption must be disabled here before the function + * tracer can trace. Break up preempt_disable() into two + * calls. One to disable preemption without fear of being + * traced. The other to still record the preemption latency, + * which can also be traced by the function tracer. 
+ */ preempt_disable_notrace(); + preempt_latency_start(1); /* * Needs preempt disabled in case user_exit() is traced * and the tracer calls preempt_enable_notrace() causing @@ -3412,6 +3505,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) __schedule(true); exception_exit(prev_ctx); + preempt_latency_stop(1); preempt_enable_no_resched_notrace(); } while (need_resched()); } @@ -3467,13 +3561,14 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; - struct rq *rq; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *prev_class; + struct rq_flags rf; + struct rq *rq; BUG_ON(prio > MAX_PRIO); - rq = __task_rq_lock(p); + rq = __task_rq_lock(p, &rf); /* * Idle task boosting is a nono in general. There is one @@ -3495,11 +3590,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3517,7 +3616,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3525,7 +3624,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3540,12 +3639,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) 
p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: preempt_disable(); /* avoid rq from going away on us */ - __task_rq_unlock(rq); + __task_rq_unlock(rq, &rf); balance_callback(rq); preempt_enable(); @@ -3555,7 +3654,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, queued; - unsigned long flags; + struct rq_flags rf; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -3564,7 +3663,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3595,7 +3694,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } out_unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -3892,10 +3991,11 @@ static int __sched_setscheduler(struct task_struct *p, MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; - unsigned long flags; const struct sched_class *prev_class; - struct rq *rq; + struct rq_flags rf; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -3990,13 +4090,13 @@ recheck: * To be able to change p->policy safely, the appropriate * runqueue lock must be held. 
*/ - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EINVAL; } @@ -4013,7 +4113,7 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return 0; } change: @@ -4027,7 +4127,7 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } #endif @@ -4042,7 +4142,7 @@ change: */ if (!cpumask_subset(span, &p->cpus_allowed) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EPERM; } } @@ -4052,7 +4152,7 @@ change: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); goto recheck; } @@ -4062,7 +4162,7 @@ change: * is available. */ if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); return -EBUSY; } @@ -4078,17 +4178,14 @@ change: * itself. */ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4098,20 +4195,19 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). 
*/ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); if (pi) rt_mutex_adjust_pi(p); @@ -4964,10 +5060,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; - unsigned long flags; + struct rq_flags rf; + struct timespec t; struct rq *rq; int retval; - struct timespec t; if (pid < 0) return -EINVAL; @@ -4982,11 +5078,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); time_slice = 0; if (p->sched_class->get_rr_interval) time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5053,16 +5149,19 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. 
*/ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); + if (!state_filter) + sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* @@ -5224,6 +5323,8 @@ out: #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) @@ -5249,11 +5350,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - struct rq *rq; - unsigned long flags; bool queued, running; + struct rq_flags rf; + struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -5268,7 +5369,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5284,7 +5385,7 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + switch_mm_irqs_off(mm, &init_mm, current); finish_arch_post_lock_switch(); } mmdrop(mm); @@ -5332,6 +5433,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; + struct pin_cookie cookie; int dest_cpu; /* @@ -5363,8 +5465,8 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task assumes pinned rq->lock. 
*/ - lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task); + cookie = lockdep_pin_lock(&rq->lock); + next = pick_next_task(rq, &fake_task, cookie); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5377,7 +5479,7 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@ -5408,183 +5510,6 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. 
- */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, 
false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ - static void set_rq_online(struct rq *rq) { if 
(!rq->online) { @@ -5615,136 +5540,13 @@ static void set_rq_offline(struct rq *rq) } } -/* - * migration_call - callback that gets triggered when a CPU is added. - * Here we can start up the necessary migration thread for the new CPU. - */ -static int -migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +static void set_cpu_rq_start_time(unsigned int cpu) { - int cpu = (long)hcpu; - unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - rq->calc_load_update = calc_load_update; - break; - - case CPU_ONLINE: - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - - set_rq_online(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - sched_ttwu_pending(); - /* Update our root-domain */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq); - BUG_ON(rq->nr_running != 1); /* the migration thread */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - break; - - case CPU_DEAD: - calc_load_migrate(rq); - break; -#endif - } - - update_max_interval(); - - return NOTIFY_OK; -} - -/* - * Register at high priority so that task migration (migrate_all_tasks) - * happens before everything else. This has to be lower priority than - * the notifier in the perf_event subsystem, though. 
- */ -static struct notifier_block migration_notifier = { - .notifier_call = migration_call, - .priority = CPU_PRI_MIGRATION, -}; - -static void set_cpu_rq_start_time(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); rq->age_stamp = sched_clock_cpu(cpu); } -static int sched_cpu_active(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - set_cpu_rq_start_time(); - return NOTIFY_OK; - - case CPU_ONLINE: - /* - * At this point a starting CPU has marked itself as online via - * set_cpu_online(). But it might not yet have marked itself - * as active, which is essential from here on. - */ - set_cpu_active(cpu, true); - stop_machine_unpark(cpu); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - set_cpu_active(cpu, true); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - -static int sched_cpu_inactive(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} - -static int __init migration_init(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - /* Initialize migration for the boot CPU */ - err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); - migration_call(&migration_notifier, CPU_ONLINE, cpu); - register_cpu_notifier(&migration_notifier); - - /* Register cpu active notifiers */ - cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); - cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); - - return 0; -} -early_initcall(migration_init); - static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG @@ -6176,11 +5978,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Setup the mask of cpus configured for isolated domains */ static int 
__init isolated_cpu_setup(char *str) { + int ret; + alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } return 1; } - __setup("isolcpus=", isolated_cpu_setup); struct s_data { @@ -6887,10 +6694,10 @@ static void sched_init_numa(void) init_numa_topology_type(); } -static void sched_domains_numa_masks_set(int cpu) +static void sched_domains_numa_masks_set(unsigned int cpu) { - int i, j; int node = cpu_to_node(cpu); + int i, j; for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { @@ -6900,51 +6707,20 @@ static void sched_domains_numa_masks_set(int cpu) } } -static void sched_domains_numa_masks_clear(int cpu) +static void sched_domains_numa_masks_clear(unsigned int cpu) { int i, j; + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); } } -/* - * Update sched_domains_numa_masks[level][node] array when new cpus - * are onlined. 
- */ -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - sched_domains_numa_masks_set(cpu); - break; - - case CPU_DEAD: - sched_domains_numa_masks_clear(cpu); - break; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} #else -static inline void sched_init_numa(void) -{ -} - -static int sched_domains_numa_masks_update(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - return 0; -} +static inline void sched_init_numa(void) { } +static void sched_domains_numa_masks_set(unsigned int cpu) { } +static void sched_domains_numa_masks_clear(unsigned int cpu) { } #endif /* CONFIG_NUMA */ static int __sdt_alloc(const struct cpumask *cpu_map) @@ -7334,13 +7110,9 @@ static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ * If we come here as part of a suspend/resume, don't touch cpusets because we * want to restore it back to its original state upon resume anyway. */ -static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static void cpuset_cpu_active(void) { - switch (action) { - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED_FROZEN: - + if (cpuhp_tasks_frozen) { /* * num_cpus_frozen tracks how many CPUs are involved in suspend * resume sequence. As long as this is not the last online @@ -7350,35 +7122,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, num_cpus_frozen--; if (likely(num_cpus_frozen)) { partition_sched_domains(1, NULL, NULL); - break; + return; } - /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. 
*/ - - case CPU_ONLINE: - cpuset_update_active_cpus(true); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + cpuset_update_active_cpus(true); } -static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int cpuset_cpu_inactive(unsigned int cpu) { unsigned long flags; - long cpu = (long)hcpu; struct dl_bw *dl_b; bool overflow; int cpus; - switch (action) { - case CPU_DOWN_PREPARE: + if (!cpuhp_tasks_frozen) { rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); @@ -7390,19 +7152,120 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, rcu_read_unlock_sched(); if (overflow) - return notifier_from_errno(-EBUSY); + return -EBUSY; cpuset_update_active_cpus(false); - break; - case CPU_DOWN_PREPARE_FROZEN: + } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); - break; - default: - return NOTIFY_DONE; } - return NOTIFY_OK; + return 0; +} + +int sched_cpu_activate(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + set_cpu_active(cpu, true); + + if (sched_smp_initialized) { + sched_domains_numa_masks_set(cpu); + cpuset_cpu_active(); + } + + /* + * Put the rq online, if not already. This happens: + * + * 1) In the early boot process, because we build the real domains + * after all cpus have been brought up. + * + * 2) At runtime, if cpuset_cpu_active() fails to rebuild the + * domains. + */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + update_max_interval(); + + return 0; } +int sched_cpu_deactivate(unsigned int cpu) +{ + int ret; + + set_cpu_active(cpu, false); + /* + * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU + * users of this state to go away such that all new such users will + * observe it. 
+ * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + + if (!sched_smp_initialized) + return 0; + + ret = cpuset_cpu_inactive(cpu); + if (ret) { + set_cpu_active(cpu, true); + return ret; + } + sched_domains_numa_masks_clear(cpu); + return 0; +} + +static void sched_rq_cpu_starting(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + rq->calc_load_update = calc_load_update; + account_reset_rq(rq); + update_max_interval(); +} + +int sched_cpu_starting(unsigned int cpu) +{ + set_cpu_rq_start_time(cpu); + sched_rq_cpu_starting(cpu); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +int sched_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Handle pending wakeups and then migrate everything off */ + sched_ttwu_pending(); + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(rq); + BUG_ON(rq->nr_running != 1); + raw_spin_unlock_irqrestore(&rq->lock, flags); + calc_load_migrate(rq); + update_max_interval(); + nohz_balance_exit_idle(cpu); + hrtick_clear(rq); + return 0; +} +#endif + void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -7424,12 +7287,6 @@ void __init sched_init_smp(void) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); - hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); - hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); - - init_hrtick(); - /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); @@ -7438,7 +7295,16 @@ void __init sched_init_smp(void) 
init_sched_rt_class(); init_sched_dl_class(); + sched_smp_initialized = true; +} + +static int __init migration_init(void) +{ + sched_rq_cpu_starting(smp_processor_id()); + return 0; } +early_initcall(migration_init); + #else void __init sched_init_smp(void) { @@ -7573,8 +7439,6 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; - rq->last_load_update_tick = jiffies; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7593,12 +7457,13 @@ void __init sched_init(void) rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON + rq->last_load_update_tick = jiffies; rq->nohz_flags = 0; #endif #ifdef CONFIG_NO_HZ_FULL rq->last_sched_tick = 0; #endif -#endif +#endif /* CONFIG_SMP */ init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); } @@ -7636,10 +7501,12 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); - set_cpu_rq_start_time(); + set_cpu_rq_start_time(smp_processor_id()); #endif init_sched_fair_class(); + init_schedstats(); + scheduler_running = 1; } @@ -7801,7 +7668,7 @@ void set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); -static void free_sched_group(struct task_group *tg) +static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); @@ -7827,7 +7694,7 @@ struct task_group *sched_create_group(struct task_group *parent) return tg; err: - free_sched_group(tg); + sched_free_group(tg); return ERR_PTR(-ENOMEM); } @@ -7847,27 +7714,24 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) +static void sched_free_group_rcu(struct rcu_head *rhp) { /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, 
struct task_group, rcu)); + sched_free_group(container_of(rhp, struct task_group, rcu)); } -/* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); + call_rcu(&tg->rcu, sched_free_group_rcu); } void sched_offline_group(struct task_group *tg) { unsigned long flags; - int i; /* end participation in shares distribution */ - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); + unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); @@ -7884,16 +7748,16 @@ void sched_move_task(struct task_struct *tsk) { struct task_group *tg; int queued, running; - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &flags); + rq = task_rq_lock(tsk, &rf); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7917,9 +7781,9 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); - task_rq_unlock(rq, tsk, &flags); + task_rq_unlock(rq, tsk, &rf); } #endif /* CONFIG_CGROUP_SCHED */ @@ -8139,7 +8003,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i, ret = 0; + int i; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -8151,7 +8015,7 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return ret; + return 0; } #endif /* CONFIG_RT_GROUP_SCHED */ @@ -8318,31 +8182,26 @@ cpu_cgroup_css_alloc(struct 
cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - if (parent) - sched_online_group(tg, parent); - return 0; + sched_offline_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_destroy_group(tg); -} - -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); + /* + * Relies on the RCU grace period between css_released() and this. + */ + sched_free_group(tg); } static void cpu_cgroup_fork(struct task_struct *task) @@ -8702,14 +8561,13 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, - .css_online = cpu_cgroup_css_online, - .css_offline = cpu_cgroup_css_offline, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_files, - .early_init = 1, + .early_init = true, }; #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd7cbb55bbf2..41f85c4d0938 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,11 +25,22 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; +enum cpuacct_usage_index { + CPUACCT_USAGE_USER, /* ... user mode */ + CPUACCT_USAGE_SYSTEM, /* ... 
kernel mode */ + + CPUACCT_USAGE_NRUSAGE, +}; + +struct cpuacct_usage { + u64 usages[CPUACCT_USAGE_NRUSAGE]; +}; + /* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; + struct cpuacct_usage __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +60,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +79,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(u64); + ca->cpuusage = alloc_percpu(struct cpuacct_usage); if (!ca->cpuusage) goto out_free_ca; @@ -96,20 +107,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) kfree(ca); } -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, + enum cpuacct_usage_index index) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; + /* + * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * the sum of usages. + */ + BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. 
*/ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; +#endif + + if (index == CPUACCT_USAGE_NRUSAGE) { + int i = 0; + + data = 0; + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + data += cpuusage->usages[i]; + } else { + data = cpuusage->usages[index]; + } + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; #endif return data; @@ -117,66 +145,103 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + int i; #ifndef CONFIG_64BIT /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; +#endif + + for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + cpuusage->usages[i] = val; + +#ifndef CONFIG_64BIT raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; #endif } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +static u64 __cpuusage_read(struct cgroup_subsys_state *css, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i, index); return totalcpuusage; } +static u64 cpuusage_user_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_USER); +} + +static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); +} + +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); +} + static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 reset) 
+ u64 val) { struct cpuacct *ca = css_ca(css); - int err = 0; - int i; + int cpu; - if (reset) { - err = -EINVAL; - goto out; - } + /* + * Only allow '0' here to do a reset. + */ + if (val) + return -EINVAL; - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); + for_each_possible_cpu(cpu) + cpuacct_cpuusage_write(ca, cpu, 0); -out: - return err; + return 0; } -static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +static int __cpuacct_percpu_seq_show(struct seq_file *m, + enum cpuacct_usage_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); + for_each_possible_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i, index); seq_printf(m, "%llu ", (unsigned long long) percpu); } seq_printf(m, "\n"); return 0; } +static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); +} + +static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); +} + +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) +{ + return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); +} + static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_USER] = "user", [CPUACCT_STAT_SYSTEM] = "system", @@ -188,7 +253,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) int cpu; s64 val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_USER]; val += kcpustat->cpustat[CPUTIME_NICE]; @@ -197,7 +262,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); val += kcpustat->cpustat[CPUTIME_SYSTEM]; val += 
kcpustat->cpustat[CPUTIME_IRQ]; @@ -217,10 +282,26 @@ static struct cftype files[] = { .write_u64 = cpuusage_write, }, { + .name = "usage_user", + .read_u64 = cpuusage_user_read, + }, + { + .name = "usage_sys", + .read_u64 = cpuusage_sys_read, + }, + { .name = "usage_percpu", .seq_show = cpuacct_percpu_seq_show, }, { + .name = "usage_percpu_user", + .seq_show = cpuacct_percpu_user_seq_show, + }, + { + .name = "usage_percpu_sys", + .seq_show = cpuacct_percpu_sys_seq_show, + }, + { .name = "stat", .seq_show = cpuacct_stats_show, }, @@ -235,22 +316,16 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int cpu; + int index = CPUACCT_USAGE_SYSTEM; + struct pt_regs *regs = task_pt_regs(tsk); - cpu = task_cpu(tsk); + if (regs && user_mode(regs)) + index = CPUACCT_USAGE_USER; rcu_read_lock(); - ca = task_ca(tsk); - - while (true) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - - ca = parent_ca(ca); - if (!ca) - break; - } + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) + this_cpu_ptr(ca->cpuusage)->usages[index] += cputime; rcu_read_unlock(); } @@ -260,18 +335,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) * * Note: it's the caller that updates the account of the root cgroup. 
*/ -void cpuacct_account_field(struct task_struct *p, int index, u64 val) +void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { - struct kernel_cpustat *kcpustat; struct cpuacct *ca; rcu_read_lock(); - ca = task_ca(p); - while (ca != &root_cpuacct) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += val; - ca = parent_ca(ca); - } + for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) + this_cpu_ptr(ca->cpustat)->cpustat[index] += val; rcu_read_unlock(); } @@ -279,5 +349,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index ed605624a5e7..ba72807c73d4 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,7 +1,7 @@ #ifdef CONFIG_CGROUP_CPUACCT extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); +extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else @@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) } static inline void -cpuacct_account_field(struct task_struct *p, int index, u64 val) +cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5a75b08cfd85..5be58820465c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) { best_cpu = cpumask_any(later_mask); goto out; - } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + } else if 
(cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { best_cpu = cpudl_maximum(cp); if (later_mask) diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..1141954e73b4 --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,63 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * @func: Callback function to set for the CPU. + * + * Set and publish the update_util_data pointer for the given CPU. + * + * The update_util_data pointer of @cpu is set to @data and the callback + * function pointer in the target struct update_util_data is set to @func. + * That function will be called by cpufreq_update_util() from RCU-sched + * read-side critical sections, so it must not sleep. @data will always be + * passed to it as the first argument which allows the function to get to the + * target update_util_data structure and its container. + * + * The update_util_data pointer of @cpu must be NULL when this function is + * called or it will WARN() and return with no effect. 
+ */ +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned long util, unsigned long max)) +{ + if (WARN_ON(!data || !func)) + return; + + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu))) + return; + + data->func = func; + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); + +/** + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer. + * @cpu: The CPU to clear the pointer for. + * + * Clear the update_util_data pointer for the given CPU. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_remove_update_util_hook(int cpu) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); +} +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c new file mode 100644 index 000000000000..14c4aa25cc45 --- /dev/null +++ b/kernel/sched/cpufreq_schedutil.c @@ -0,0 +1,532 @@ +/* + * CPUFreq governor based on scheduler-provided CPU utilization data. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <trace/events/power.h> + +#include "sched.h" + +struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int rate_limit_us; +}; + +struct sugov_policy { + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 freq_update_delay_ns; + unsigned int next_freq; + + /* The next fields are only needed if fast switch cannot be used. */ + struct irq_work irq_work; + struct work_struct work; + struct mutex work_lock; + bool work_in_progress; + + bool need_freq_update; +}; + +struct sugov_cpu { + struct update_util_data update_util; + struct sugov_policy *sg_policy; + + /* The fields below are only needed when sharing a policy. */ + unsigned long util; + unsigned long max; + u64 last_update; +}; + +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); + +/************************ Governor internals ***********************/ + +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) +{ + s64 delta_ns; + + if (sg_policy->work_in_progress) + return false; + + if (unlikely(sg_policy->need_freq_update)) { + sg_policy->need_freq_update = false; + /* + * This happens when limits change, so forget the previous + * next_freq value and force an update. 
+ */ + sg_policy->next_freq = UINT_MAX; + return true; + } + + delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->freq_update_delay_ns; +} + +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + sg_policy->last_freq_update_time = time; + + if (policy->fast_switch_enabled) { + if (sg_policy->next_freq == next_freq) { + trace_cpu_frequency(policy->cur, smp_processor_id()); + return; + } + sg_policy->next_freq = next_freq; + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (next_freq == CPUFREQ_ENTRY_INVALID) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); + } else if (sg_policy->next_freq != next_freq) { + sg_policy->next_freq = next_freq; + sg_policy->work_in_progress = true; + irq_work_queue(&sg_policy->irq_work); + } +} + +/** + * get_next_freq - Compute a new frequency for a given cpufreq policy. + * @policy: cpufreq policy object to compute the new frequency for. + * @util: Current CPU utilization. + * @max: CPU capacity. + * + * If the utilization is frequency-invariant, choose the new frequency to be + * proportional to it, that is + * + * next_freq = C * max_freq * util / max + * + * Otherwise, approximate the would-be frequency-invariant utilization by + * util_raw * (curr_freq / max_freq) which leads to + * + * next_freq = C * curr_freq * util_raw / max + * + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + */ +static unsigned int get_next_freq(struct cpufreq_policy *policy, + unsigned long util, unsigned long max) +{ + unsigned int freq = arch_scale_freq_invariant() ? 
+ policy->cpuinfo.max_freq : policy->cur; + + return (freq + (freq >> 2)) * util / max; +} + +static void sugov_update_single(struct update_util_data *hook, u64 time, + unsigned long util, unsigned long max) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int next_f; + + if (!sugov_should_update_freq(sg_policy, time)) + return; + + next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : + get_next_freq(policy, util, max); + sugov_update_commit(sg_policy, time, next_f); +} + +static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, + unsigned long util, unsigned long max) +{ + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int max_f = policy->cpuinfo.max_freq; + u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned int j; + + if (util == ULONG_MAX) + return max_f; + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu; + unsigned long j_util, j_max; + s64 delta_ns; + + if (j == smp_processor_id()) + continue; + + j_sg_cpu = &per_cpu(sugov_cpu, j); + /* + * If the CPU utilization was last updated before the previous + * frequency update and the time elapsed between the last update + * of the CPU utilization and the last frequency update is long + * enough, don't take the CPU into account as it probably is + * idle now. 
+ */ + delta_ns = last_freq_update_time - j_sg_cpu->last_update; + if (delta_ns > TICK_NSEC) + continue; + + j_util = j_sg_cpu->util; + if (j_util == ULONG_MAX) + return max_f; + + j_max = j_sg_cpu->max; + if (j_util * max > j_max * util) { + util = j_util; + max = j_max; + } + } + + return get_next_freq(policy, util, max); +} + +static void sugov_update_shared(struct update_util_data *hook, u64 time, + unsigned long util, unsigned long max) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int next_f; + + raw_spin_lock(&sg_policy->update_lock); + + sg_cpu->util = util; + sg_cpu->max = max; + sg_cpu->last_update = time; + + if (sugov_should_update_freq(sg_policy, time)) { + next_f = sugov_next_freq_shared(sg_policy, util, max); + sugov_update_commit(sg_policy, time, next_f); + } + + raw_spin_unlock(&sg_policy->update_lock); +} + +static void sugov_work(struct work_struct *work) +{ + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + + mutex_lock(&sg_policy->work_lock); + __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, + CPUFREQ_RELATION_L); + mutex_unlock(&sg_policy->work_lock); + + sg_policy->work_in_progress = false; +} + +static void sugov_irq_work(struct irq_work *irq_work) +{ + struct sugov_policy *sg_policy; + + sg_policy = container_of(irq_work, struct sugov_policy, irq_work); + schedule_work_on(smp_processor_id(), &sg_policy->work); +} + +/************************** sysfs interface ************************/ + +static struct sugov_tunables *global_tunables; +static DEFINE_MUTEX(global_tunables_lock); + +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set) +{ + return container_of(attr_set, struct sugov_tunables, attr_set); +} + +static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return 
sprintf(buf, "%u\n", tunables->rate_limit_us); +} + +static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, + size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) + sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; + + return count; +} + +static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); + +static struct attribute *sugov_attributes[] = { + &rate_limit_us.attr, + NULL +}; + +static struct kobj_type sugov_tunables_ktype = { + .default_attrs = sugov_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + +/********************** cpufreq governor interface *********************/ + +static struct cpufreq_governor schedutil_gov; + +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); + if (!sg_policy) + return NULL; + + sg_policy->policy = policy; + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + INIT_WORK(&sg_policy->work, sugov_work); + mutex_init(&sg_policy->work_lock); + raw_spin_lock_init(&sg_policy->update_lock); + return sg_policy; +} + +static void sugov_policy_free(struct sugov_policy *sg_policy) +{ + mutex_destroy(&sg_policy->work_lock); + kfree(sg_policy); +} + +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) +{ + struct sugov_tunables *tunables; + + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (tunables) { + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook); + if (!have_governor_per_policy()) + global_tunables = tunables; + } + return tunables; +} + +static void sugov_tunables_free(struct sugov_tunables *tunables) +{ + if 
(!have_governor_per_policy()) + global_tunables = NULL; + + kfree(tunables); +} + +static int sugov_init(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + struct sugov_tunables *tunables; + unsigned int lat; + int ret = 0; + + /* State should be equivalent to EXIT */ + if (policy->governor_data) + return -EBUSY; + + sg_policy = sugov_policy_alloc(policy); + if (!sg_policy) + return -ENOMEM; + + mutex_lock(&global_tunables_lock); + + if (global_tunables) { + if (WARN_ON(have_governor_per_policy())) { + ret = -EINVAL; + goto free_sg_policy; + } + policy->governor_data = sg_policy; + sg_policy->tunables = global_tunables; + + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook); + goto out; + } + + tunables = sugov_tunables_alloc(sg_policy); + if (!tunables) { + ret = -ENOMEM; + goto free_sg_policy; + } + + tunables->rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, + get_governor_parent_kobj(policy), "%s", + schedutil_gov.name); + if (ret) + goto fail; + + out: + mutex_unlock(&global_tunables_lock); + + cpufreq_enable_fast_switch(policy); + return 0; + + fail: + policy->governor_data = NULL; + sugov_tunables_free(tunables); + + free_sg_policy: + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + pr_err("initialization failed (error %d)\n", ret); + return ret; +} + +static int sugov_exit(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + unsigned int count; + + cpufreq_disable_fast_switch(policy); + + mutex_lock(&global_tunables_lock); + + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); + policy->governor_data = NULL; + if (!count) + 
sugov_tunables_free(tunables); + + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + return 0; +} + +static int sugov_start(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + + sg_cpu->sg_policy = sg_policy; + if (policy_is_shared(policy)) { + sg_cpu->util = ULONG_MAX; + sg_cpu->max = 0; + sg_cpu->last_update = 0; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_shared); + } else { + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + sugov_update_single); + } + } + return 0; +} + +static int sugov_stop(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + for_each_cpu(cpu, policy->cpus) + cpufreq_remove_update_util_hook(cpu); + + synchronize_sched(); + + irq_work_sync(&sg_policy->irq_work); + cancel_work_sync(&sg_policy->work); + return 0; +} + +static int sugov_limits(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + + if (!policy->fast_switch_enabled) { + mutex_lock(&sg_policy->work_lock); + + if (policy->max < policy->cur) + __cpufreq_driver_target(policy, policy->max, + CPUFREQ_RELATION_H); + else if (policy->min > policy->cur) + __cpufreq_driver_target(policy, policy->min, + CPUFREQ_RELATION_L); + + mutex_unlock(&sg_policy->work_lock); + } + + sg_policy->need_freq_update = true; + return 0; +} + +int sugov_governor(struct cpufreq_policy *policy, unsigned int event) +{ + if (event == CPUFREQ_GOV_POLICY_INIT) { + return sugov_init(policy); + } else if (policy->governor_data) { + switch (event) { + case 
CPUFREQ_GOV_POLICY_EXIT: + return sugov_exit(policy); + case CPUFREQ_GOV_START: + return sugov_start(policy); + case CPUFREQ_GOV_STOP: + return sugov_stop(policy); + case CPUFREQ_GOV_LIMITS: + return sugov_limits(policy); + } + } + return -EINVAL; +} + +static struct cpufreq_governor schedutil_gov = { + .name = "schedutil", + .governor = sugov_governor, + .owner = THIS_MODULE, +}; + +static int __init sugov_module_init(void) +{ + return cpufreq_register_governor(&schedutil_gov); +} + +static void __exit sugov_module_exit(void) +{ + cpufreq_unregister_governor(&schedutil_gov); +} + +MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>"); +MODULE_DESCRIPTION("Utilization-based CPU frequency selection"); +MODULE_LICENSE("GPL"); + +#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +struct cpufreq_governor *cpufreq_default_governor(void) +{ + return &schedutil_gov; +} + +fs_initcall(sugov_module_init); +#else +module_init(sugov_module_init); +#endif +module_exit(sugov_module_exit); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..11e9705bf937 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b2ab2ffb1adc..75f98c5498d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(&paravirt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = 
paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; @@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk) { - unsigned long long clock; + unsigned long now = READ_ONCE(jiffies); - clock = local_clock(); - if (clock < tsk->vtime_snap) + if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return clock - tsk->vtime_snap; + return jiffies_to_cputime(now - tsk->vtime_snap); } static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta = vtime_delta(tsk); + unsigned long now = READ_ONCE(jiffies); + unsigned long delta = now - tsk->vtime_snap; WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap += delta; + tsk->vtime_snap = now; - /* CHECKME: always safe to convert nsecs to cputime? 
*/ - return nsecs_to_cputime(delta); + return jiffies_to_cputime(delta); } static void __vtime_account_system(struct task_struct *tsk) @@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { + if (!vtime_delta(tsk)) + return; + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); write_seqcount_end(&tsk->vtime_seqcount); @@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); @@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk) cputime_t delta_cpu; write_seqcount_begin(&tsk->vtime_seqcount); - delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + if (vtime_delta(tsk)) { + delta_cpu = get_vtime_delta(tsk); + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + } write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); } @@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. 
*/ write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); current->flags |= PF_VCPU; write_seqcount_end(&tsk->vtime_seqcount); } @@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&current->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock_cpu(smp_processor_id()); + current->vtime_snap = jiffies; write_seqcount_end(&current->vtime_seqcount); } @@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock_cpu(cpu); + t->vtime_snap = jiffies; write_seqcount_end(&t->vtime_seqcount); local_irq_restore(flags); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 57b939c81bce..fcb7f0217ff4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory++; update_dl_migration(dl_rq); @@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) dl_rq->dl_nr_migratory--; update_dl_migration(dl_rq); @@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); + + /* + * We are racing with the deadline timer. 
So, do nothing because + * the deadline timer handler will take care of properly recharging + * the runtime and postponing the deadline + */ + if (dl_se->dl_throttled) + return; /* * We use the regular wall clock time to set deadlines in the @@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, */ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; - dl_se->dl_new = 0; } /* @@ -399,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->runtime = pi_se->dl_runtime; } + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; + /* * We keep moving the deadline away until we get some * available runtime for the entity. This ensures correct @@ -500,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - /* - * The arrival of a new instance needs special treatment, i.e., - * the actual scheduling parameters have to be "renewed". - */ - if (dl_se->dl_new) { - setup_new_dl_entity(dl_se, pi_se); - return; - } - if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; @@ -590,10 +591,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); - unsigned long flags; + struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &rf); /* * The task might have changed its scheduling policy to something @@ -605,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) } /* - * This is possible if switched_from_dl() raced against a running - * callback that took the above !dl_task() path and we've since then - * switched back into SCHED_DEADLINE. - * - * There's nothing to do except drop our task reference. 
- */ - if (dl_se->dl_new) - goto unlock; - - /* * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ @@ -679,14 +670,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf.cookie); push_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, rf.cookie); } #endif unlock: - task_rq_unlock(rq, p, &flags); + task_rq_unlock(rq, p, &rf); /* * This can free the task_struct, including this hrtimer, do not touch @@ -735,8 +726,15 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + if (unlikely((s64)delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; return; + } + + /* kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -749,8 +747,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(dl_se)) { + dl_se->runtime -= delta_exec; + +throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) @@ -917,7 +917,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. 
*/ - if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); else if (flags & ENQUEUE_REPLENISH) replenish_dl_entity(dl_se, pi_se); @@ -966,7 +966,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_dl_entity(&p->dl, pi_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -994,18 +994,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ static void yield_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - /* * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - if (p->dl.runtime > 0) { - rq->curr->dl.dl_yielded = 1; - p->dl.runtime = 0; - } + rq->curr->dl.dl_yielded = 1; + update_rq_clock(rq); update_curr_dl(rq); /* @@ -1044,9 +1040,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * try to make it stay here, it might be important. */ if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + (tsk_nr_cpus_allowed(p) > 1)) { int target = find_later_rq(p); if (target != -1 && @@ -1067,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) return; @@ -1075,7 +1071,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. 
*/ - if (p->nr_cpus_allowed != 1 && + if (tsk_nr_cpus_allowed(p) != 1 && cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; @@ -1129,7 +1125,8 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) +struct task_struct * +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1144,9 +1141,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_dl_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1189,7 +1186,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_dl_task(rq, p); } @@ -1290,7 +1287,7 @@ static int find_later_rq(struct task_struct *task) if (unlikely(!later_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* @@ -1396,8 +1393,9 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, - &task->cpus_allowed) || + tsk_cpus_allowed(task)) || task_running(rq, task) || + !dl_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; @@ -1435,7 +1433,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); 
BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); @@ -1474,7 +1472,7 @@ retry: */ if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && - rq->curr->nr_cpus_allowed > 1) { + tsk_nr_cpus_allowed(rq->curr) > 1) { resched_curr(rq); return 0; } @@ -1621,9 +1619,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && dl_task(rq->curr) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } @@ -1722,9 +1720,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (dl_time_before(p->dl.deadline, rq_clock(rq))) + setup_new_dl_entity(&p->dl, &p->dl); + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); #else if (dl_task(rq->curr)) @@ -1768,8 +1769,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ resched_curr(rq); #endif /* CONFIG_SMP */ - } else - switched_to_dl(rq, p); + } } const struct sched_class dl_sched_class = { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..0368c393a336 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -16,6 +16,7 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> #include <linux/mempolicy.h> +#include <linux/debugfs.h> #include "sched.h" @@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#define SCHED_FEAT(name, enabled) \ + #name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + 
+#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + static_key_disable(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + static_key_enable(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ + int i; + int neg = 0; + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } + break; + } + } + + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + struct inode *inode; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + inode_lock(inode); + i = sched_feat_set(cmp); + inode_unlock(inode); + if (i == __SCHED_FEAT_NR) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const 
struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#ifdef CONFIG_SMP + +#ifdef CONFIG_SYSCTL + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. 
+ */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax, 
false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -75,16 +379,18 @@ static void 
print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + if (schedstat_enabled()) { + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); + } #endif P(se->load.weight); #ifdef CONFIG_SMP @@ -121,17 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); -#ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); -#else + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - 0LL, 0L, + SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - 0LL, 0L); -#endif + SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif @@ -258,8 +559,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { + struct dl_bw *dl_bw; + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +#ifdef CONFIG_SMP + dl_bw = &cpu_rq(cpu)->rd->dl_bw; +#else + dl_bw = &dl_rq->dl_bw; +#endif + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", 
dl_bw->bw); + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); } extern __read_mostly int sched_clock_running; @@ -309,24 +619,25 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SCHEDSTATS -#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); -#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - - P(yld_count); - - P(sched_count); - P(sched_goidle); #ifdef CONFIG_SMP +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); +#undef P64 #endif - P(ttwu_count); - P(ttwu_local); +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + + if (schedstat_enabled()) { + P(yld_count); + P(sched_count); + P(sched_goidle); + P(ttwu_count); + P(ttwu_local); + } #undef P -#undef P64 #endif spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); @@ -569,38 +880,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - 
P(se.statistics.nr_wakeups_idle); - { + if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; + PN(se.statistics.sum_sleep_runtime); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..bdcbeea90c95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/latencytop.h> #include <linux/cpumask.h> #include <linux/cpuidle.h> #include <linux/slab.h> @@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw) * OR * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT * - * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case + * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case * we're guaranteed shift stays positive because inv_weight is guaranteed to * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 
22. * @@ -682,17 +682,68 @@ void init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the left utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, a simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap. 
+ */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } +} + static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq); #else void init_entity_runnable_average(struct sched_entity *se) { } +void post_init_entity_util_avg(struct sched_entity *se) +{ +} #endif /* @@ -755,7 +806,9 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; if (entity_is_task(se)) { p = task_of(se); @@ -776,22 +829,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) se->statistics.wait_sum += delta; se->statistics.wait_start = 0; } -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} -#endif /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? 
(for current tasks @@ -802,7 +845,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -810,7 +853,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ } +#endif /* * We are picking a new current task - update its stats: @@ -907,10 +983,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -969,6 +1046,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. 
Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1118,27 +1207,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* - * Do not migrate if the destination is not a node that - * is actively used by this numa group. - */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* - * Source is a node that is not actively used by this - * numa group, while the destination is. Migrate. + * Destination node is much more heavily used than the source + * node? Allow migration. */ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* - * Both source and destination are nodes in active - * use by this numa group. Maximize memory bandwidth - * by migrating from more heavily used groups, to less - * heavily used ones, spreading the load around. - * Use a 1/4 hysteresis to avoid spurious page movement. 
+ * Distribute memory according to CPU & memory use on each node, + * with 3/4 hysteresis to avoid unnecessary memory migrations: + * + * faults_cpu(dst) 3 faults_cpu(src) + * --------------- * - > --------------- + * faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } static unsigned long weighted_cpuload(const int cpu); @@ -1484,7 +1569,7 @@ static int task_numa_migrate(struct task_struct *p) .best_task = NULL, .best_imp = 0, - .best_cpu = -1 + .best_cpu = -1, }; struct sched_domain *sd; unsigned long taskweight, groupweight; @@ -1536,8 +1621,7 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && - nodes_weight(p->numa_group->active_nodes) > 1)) { + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1572,12 +1656,14 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { + struct numa_group *ng = p->numa_group; + if (env.best_cpu == -1) nid = env.src_nid; else nid = env.dst_nid; - if (node_isset(nid, p->numa_group->active_nodes)) + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) sched_setnuma(p, env.dst_nid); } @@ -1627,20 +1713,15 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes on the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. 
This can * be different from the set of nodes where the workload's memory is currently * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16. */ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group) { unsigned long faults, max_faults = 0; - int nid; + int nid, active_nodes = 0; for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); @@ -1650,12 +1731,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); - if (!node_isset(nid, numa_group->active_nodes)) { - if (faults > max_faults * 6 / 16) - node_set(nid, numa_group->active_nodes); - } else if (faults < max_faults * 3 / 16) - node_clear(nid, numa_group->active_nodes); + if (faults * ACTIVE_NODE_FRACTION > max_faults) + active_nodes++; } + + numa_group->max_faults_cpu = max_faults; + numa_group->active_nodes = active_nodes; } /* @@ -1946,7 +2027,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { - update_numa_active_node_mask(p->numa_group); + numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_group_nid); } @@ -1990,14 +2071,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, return; atomic_set(&grp->refcount, 1); + grp->active_nodes = 1; + grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * nr_node_ids; - node_set(task_node(current), grp->active_nodes); - for (i = 0; i 
< NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2111,6 +2192,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; int priv; if (!static_branch_likely(&sched_numa_balancing)) @@ -2151,9 +2233,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - if (!priv && !local && p->numa_group && - node_isset(cpu_node, p->numa_group->active_nodes) && - node_isset(mem_node, p->numa_group->active_nodes)) + ng = p->numa_group; + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) local = 1; task_numa_placement(p); @@ -2405,10 +2488,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); +#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } +#endif cfs_rq->nr_running--; } @@ -2518,6 +2603,16 @@ static const u32 runnable_avg_yN_sum[] = { }; /* + * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to + * lower integers. 
See Documentation/scheduler/sched-avg.txt how these + * were generated: + */ +static const u32 __accumulated_sum_N32[] = { + 0, 23371, 35056, 40899, 43820, 45281, + 46011, 46376, 46559, 46650, 46696, 46719, +}; + +/* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ @@ -2565,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n) else if (unlikely(n >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ - do { - contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ - contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; - - n -= LOAD_AVG_PERIOD; - } while (n > LOAD_AVG_PERIOD); - + /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ + contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; + n %= LOAD_AVG_PERIOD; contrib = decay_load(contrib, n); return contrib + runnable_avg_yN_sum[n]; } -#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 -#error "load tracking assumes 2^10 as unit" -#endif - #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* @@ -2789,23 +2875,71 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. 
+ * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } +} + +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed_load = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); - sa->load_avg = max_t(long, sa->load_avg - r, 0); - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); - removed = 1; + sub_positive(&sa->load_avg, r); + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); + removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); - sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->util_avg, r); + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + removed_util = 1; } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -2816,7 +2950,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - return decayed || removed; + if 
(update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + + return decayed || removed_load; } /* Update task and its cfs_rq load average */ @@ -2824,7 +2961,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int cpu = cpu_of(rq_of(cfs_rq)); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Track task load average for carrying it to new CPU after migrated, and @@ -2834,7 +2972,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) update_tg_load_avg(cfs_rq, 0); } @@ -2863,6 +3001,8 @@ skip_aging: cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + + cfs_rq_util_change(cfs_rq); } static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2871,10 +3011,12 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -2892,7 +3034,7 @@ enqueue_entity_load_avg(struct cfs_rq 
*cfs_rq, struct sched_entity *se) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; @@ -2974,7 +3116,14 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline void update_load_avg(struct sched_entity *se, int not_used) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct rq *rq = rq_of(cfs_rq); + + cpufreq_trigger_update(rq_clock(rq)); +} + static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -3102,32 +3251,97 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + + +/* + * MIGRATION + * + * dequeue + * update_curr() + * update_min_vruntime() + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way the vruntime transition between RQs is done when both + * min_vruntime are up-to-date. 
+ * + * WAKEUP (remote) + * + * ->migrate_task_rq_fair() (p->state == TASK_WAKING) + * vruntime -= min_vruntime + * + * enqueue + * update_curr() + * update_min_vruntime() + * vruntime += min_vruntime + * + * this way we don't have the most up-to-date min_vruntime on the originating + * CPU and an up-to-date min_vruntime on the destination CPU. + */ + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); + bool curr = cfs_rq->curr == se; + /* - * Update the normalized vruntime before updating min_vruntime - * through calling update_curr(). + * If we're the current task, we must renormalise before calling + * update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) + if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; + update_curr(cfs_rq); + /* - * Update run-time statistics of the 'current'. + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. Being + * placed in the past could significantly boost this task to the + * fairness detriment of existing tasks. 
*/ - update_curr(cfs_rq); + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - if (se != cfs_rq->curr) + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } + if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3193,19 +3407,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3279,7 +3482,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3292,7 +3496,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3375,9 +3579,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -3994,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; + /* Synchronize hierarchical throttle counter: */ + if (unlikely(!cfs_rq->throttle_uptodate)) { + struct rq *rq = rq_of(cfs_rq); + struct cfs_rq *pcfs_rq; + struct task_group *tg; + + cfs_rq->throttle_uptodate = 1; + + /* Get closest up-to-date node, because leaves go first: */ + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { + pcfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (pcfs_rq->throttle_uptodate) + break; + } + if (tg) { + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(rq); + } + } + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4309,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. 
*/ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; @@ -4341,7 +4568,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - +#ifdef CONFIG_NO_HZ_COMMON /* * per rq 'load' array crap; XXX kill this. */ @@ -4407,13 +4634,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) } return load; } +#endif /* CONFIG_NO_HZ_COMMON */ /** - * __update_cpu_load - update the rq->cpu_load[] statistics + * cpu_load_update - update the rq->cpu_load[] statistics * @this_rq: The rq to update statistics for * @this_load: The current load * @pending_updates: The number of missed updates - * @active: !0 for NOHZ_FULL * * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). @@ -4442,12 +4669,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * load[i]_n = (1 - 1/2^i)^n * load[i]_0 * * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra - * term. See the @active paramter. + * term. */ -static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates, int active) +static void cpu_load_update(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long tickless_load = active ? 
this_rq->cpu_load[0] : 0; + unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; int i, scale; this_rq->nr_load_updates++; @@ -4459,9 +4686,19 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ - old_load = this_rq->cpu_load[i] - tickless_load; + old_load = this_rq->cpu_load[i]; +#ifdef CONFIG_NO_HZ_COMMON old_load = decay_load_missed(old_load, pending_updates - 1, i); - old_load += tickless_load; + if (tickless_load) { + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); + /* + * old_load can never be a negative value because a + * decayed tickless_load cannot be greater than the + * original tickless_load. + */ + old_load += tickless_load; + } +#endif new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4489,75 +4726,110 @@ static unsigned long weighted_cpuload(const int cpu) * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. * - * Therefore we cannot use the delta approach from the regular tick since that - * would seriously skew the load calculation. However we'll make do for those - * updates happening while idle (nohz_idle_balance) or coming out of idle - * (tick_nohz_idle_exit). + * Therefore we need to avoid the delta approach from the regular tick when + * possible since that would seriously skew the load calculation. This is why we + * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on + * jiffies deltas for updates happening while in nohz mode (idle ticks, idle + * loop exit, nohz_idle_balance, nohz full exit...) * * This means we might still be one tick off for nohz periods. 
*/ +static void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) +{ + unsigned long pending_updates; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. + */ + cpu_load_update(this_rq, load, pending_updates); + } +} + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_idle_cpu_load(struct rq *this_rq) +static void cpu_load_update_idle(struct rq *this_rq) { - unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = weighted_cpuload(cpu_of(this_rq)); - unsigned long pending_updates; - /* * bail if there's load or we're actually up-to-date. */ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (weighted_cpuload(cpu_of(this_rq))) return; - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates, 0); + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); } /* - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + * Record CPU load on nohz entry so we know the tickless load to account + * on nohz exit. cpu_load[0] happens then to be updated more frequently + * than other cpu_load[idx] but it should be fine as cpu_load readers + * shouldn't rely into synchronized cpu_load[*] updates. */ -void update_cpu_load_nohz(int active) +void cpu_load_update_nohz_start(void) { struct rq *this_rq = this_rq(); + + /* + * This is all lockless but should be fine. If weighted_cpuload changes + * concurrently we'll exit nohz. And cpu_load write can race with + * cpu_load_update_idle() but both updater would be writing the same. 
+ */ + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq)); +} + +/* + * Account the tickless load in the end of a nohz frame. + */ +void cpu_load_update_nohz_stop(void) +{ unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; - unsigned long pending_updates; + struct rq *this_rq = this_rq(); + unsigned long load; if (curr_jiffies == this_rq->last_load_update_tick) return; + load = weighted_cpuload(cpu_of(this_rq)); raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * In the regular NOHZ case, we were idle, this means load 0. - * In the NOHZ_FULL case, we were non-idle, we should consider - * its weighted load. - */ - __update_cpu_load(this_rq, load, pending_updates, active); - } + update_rq_clock(this_rq); + cpu_load_update_nohz(this_rq, curr_jiffies, load); raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void cpu_load_update_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* See the mess around cpu_load_update_nohz(). */ + this_rq->last_load_update_tick = READ_ONCE(jiffies); +#endif + cpu_load_update(this_rq, load, 1); +} /* * Called from scheduler_tick() */ -void update_cpu_load_active(struct rq *this_rq) +void cpu_load_update_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); - /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 
- */ - this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1, 1); + + if (tick_nohz_tick_stopped()) + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); + else + cpu_load_update_periodic(this_rq, load); } /* @@ -4615,46 +4887,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -static void record_wakee(struct task_struct *p) -{ - /* - * Rough decay (wiping) for cost saving, don't worry - * about the boundary, really active task won't care - * about the loss. - */ - if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { - current->wakee_flips >>= 1; - current->wakee_flip_decay_ts = jiffies; - } - - if (current->last_wakee != p) { - current->last_wakee = p; - current->wakee_flips++; - } -} - -static void task_waking_fair(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; - record_wakee(p); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -4770,17 +5002,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static void record_wakee(struct task_struct *p) +{ + /* + * Only decay a single time; tasks that have less than 1 wakeup per + * jiffy will not have built up many flips. + */ + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. 
+ * A waker of many should wake a different task than the one last awakened - * at a frequency roughly N times higher than one of its wakees. In order - * to determine whether we should let the load spread vs consolodating to - * shared cache, we look for a minimum 'flip' frequency of llc_size in one - * partner, and a factor of lls_size higher frequency in the other. With - * both conditions met, we can be relatively sure that the relationship is - * non-monogamous, with partner count exceeding socket size. Waker/wakee - * being client/server, worker/dispatcher, interrupt source or whatever is - * irrelevant, spread criteria is apparent partner count exceeds socket size. + * at a frequency roughly N times higher than one of its wakees. + * + * In order to determine whether we should let the load spread vs consolidating + * to shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of llc_size higher frequency in the other. + * + * With both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. + * + * Waker/wakee being client/server, worker/dispatcher, interrupt source or + * whatever is irrelevant, spread criteria is apparent partner count exceeds + * socket size. */ static int wake_wide(struct task_struct *p) { @@ -4987,7 +5241,19 @@ static int select_idle_sibling(struct task_struct *p, int target) return i; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, iterate the domains and find an eligible idle cpu. + * + * A completely idle sched group at higher domains is more + * desirable than an idle group at a lower level, because lower + * domains have smaller groups and usually share hardware + * resources which causes tasks to contend on them, e.g. x86 + * hyperthread siblings in the lowest domain (SMT) can contend + * on the shared cpu pipeline. 
+ * + * However, while we prefer idle groups at higher domains + * finding an idle cpu at the lowest domain is still better than + * returning 'target', which we've already established, isn't + * idle. */ sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { @@ -4997,11 +5263,16 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; + /* Ensure the entire group is idle */ for_each_cpu(i, sched_group_cpus(sg)) { if (i == target || !idle_cpu(i)) goto next; } + /* + * It doesn't matter which cpu we pick, the + * whole group is idle. + */ target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); goto done; @@ -5068,8 +5339,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + } rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5149,6 +5422,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f static void migrate_task_rq_fair(struct task_struct *p) { /* + * As blocked tasks retain absolute vruntime the migration needs to + * deal with this by subtracting the old and adding the new + * min_vruntime -- the latter is done by enqueue_entity() when placing + * the task on the new runqueue. 
+ */ + if (p->state == TASK_WAKING) { + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; + } + + /* * We are supposed to update the task to "current" time, then its up to date * and ready to go to new CPU/cfs_rq. But we have difficulty in getting * what current time is, so simply throw away the out-of-date time. This @@ -5331,7 +5630,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -5444,9 +5743,9 @@ idle: * further scheduler activity on it and we're being very careful to * re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); new_tasks = idle_balance(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -5545,7 +5844,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * W_i,0 = \Sum_j w_i,j (2) * * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight - * is derived from the nice value as per prio_to_weight[]. + * is derived from the nice value as per sched_prio_to_weight[]. 
* * The weight average is an exponential decay average of the instantaneous * weight: @@ -6047,7 +6346,7 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -6108,7 +6407,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -6517,6 +6816,9 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!(env->sd->flags & SD_ASYM_PACKING)) return true; + /* No ASYM_PACKING if target cpu is already busy */ + if (env->idle == CPU_NOT_IDLE) + return true; /* * ASYM_PACKING needs to move all the work to the lowest * numbered CPUs in the group, therefore mark all groups @@ -6526,7 +6828,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (!sds->busiest) return true; - if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + /* Prefer to move from highest possible cpu's work */ + if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) return true; } @@ -6672,6 +6975,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!(env->sd->flags & SD_ASYM_PACKING)) return 0; + if (env->idle == CPU_NOT_IDLE) + return 0; + if (!sds->busiest) return 0; @@ -6780,9 +7086,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_capacity, while calculating max_load..) 
+ * Avg load of busiest sg can be less and avg load of local sg can + * be greater than avg load across all sgs of sd because avg load + * factors in sg capacity and sgs with smaller group_type are + * skipped when updating the busiest sg: */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { @@ -6795,11 +7102,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = busiest->sum_nr_running * - SCHED_LOAD_SCALE; - if (load_above_capacity > busiest->group_capacity) + load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; + if (load_above_capacity > busiest->group_capacity) { load_above_capacity -= busiest->group_capacity; - else + load_above_capacity *= NICE_0_LOAD; + load_above_capacity /= busiest->group_capacity; + } else load_above_capacity = ~0UL; } @@ -6807,9 +7115,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load. At the same time, - * we also don't want to reduce the group load below the group capacity - * (so that we can implement power-savings policies etc). Thus we look - * for the minimum possible imbalance. + * we also don't want to reduce the group load below the group + * capacity. Thus we look for the minimum possible imbalance. */ max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); @@ -6833,10 +7140,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. 
If there isn't an imbalance, and - * the user has opted for power-savings, it returns a group whose - * CPUs can be put to idle by rebalancing those tasks elsewhere, if - * such a group exists. + * if there is an imbalance. * * Also calculates the amount of weighted load which should be moved * to restore balance. @@ -6844,9 +7148,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * @env: The load balancing environment. * * Return: - The busiest group if imbalance exists. - * - If no imbalance and user has opted for power-savings balance, - * return the least loaded group whose CPUs can be - * put to idle by rebalancing its tasks onto our group. */ static struct sched_group *find_busiest_group(struct lb_env *env) { @@ -6864,8 +7165,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest = &sds.busiest_stat; /* ASYM feature bypasses nice load balance check */ - if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && - check_asym_packing(env, &sds)) + if (check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ @@ -7290,10 +7590,7 @@ more_balance: &busiest->active_balance_work); } - /* - * We've kicked active balancing, reset the failure - * counter. - */ + /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } } else @@ -7528,10 +7825,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + /* Active balancing done, reset the failure counter. 
*/ + sd->nr_balance_failed = 0; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: @@ -7602,7 +7902,7 @@ static void nohz_balancer_kick(void) return; } -static inline void nohz_balance_exit_idle(int cpu) +void nohz_balance_exit_idle(unsigned int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* @@ -7675,18 +7975,6 @@ void nohz_balance_enter_idle(int cpu) atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } - -static int sched_ilb_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DYING: - nohz_balance_exit_idle(smp_processor_id()); - return NOTIFY_OK; - default: - return NOTIFY_DONE; - } -} #endif static DEFINE_SPINLOCK(balancing); @@ -7848,7 +8136,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + cpu_load_update_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8234,11 +8522,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -8247,8 +8532,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8263,6 +8549,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8276,6 +8564,10 @@ int 
alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; @@ -8286,21 +8578,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8382,7 +8682,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8424,7 +8724,6 @@ const struct sched_class fair_sched_class = { .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, - .task_waking = task_waking_fair, .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_common, #endif @@ -8486,7 +8785,6 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 544a7133cbd1..c5aeedf4e93a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/cpu.h> #include <linux/cpuidle.h> +#include <linux/cpuhotplug.h> #include <linux/tick.h> #include <linux/mm.h> #include <linux/stackprotector.h> @@ -126,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ static void cpuidle_idle_call(void) { - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_device *dev = cpuidle_get_device(); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; @@ -193,8 +194,6 @@ exit_idle: rcu_idle_exit(); } -DEFINE_PER_CPU(bool, cpu_dead_idle); - /* * Generic idle loop implementation * @@ -221,10 +220,7 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { - rcu_cpu_notify(NULL, 
CPU_DYING_IDLE, - (void *)(long)smp_processor_id()); - smp_mb(); /* all activity before dead. */ - this_cpu_write(cpu_dead_idle, true); + cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -291,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state) boot_init_stack_canary(); #endif arch_cpu_idle_prepare(); + cpuhp_online_idle(state); cpu_idle_loop(); } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 47ce94931f1b..2ce5458bbe1d 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { put_prev_task(rq, prev); diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index ef7159012cf3..b0b93fd33af9 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq) static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; } #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..d5690b722691 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; - hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + /* + * SCHED_DEADLINE updates the bandwidth, as a run away + * RT task with a DL task could hog a CPU. But DL does + * not reset the period. 
If a deadline task was running + * without an RT task running, it can cause RT tasks to + * throttle when they start up. Kick the timer right away + * to update the period. + */ + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); } raw_spin_unlock(&rt_b->rt_runtime_lock); @@ -326,7 +334,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -343,7 +351,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) + if (tsk_nr_cpus_allowed(p) > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if 
(on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -949,6 +957,10 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1142,12 +1154,27 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) } static inline +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + struct task_struct *tsk; + + if (group_rq) + return group_rq->rr_nr_running; + + tsk = rt_task_of(rt_se); + + return (tsk->policy == SCHED_RR) ? 1 : 0; +} + +static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); rt_rq->rt_nr_running += rt_se_nr_running(rt_se); + rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1160,13 +1187,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); + rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if 
(list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1179,26 +1230,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1207,7 +1269,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. 
*/ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1220,31 +1282,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1260,9 +1322,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); - if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1271,7 +1333,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); } @@ -1351,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * will have to sort it out. 
*/ if (curr && unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(curr) < 2 || curr->prio <= p->prio)) { int target = find_lowest_rq(p); @@ -1375,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * Current can't be migrated, useless to reschedule, * let's hope p can move out. */ - if (rq->curr->nr_cpus_allowed == 1 || + if (tsk_nr_cpus_allowed(rq->curr) == 1 || !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; @@ -1383,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 + if (tsk_nr_cpus_allowed(p) != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1462,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1474,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. 
*/ - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, cookie); pull_rt_task(rq); - lockdep_pin_lock(&rq->lock); + lockdep_repin_lock(&rq->lock, cookie); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -1517,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1) enqueue_pushable_task(rq, p); } @@ -1567,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->nr_cpus_allowed == 1) + if (tsk_nr_cpus_allowed(task) == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1667,6 +1729,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) || task_running(rq, task) || + !rt_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); @@ -1699,7 +1762,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + BUG_ON(tsk_nr_cpus_allowed(p) <= 1); BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); @@ -2059,9 +2122,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && + tsk_nr_cpus_allowed(p) > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || + (tsk_nr_cpus_allowed(rq->curr) < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -2134,7 +2197,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if 
(p->nr_cpus_allowed > 1 && rq->rt.overloaded) + if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) queue_push_tasks(rq); #else if (p->prio < rq->curr->prio) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..7cbeb92a1cb9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> +#include <linux/binfmts.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> @@ -30,9 +31,9 @@ extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP -extern void update_cpu_load_active(struct rq *this_rq); +extern void cpu_load_update_active(struct rq *this_rq); #else -static inline void update_cpu_load_active(struct rq *this_rq) { } +static inline void cpu_load_update_active(struct rq *this_rq) { } #endif /* @@ -48,25 +49,32 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } * and does not change the user-interface for setting shares/weights. * * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the - * increased costs. + * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are + * pretty high and the returns do not justify the increased costs. + * + * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to + * increase coverage and consistency always enable it on 64bit platforms. 
*/ -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ -# define SCHED_LOAD_RESOLUTION 10 -# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) -# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#ifdef CONFIG_64BIT +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) +# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT) +# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT) #else -# define SCHED_LOAD_RESOLUTION 0 +# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT) # define scale_load(w) (w) # define scale_load_down(w) (w) #endif -#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT +/* + * Task weight (visible to users) and its load (invisible to users) have + * independent resolution, but they should be well calibrated. We use + * scale_load() and scale_load_down(w) to convert between them. The + * following must be true: + * + * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * + */ +#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) /* * Single value that decides SCHED_DEADLINE internal math precision. 
@@ -313,12 +321,11 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); @@ -430,7 +437,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count; + int throttled, throttle_count, throttle_uptodate; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -450,6 +457,7 @@ static inline int rt_bandwidth_enabled(void) struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; + unsigned int rr_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ @@ -584,11 +592,13 @@ struct rq { #endif #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_SMP + unsigned long last_load_update_tick; +#endif /* CONFIG_SMP */ u64 nohz_stamp; unsigned long nohz_flags; -#endif +#endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif @@ -853,7 +863,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_capacity { atomic_t ref; /* - * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity + * CPU capacity of this group, SCHED_CAPACITY_SCALE being 
max capacity * for a single CPU. */ unsigned int capacity; @@ -909,6 +919,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + #else static inline void sched_ttwu_pending(void) { } @@ -1022,6 +1044,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; static inline u64 global_rt_period(void) { @@ -1130,18 +1153,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) extern const int sched_prio_to_weight[40]; extern const u32 sched_prio_to_wmult[40]; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_MIGRATED 0x20 #else -#define ENQUEUE_WAKING 0x00 +#define ENQUEUE_MIGRATED 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) @@ -1164,14 +1209,14 @@ struct sched_class { * tasks. */ struct task_struct * (*pick_next_task) (struct rq *rq, - struct task_struct *prev); + struct task_struct *prev, + struct pin_cookie cookie); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p); - void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, @@ -1277,6 +1322,36 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); + +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. 
+ */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif static inline void add_nr_running(struct rq *rq, unsigned count) { @@ -1289,26 +1364,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (!rq->rd->overload) rq->rd->overload = true; #endif - -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(rq->cpu)) { - /* - * Tick is needed if more than one task runs on a CPU. - * Send the target an IPI to kick it out of nohz mode. - * - * We assume that IPI implies full memory barrier and the - * new value of rq->nr_running is visible on reception - * from the target. - */ - tick_nohz_full_kick_cpu(rq->cpu); - } -#endif } + + sched_update_tick_dependency(rq); } static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); } static inline void rq_last_tick_reset(struct rq *rq) @@ -1393,86 +1458,32 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -/* - * __task_rq_lock - lock the rq @p resides on. 
- */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +}; -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; + __acquires(rq->lock); - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - /* - * move_queued_task() task_rq_lock() - * - * ACQUIRE (rq->lock) - * [S] ->on_rq = MIGRATING [L] rq = task_rq() - * WMB (__set_task_cpu()) ACQUIRE (rq->lock); - * [S] ->cpu = new_cpu [L] task_rq() - * [L] ->on_rq - * RELEASE (rq->lock) - * - * If we observe the old cpu in task_rq_lock, the acquire of - * the old rq->lock will fully serialize against the stores. - * - * If we observe the new cpu in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. 
- */ - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - lockdep_pin_lock(&rq->lock); - return rq; - } - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - - while (unlikely(task_on_rq_migrating(p))) - cpu_relax(); - } -} - -static inline void __task_rq_unlock(struct rq *rq) +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); } static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock); + lockdep_unpin_lock(&rq->lock, rf->cookie); raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } #ifdef CONFIG_SMP @@ -1688,6 +1699,10 @@ enum rq_nohz_flag_bits { }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) + +extern void nohz_balance_exit_idle(unsigned int cpu); +#else +static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -1738,3 +1753,72 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. 
+ */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. 
+ */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +} diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632de5..78955cbea31c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val) do { var = (val); } while (0) +# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +# define schedstat_val(rq, field) ((schedstat_enabled()) ? 
(rq)->field : 0) + #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,9 +45,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} +# define schedstat_enabled() 0 # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) +# define schedstat_val(rq, field) 0 #endif #ifdef CONFIG_SCHED_INFO diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..604297a08b3a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) { struct task_struct *stop = rq->stop; diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000000..82f0dff90030 --- /dev/null +++ b/kernel/sched/swait.c @@ -0,0 +1,123 @@ +#include <linux/sched.h> +#include <linux/swait.h> + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key) +{ + raw_spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value; I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). 
+ */ +void swake_up_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + + if (list_empty(&q->task_list)) + return; + + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up(struct swait_queue_head *q) +{ + unsigned long flags; + + if (!swait_active(q)) + return; + + raw_spin_lock_irqsave(&q->lock, flags); + swake_up_locked(q); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ disabled, since we must be able to + * release IRQs to guarantee bounded hold time. + */ +void swake_up_all(struct swait_queue_head *q) +{ + struct swait_queue *curr; + LIST_HEAD(tmp); + + if (!swait_active(q)) + return; + + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { + curr = list_first_entry(&tmp, typeof(*curr), task_list); + + wake_up_state(curr->task, TASK_NORMAL); + list_del_init(&curr->task_list); + + if (list_empty(&tmp)) + break; + + raw_spin_unlock_irq(&q->lock); + raw_spin_lock_irq(&q->lock); + } + raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + wait->task = current; + if (list_empty(&wait->task_list)) + list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&q->lock, flags); + __prepare_to_swait(q, wait); + set_current_state(state); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + prepare_to_swait(q, wait, state); + + return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void 
__finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait->task_list)) { + raw_spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + raw_spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_swait); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 15a1795bbba1..7002796f14a4 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,7 +395,7 @@ seccomp_prepare_user_filter(const char __user *user_filter) struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT - if (is_compat_task()) { + if (in_compat_syscall()) { struct compat_sock_fprog fprog32; if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) goto out; @@ -513,24 +513,17 @@ static void seccomp_send_sigsys(int syscall, int reason) * To be fully secure this must be combined with rlimit * to limit the stack allocations too. 
*/ -static int mode1_syscalls[] = { +static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, 0, /* null terminated */ }; -#ifdef CONFIG_COMPAT -static int mode1_syscalls_32[] = { - __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, - 0, /* null terminated */ -}; -#endif - static void __secure_computing_strict(int this_syscall) { - int *syscall_whitelist = mode1_syscalls; + const int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT - if (is_compat_task()) - syscall_whitelist = mode1_syscalls_32; + if (in_compat_syscall()) + syscall_whitelist = get_compat_mode1_syscalls(); #endif do { if (*syscall_whitelist == this_syscall) @@ -915,7 +908,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, fprog = filter->prog->orig_prog; if (!fprog) { - /* This must be a new non-cBPF filter, since we save every + /* This must be a new non-cBPF filter, since we save * every cBPF filter's orig_prog above when * CONFIG_CHECKPOINT_RESTORE is enabled. 
*/ diff --git a/kernel/signal.c b/kernel/signal.c index 0508544c8ced..96e9bc40667f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -224,7 +224,7 @@ static inline void print_dropped_signal(int sig) if (!__ratelimit(&ratelimit_state)) return; - printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } @@ -1089,10 +1089,10 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, static void print_fatal_signal(int signr) { struct pt_regs *regs = signal_pt_regs(); - printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr); + pr_info("potentially unexpected fatal signal %d.\n", signr); #if defined(__i386__) && !defined(__arch_um__) - printk(KERN_INFO "code at %08lx: ", regs->ip); + pr_info("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { @@ -1100,10 +1100,10 @@ static void print_fatal_signal(int signr) if (get_user(insn, (unsigned char *)(regs->ip + i))) break; - printk(KERN_CONT "%02x ", insn); + pr_cont("%02x ", insn); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); #endif preempt_disable(); show_regs(regs); @@ -2709,6 +2709,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_upper, &to->si_upper); } #endif +#ifdef SEGV_PKUERR + if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR) + err |= __put_user(from->si_pkey, &to->si_pkey); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); @@ -3095,12 +3099,14 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s oss.ss_sp = (void __user *) current->sas_ss_sp; oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp); + oss.ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); if (uss) { void __user *ss_sp; size_t ss_size; - int ss_flags; + unsigned ss_flags; + int ss_mode; error = -EFAULT; if 
(!access_ok(VERIFY_READ, uss, sizeof(*uss))) @@ -3115,18 +3121,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s if (on_sig_stack(sp)) goto out; + ss_mode = ss_flags & ~SS_FLAG_BITS; error = -EINVAL; - /* - * Note - this code used to test ss_flags incorrectly: - * old code may have been written using ss_flags==0 - * to mean ss_flags==SS_ONSTACK (as this was the only - * way that worked) - this fix preserves that older - * mechanism. - */ - if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) + if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0) goto out; - if (ss_flags == SS_DISABLE) { + if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { @@ -3137,6 +3138,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s current->sas_ss_sp = (unsigned long) ss_sp; current->sas_ss_size = ss_size; + current->sas_ss_flags = ss_flags; } error = 0; @@ -3167,9 +3169,14 @@ int restore_altstack(const stack_t __user *uss) int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; - return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | - __put_user(sas_ss_flags(sp), &uss->ss_flags) | + int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | + __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); + if (err) + return err; + if (t->sas_ss_flags & SS_AUTODISARM) + sas_ss_reset(t); + return 0; } #ifdef CONFIG_COMPAT @@ -3581,6 +3588,10 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { + /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! 
*/ + BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE + != offsetof(struct siginfo, _sifields._pad)); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } diff --git a/kernel/smp.c b/kernel/smp.c index d903c02223af..74165443c240 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -105,13 +105,12 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(struct call_single_data *csd) { - while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) - cpu_relax(); + smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); } -static void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(struct call_single_data *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd) smp_wmb(); } -static void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(struct call_single_data *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -569,6 +568,7 @@ void __init smp_init(void) unsigned int cpu; idle_threads_init(); + cpuhp_threads_init(); /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59bff56..13bc43d1fb22 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp kthread_unpark(tsk); } -void smpboot_unpark_threads(unsigned int cpu) +int smpboot_unpark_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu) if (cpumask_test_cpu(cpu, cur->cpumask)) smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) @@ -245,7 
+246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kthread_park(tsk); } -void smpboot_park_threads(unsigned int cpu) +int smpboot_park_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu) list_for_each_entry_reverse(cur, &hotplug_threads, list) smpboot_park_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 72415a0eb955..485b81cfab34 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -14,7 +14,9 @@ static inline void idle_threads_init(void) { } #endif int smpboot_create_threads(unsigned int cpu); -void smpboot_park_threads(unsigned int cpu); -void smpboot_unpark_threads(unsigned int cpu); +int smpboot_park_threads(unsigned int cpu); +int smpboot_unpark_threads(unsigned int cpu); + +void __init cpuhp_threads_init(void); #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f787..17caf4b63342 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) { #ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); + current->preempt_disable_ip = get_lock_parent_ip(); #endif - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); } } EXPORT_SYMBOL(__local_bh_disable_ip); @@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage __visible void __do_softirq(void) +asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; diff --git a/kernel/sys.c b/kernel/sys.c index 78947de6f969..89d5be418157 100644 --- 
a/kernel/sys.c +++ b/kernel/sys.c @@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) @@ -2243,7 +2246,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; - down_write(&me->mm->mmap_sem); + if (down_write_killable(&me->mm->mmap_sem)) + return -EINTR; if (arg2) me->mm->def_flags |= VM_NOHUGEPAGE; else diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97715fd9e790..87b2fc38398b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,9 +126,13 @@ static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_PERF_EVENTS +static int six_hundred_forty_kb = 640 * 1024; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -350,6 +354,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -505,7 +520,7 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_latencytop, }, #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -1132,6 +1147,24 @@ static struct ctl_table kern_table[] = { 
.extra1 = &zero, .extra2 = &one_hundred, }, + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &one_thousand, + }, #endif #ifdef CONFIG_KMEMCHECK { @@ -1393,6 +1426,15 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = &one, + .extra2 = &one_thousand, + }, + { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = sizeof(percpu_pagelist_fraction), @@ -1488,6 +1530,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, #endif #ifdef CONFIG_MMU { diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7e7746a42a62..6eb99c17dbd8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,7 @@ #include <linux/ctype.h> #include <linux/netdevice.h> #include <linux/kernel.h> +#include <linux/uuid.h> #include <linux/slab.h> #include <linux/compat.h> @@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file, /* Only supports reads */ if (oldval && oldlen) { - char buf[40], *str = buf; - unsigned char uuid[16]; - int i; + char buf[UUID_STRING_LEN + 1]; + uuid_be uuid; result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) @@ -1127,24 +1127,15 @@ static ssize_t 
bin_uuid(struct file *file, buf[result] = '\0'; - /* Convert the uuid to from a string to binary */ - for (i = 0; i < 16; i++) { - result = -EIO; - if (!isxdigit(str[0]) || !isxdigit(str[1])) - goto out; - - uuid[i] = (hex_to_bin(str[0]) << 4) | - hex_to_bin(str[1]); - str += 2; - if (*str == '-') - str++; - } + result = -EIO; + if (uuid_be_to_bin(buf, &uuid)) + goto out; if (oldlen > 16) oldlen = 16; result = -EFAULT; - if (copy_to_user(oldval, uuid, oldlen)) + if (copy_to_user(oldval, &uuid, oldlen)) goto out; copied = oldlen; @@ -1321,7 +1312,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, } mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 21f82c29c914..b3f05ee20d18 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -357,10 +357,6 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } -#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define TASKSTATS_NEEDS_PADDING 1 -#endif - static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) { struct nlattr *na, *ret; @@ -370,29 +366,6 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; - /* - * The taskstats structure is internally aligned on 8 byte - * boundaries but the layout of the aggregrate reply, with - * two NLA headers and the pid (each 4 bytes), actually - * force the entire structure to be unaligned. This causes - * the kernel to issue unaligned access warnings on some - * architectures like ia64. Unfortunately, some software out there - * doesn't properly unroll the NLA packet and assumes that the start - * of the taskstats structure will always be 20 bytes from the start - * of the netlink payload. 
Aligning the start of the taskstats - * structure breaks this software, which we don't want. So, for now - * the alignment only happens on architectures that require it - * and those users will have to update to fixed versions of those - * packages. Space is reserved in the packet only when needed. - * This ifdef should be removed in several years e.g. 2012 once - * we can be confident that fixed versions are installed on most - * systems. We add the padding before the aggregate since the - * aggregate is already a defined type. - */ -#ifdef TASKSTATS_NEEDS_PADDING - if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) - goto err; -#endif na = nla_nest_start(skb, aggr); if (!na) goto err; @@ -401,7 +374,8 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) nla_nest_cancel(skb, na); goto err; } - ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); + ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS, + sizeof(struct taskstats), TASKSTATS_TYPE_NULL); if (!ret) { nla_nest_cancel(skb, na); goto err; @@ -500,10 +474,9 @@ static size_t taskstats_packet_size(void) size_t size; size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); -#ifdef TASKSTATS_NEEDS_PADDING - size += nla_total_size(0); /* Padding for alignment */ -#endif + nla_total_size_64bit(sizeof(struct taskstats)) + + nla_total_size(0); + return size; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 664de539299b..56ece145a814 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. 
*/ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. 
*/ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. 
*/ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..e99df0ff1d42 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -342,30 +342,25 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ -static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return 0; - case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -373,7 +368,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -381,9 +376,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return 
false; } } @@ -430,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } +EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } @@ -515,7 +511,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * High resolution timer enabled ? */ -static int hrtimer_hres_enabled __read_mostly = 1; +static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); @@ -524,13 +520,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution); */ static int __init setup_hrtimer_hres(char *str) { - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); @@ -979,7 +969,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -1548,7 +1538,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1724,7 +1714,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1792,7 +1782,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, 
unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 347fecf86a3f..555e21f7b966 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ + .mask = CLOCKSOURCE_MASK(32), .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d282d52..1cafba860b08 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) return err; } - /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. * This is called from sys_timer_create() and do_cpu_nanosleep() with the @@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer) cputime_expires->sched_exp = exp; break; } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } } @@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } -#ifdef CONFIG_NO_HZ_FULL -static void nohz_kick_work_fn(struct work_struct *work) -{ - tick_nohz_full_kick_all(); -} - -static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); - -/* - * We need the IPIs to be sent from sane process context. - * The posix cpu timers are always set with irqs disabled. 
- */ -static void posix_cpu_timer_kick_nohz(void) -{ - if (context_tracking_is_enabled()) - schedule_work(&nohz_kick_work); -} - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) -{ - if (!task_cputime_zero(&tsk->cputime_expires)) - return false; - - /* Check if cputimer is running. This is accessed without locking. */ - if (READ_ONCE(tsk->signal->cputimer.running)) - return false; - - return true; -} -#else -static inline void posix_cpu_timer_kick_nohz(void) { } -#endif - /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } - if (!ret) - posix_cpu_timer_kick_nohz(); + return ret; } @@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } + if (task_cputime_zero(tsk_expires)) + tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } static inline void stop_process_timers(struct signal_struct *sig) @@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig) /* Turn off cputimer->running. This is done without locking. 
*/ WRITE_ONCE(cputimer->running, false); + tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } static u32 onecputick; @@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) arm_timer(timer); unlock_task_sighand(p, &flags); - /* Kick full dynticks CPUs in case they need to tick on the new timer */ - posix_cpu_timer_kick_nohz(); out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - goto out; + return; *newval += now; } @@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } -out: - posix_cpu_timer_kick_nohz(); + + tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0b17424349eb..536ada80f6dd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -22,7 +22,6 @@ #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> -#include <linux/perf_event.h> #include <linux/context_tracking.h> #include <asm/irq_regs.h> @@ -158,54 +157,61 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) cpumask_var_t tick_nohz_full_mask; cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; +static atomic_t tick_dep_mask; -static bool can_stop_full_tick(void) +static bool check_tick_dependency(atomic_t *dep) +{ + int val = atomic_read(dep); + + if (val & TICK_DEP_MASK_POSIX_TIMER) { + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); + return true; + } + + if (val & TICK_DEP_MASK_PERF_EVENTS) { + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); + return true; + } + + if (val & TICK_DEP_MASK_SCHED) { + trace_tick_stop(0, TICK_DEP_MASK_SCHED); + return true; + } + + if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { + 
trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); + return true; + } + + return false; +} + +static bool can_stop_full_tick(struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); - if (!sched_can_stop_tick()) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + if (check_tick_dependency(&tick_dep_mask)) return false; - } - if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, "posix timers running\n"); + if (check_tick_dependency(&ts->tick_dep_mask)) return false; - } - if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, "perf events running\n"); + if (check_tick_dependency(&current->tick_dep_mask)) return false; - } - /* sched_clock_tick() needs us? */ -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - /* - * TODO: kick full dynticks CPUs when - * sched_clock_stable is set. - */ - if (!sched_clock_stable()) { - trace_tick_stop(0, "unstable sched clock\n"); - /* - * Don't allow the user to think they can get - * full NO_HZ with this machine. - */ - WARN_ONCE(tick_nohz_full_running, - "NO_HZ FULL will not work with unstable sched clock"); + if (check_tick_dependency(&current->signal->tick_dep_mask)) return false; - } -#endif return true; } -static void nohz_full_kick_work_func(struct irq_work *work) +static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_work_func, + .func = nohz_full_kick_func, }; /* @@ -214,7 +220,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. 
*/ -void tick_nohz_full_kick(void) +static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; @@ -234,27 +240,112 @@ void tick_nohz_full_kick_cpu(int cpu) irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } -static void nohz_full_kick_ipi(void *info) -{ - /* Empty, the tick restart happens on tick_nohz_irq_exit() */ -} - /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ -void tick_nohz_full_kick_all(void) +static void tick_nohz_full_kick_all(void) { + int cpu; + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(tick_nohz_full_mask, - nohz_full_kick_ipi, NULL, false); - tick_nohz_full_kick(); + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) + tick_nohz_full_kick_cpu(cpu); preempt_enable(); } +static void tick_nohz_dep_set_all(atomic_t *dep, + enum tick_dep_bits bit) +{ + int prev; + + prev = atomic_fetch_or(BIT(bit), dep); + if (!prev) + tick_nohz_full_kick_all(); +} + +/* + * Set a global tick dependency. Used by perf events that rely on freq and + * by unstable clock. + */ +void tick_nohz_dep_set(enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&tick_dep_mask, bit); +} + +void tick_nohz_dep_clear(enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &tick_dep_mask); +} + +/* + * Set per-CPU tick dependency. Used by scheduler and perf events in order to + * manage events throttling. 
+ */ +void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + int prev; + struct tick_sched *ts; + + ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); + if (!prev) { + preempt_disable(); + /* Perf needs local kick that is NMI safe */ + if (cpu == smp_processor_id()) { + tick_nohz_full_kick(); + } else { + /* Remote irq work not NMI-safe */ + if (!WARN_ON_ONCE(in_nmi())) + tick_nohz_full_kick_cpu(cpu); + } + preempt_enable(); + } +} + +void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + atomic_andnot(BIT(bit), &ts->tick_dep_mask); +} + +/* + * Set a per-task tick dependency. Posix CPU timers need this in order to elapse + * per task timers. + */ +void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + /* + * We could optimize this with just kicking the target running the task + * if that noise matters for nohz full users. + */ + tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &tsk->tick_dep_mask); +} + +/* + * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse + * per process timers. + */ +void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &sig->tick_dep_mask); +} + /* * Re-evaluate the need for the tick as we switch the current task. 
* It might need the tick due to per task/process properties: @@ -263,15 +354,20 @@ void tick_nohz_full_kick_all(void) void __tick_nohz_task_switch(void) { unsigned long flags; + struct tick_sched *ts; local_irq_save(flags); if (!tick_nohz_full_cpu(smp_processor_id())) goto out; - if (tick_nohz_tick_stopped() && !can_stop_full_tick()) - tick_nohz_full_kick(); + ts = this_cpu_ptr(&tick_cpu_sched); + if (ts->tick_stopped) { + if (atomic_read(&current->tick_dep_mask) || + atomic_read(&current->signal->tick_dep_mask)) + tick_nohz_full_kick(); + } out: local_irq_restore(flags); } @@ -281,7 +377,7 @@ static int __init tick_nohz_full_setup(char *str) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } @@ -349,8 +445,7 @@ void __init tick_nohz_init(void) * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { - pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " - "support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; @@ -360,7 +455,8 @@ void __init tick_nohz_init(void) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", + cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } @@ -389,20 +485,14 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? 
*/ -int tick_nohz_enabled __read_mostly = 1; +bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); @@ -686,10 +776,11 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, if (!ts->tick_stopped) { nohz_balance_enter_idle(cpu); calc_load_enter_idle(); + cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - trace_tick_stop(1, " "); + trace_tick_stop(1, TICK_DEP_MASK_NONE); } /* @@ -712,11 +803,11 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(active); + cpu_load_update_nohz_stop(); calc_load_exit_idle(); touch_softlockup_watchdog_sched(); @@ -740,10 +831,10 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick()) + if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get(), 1); + tick_nohz_restart_sched_tick(ts, ktime_get()); #endif } @@ -934,7 +1025,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now, 0); + tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_ticks(ts); } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index a4a8d4e9baa1..bf38226e5c17 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -60,6 +60,7 
@@ struct tick_sched { u64 next_timer; ktime_t idle_expires; int do_timer_last; + atomic_t tick_dep_mask; }; extern struct tick_sched *tick_get_tick_sched(int cpu); diff --git a/kernel/time/time.c b/kernel/time/time.c index 86751c68e08d..667b9335f5d6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -160,15 +160,15 @@ static inline void warp_clock(void) * various programs will get confused when the clock gets warped. */ -int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) +int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; - if (tv && !timespec_valid(tv)) + if (tv && !timespec64_valid(tv)) return -EINVAL; - error = security_settime(tv, tz); + error = security_settime64(tv, tz); if (error) return error; @@ -186,7 +186,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) } } if (tv) - return do_settimeofday(tv); + return do_settimeofday64(tv); return 0; } @@ -322,6 +322,13 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). + * + * A leap second can be indicated by calling this function with sec as + * 60 (allowable under ISO 8601). The leap second is treated the same + * as the following second since they don't exist in UNIX time. + * + * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight + * tomorrow - (allowable under ISO 8601) is supported. 
*/ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, @@ -338,7 +345,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 - )*24 + hour /* now have hours */ + )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } @@ -762,3 +769,24 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } + +/* + * Add two timespec64 values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. + */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34b4cedfa80d..479d25cd3d4f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,7 +131,7 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", offset, name, max_cycles >> 1); printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } @@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, 
ntpinterval; struct clocksource *old_clock; + ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.read = clock->read; @@ -298,17 +299,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** @@ -857,44 +875,262 @@ time64_t __ktime_get_real_seconds(void) return tk->xtime_sec; } +/** + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + ktime_t base_raw; + ktime_t base_real; + s64 nsec_raw; + s64 nsec_real; + cycle_t now; -#ifdef CONFIG_NTP_PPS + WARN_ON_ONCE(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk->tkr_mono.read(tk->tkr_mono.clock); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; + base_real = 
ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); + +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + rem *= mult; + + do_div(rem, div); + *base = tmp + rem; + return 0; +} /** - * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. 
The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. */ -void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + cycle_t partial_history_cycles, + cycle_t total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; - s64 nsecs_raw, nsecs_real; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; - WARN_ON_ONCE(timekeeping_suspended); + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles/2 ? + true : false; + partial_history_cycles = interp_forward ? 
+ total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @get_time_fn: Callback to get simultaneous device time and + * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval + * @xtstamp: Receives 
simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_time_snapshot *history_begin, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + cycle_t cycles, now, interval_start; + unsigned int clock_was_set_seq = 0; + ktime_t base_real, base_raw; + s64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; + unsigned long seq; + bool do_interp; + int ret; do { seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + cycles = system_counterval.cycles; - *ts_raw = tk->raw_time; - ts_real->tv_sec = tk->xtime_sec; - ts_real->tv_nsec = 0; + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. 
+ */ + now = tk->tkr_mono.read(tk->tkr_mono.clock); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } - nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); - nsecs_real = timekeeping_get_ns(&tk->tkr_mono); + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec64_add_ns(ts_raw, nsecs_raw); - timespec64_add_ns(ts_real, nsecs_real); -} -EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); -#endif /* CONFIG_NTP_PPS */ + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + cycle_t partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * do_gettimeofday - 
Returns the time of day in a timeval diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..3a95f9728778 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -489,11 +489,19 @@ static void *timer_debug_hint(void *addr) return ((struct timer_list *) addr)->function; } +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + /* * fixup_init is called when: * - an active object is initialized */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -501,9 +509,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -516,36 +524,22 @@ static void stub_timer(unsigned long data) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) +static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - if (timer->entry.pprev == NULL && - timer->entry.next == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } - return 0; + setup_timer(timer, stub_timer, 0); + return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -553,7 +547,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) +static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -561,9 +555,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -571,32 +565,23 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) * fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ -static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.next == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - debug_object_init(timer, &timer_debug_descr); - return 0; - } else { - setup_timer(timer, stub_timer, 0); - return 1; - } + setup_timer(timer, stub_timer, 0); + return true; default: - return 0; + return false; } } static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, @@ -1566,6 +1551,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { @@ -1698,10 +1694,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } diff --git a/kernel/torture.c b/kernel/torture.c index 44aa462d033f..fa0bdeee17ac 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -451,6 +451,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. 
*/ return 0; } @@ -602,8 +603,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: refusing %s init: %s running", + pr_alert("torture_init_begin: Refusing %s init: %s running.\n", ttype, torture_type); + pr_alert("torture_init_begin: One torture test at a time!\n"); mutex_unlock(&fullstop_mutex); return false; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e45db6b0d878..fafeaf803bd0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -528,6 +528,32 @@ config MMIOTRACE See Documentation/trace/mmiotrace.txt. If you are not helping to develop drivers, say N. +config TRACING_MAP + bool + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + help + tracing_map is a special-purpose lock-free map for tracing, + separated out as a stand-alone facility in order to allow it + to be shared between multiple tracers. It isn't meant to be + generally used outside of that context, and is normally + selected by tracers that use it. + +config HIST_TRIGGERS + bool "Histogram triggers" + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + select TRACING_MAP + default n + help + Hist triggers allow one or more arbitrary trace event fields + to be aggregated into hash tables and dumped to stdout by + reading a debugfs/tracefs file. They're useful for + gathering quick and dirty (though precise) summaries of + event activity as an initial guide for further investigation + using more advanced tools. + + See Documentation/trace/events.txt. + If in doubt, say N. 
+ config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 9b1044e936a6..979e7bfbde7a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o @@ -53,6 +54,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2aeb6ffc0a1e..9aef8654e90d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1349,6 +1349,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, if (t->action == BLK_TN_MESSAGE) { log_action(iter, long_act ? 
"message" : "m"); blk_log_msg(s, iter->ent); + return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) @@ -1437,12 +1438,12 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); + pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); + pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } @@ -1551,6 +1552,7 @@ static const struct { { BLK_TC_COMPLETE, "complete" }, { BLK_TC_FS, "fs" }, { BLK_TC_PC, "pc" }, + { BLK_TC_NOTIFY, "notify" }, { BLK_TC_AHEAD, "ahead" }, { BLK_TC_META, "meta" }, { BLK_TC_DISCARD, "discard" }, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 326a75e884db..26f603da7e26 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,8 +13,6 @@ #include <linux/ctype.h> #include "trace.h" -static DEFINE_PER_CPU(int, bpf_prog_active); - /** * trace_call_bpf - invoke BPF program * @prog: BPF program @@ -64,17 +62,21 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { void *dst = (void *) (long) r1; - int size = (int) r2; + int ret, size = (int) r2; void *unsafe_ptr = (void *) (long) r3; - return probe_kernel_read(dst, unsafe_ptr, size); + ret = probe_kernel_read(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + + return ret; } static const struct bpf_func_proto bpf_probe_read_proto = { .func = bpf_probe_read, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, + .arg1_type = ARG_PTR_TO_RAW_STACK, .arg2_type = ARG_CONST_STACK_SIZE, .arg3_type = ARG_ANYTHING, }; @@ -196,7 +198,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) if 
(unlikely(index >= array->map.max_entries)) return -E2BIG; - file = (struct file *)array->ptrs[index]; + file = READ_ONCE(array->ptrs[index]); if (unlikely(!file)) return -ENOENT; @@ -207,6 +209,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) event->pmu->count) return -EINVAL; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; + /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as @@ -223,11 +229,12 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) { struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; @@ -237,10 +244,14 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) .data = data, }; + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + if (index == BPF_F_CURRENT_CPU) + index = raw_smp_processor_id(); if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = (struct file *)array->ptrs[index]; + file = READ_ONCE(array->ptrs[index]); if (unlikely(!file)) return -ENOENT; @@ -270,7 +281,34 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg5_type = ARG_CONST_STACK_SIZE, }; -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); + +static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + + perf_fetch_caller_regs(regs); + + 
return bpf_perf_event_output((long)regs, r2, flags, r4, size); +} + +static const struct bpf_func_proto bpf_event_output_proto = { + .func = bpf_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + +const struct bpf_func_proto *bpf_get_event_output_proto(void) +{ + return &bpf_event_output_proto; +} + +static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -297,15 +335,26 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_smp_processor_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; + default: + return NULL; + } +} + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto; default: - return NULL; + return tracing_func_proto(func_id); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ -static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) { /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) @@ -332,9 +381,83 @@ static struct bpf_prog_type_list kprobe_tl = { .type = BPF_PROG_TYPE_KPROBE, }; +static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size) +{ + /* + * r1 points to perf tracepoint buffer where first 8 bytes are hidden + * from bpf program and contain a pointer to 'struct pt_regs'. 
Fetch it + * from there and call the same bpf_perf_event_output() helper + */ + u64 ctx = *(long *)(uintptr_t)r1; + + return bpf_perf_event_output(ctx, r2, index, r4, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { + .func = bpf_perf_event_output_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; + +static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + u64 ctx = *(long *)(uintptr_t)r1; + + return bpf_get_stackid(ctx, r2, r3, r4, r5); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_tp = { + .func = bpf_get_stackid_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +static const struct bpf_verifier_ops tracepoint_prog_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = tp_prog_is_valid_access, +}; + +static struct bpf_prog_type_list tracepoint_tl = { + .ops = &tracepoint_prog_ops, + .type = BPF_PROG_TYPE_TRACEPOINT, +}; + static int __init register_kprobe_prog_ops(void) { bpf_register_prog_type(&kprobe_tl); + bpf_register_prog_type(&tracepoint_tl); return 0; } late_initcall(register_kprobe_prog_ops); diff --git 
a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57a6eea84694..900dbb1efff2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1030,8 +1030,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) for_each_possible_cpu(cpu) { stat = &per_cpu(ftrace_profile_stats, cpu); - /* allocate enough for function name + cpu number */ - name = kmalloc(32, GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "function%d", cpu); if (!name) { /* * The files created are permanent, if something happens @@ -1043,7 +1042,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) return; } stat->stat = function_stats; - snprintf(name, 32, "function%d", cpu); stat->stat.name = name; ret = register_stat_tracer(&stat->stat); if (ret) { @@ -1058,8 +1056,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create tracefs " - "'function_profile_enabled' entry\n"); + pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -1533,7 +1530,19 @@ static int ftrace_cmp_recs(const void *a, const void *b) return 0; } -static unsigned long ftrace_location_range(unsigned long start, unsigned long end) +/** + * ftrace_location_range - return the first address of a traced location + * if it touches the given ip range + * @start: start of range to search. + * @end: end of range to search (inclusive). @end points to the last byte + * to check. + * + * Returns rec->ip if the related ftrace location is a least partly within + * the given address range. That is, the first address of the instruction + * that is either a NOP or call to the function tracer. It checks the ftrace + * internal tables to determine if the address belongs or not. 
+ */ +unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -1610,7 +1619,7 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } -static void __ftrace_hash_rec_update(struct ftrace_ops *ops, +static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) { @@ -1618,12 +1627,13 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, struct ftrace_hash *other_hash; struct ftrace_page *pg; struct dyn_ftrace *rec; + bool update = false; int count = 0; int all = 0; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return; + return false; /* * In the filter_hash case: @@ -1650,7 +1660,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * then there's nothing to do. */ if (ftrace_hash_empty(hash)) - return; + return false; } do_for_each_ftrace_rec(pg, rec) { @@ -1694,7 +1704,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (inc) { rec->flags++; if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) - return; + return false; /* * If there's only a single callback registered to a @@ -1720,7 +1730,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) - return; + return false; rec->flags--; /* @@ -1753,22 +1763,28 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, */ } count++; + + /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ + update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + /* Shortcut, if we handled all records, we are done. 
*/ if (!all && count == hash->count) - return; + return update; } while_for_each_ftrace_rec(); + + return update; } -static void ftrace_hash_rec_disable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 0); + return __ftrace_hash_rec_update(ops, filter_hash, 0); } -static void ftrace_hash_rec_enable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 1); + return __ftrace_hash_rec_update(ops, filter_hash, 1); } static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, @@ -2314,8 +2330,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_curr(rec); if (FTRACE_WARN_ON(!ops)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } @@ -2644,7 +2660,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; ftrace_start_up++; - command |= FTRACE_UPDATE_CALLS; /* * Note that ftrace probes uses this to start up @@ -2665,7 +2680,8 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; } - ftrace_hash_rec_enable(ops, 1); + if (ftrace_hash_rec_enable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; ftrace_startup_enable(command); @@ -2695,11 +2711,11 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* Disabling ipmodify never fails */ ftrace_hash_ipmodify_disable(ops); - ftrace_hash_rec_disable(ops, 1); - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ftrace_hash_rec_disable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; - command |= FTRACE_UPDATE_CALLS; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; if (saved_ftrace_func != ftrace_trace_function) 
{ saved_ftrace_func = ftrace_trace_function; @@ -3440,11 +3456,23 @@ struct ftrace_glob { int type; }; +/* + * If symbols in an architecture don't correspond exactly to the user-visible + * name of what they represent, it is possible to define this function to + * perform the necessary adjustments. +*/ +char * __weak arch_ftrace_match_adjust(char *str, const char *search) +{ + return str; +} + static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; + str = arch_ftrace_match_adjust(str, g->search); + switch (g->type) { case MATCH_FULL: if (strcmp(str, g->search) == 0) @@ -5709,7 +5737,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) { int i; int ret = 0; - unsigned long flags; int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; struct task_struct *g, *t; @@ -5725,7 +5752,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } } - read_lock_irqsave(&tasklist_lock, flags); + read_lock(&tasklist_lock); do_each_thread(g, t) { if (start == end) { ret = -EAGAIN; @@ -5743,7 +5770,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } while_each_thread(g, t); unlock: - read_unlock_irqrestore(&tasklist_lock, flags); + read_unlock(&tasklist_lock); free: for (i = start; i < end; i++) kfree(ret_stack_list[i]); diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a132ec..0c7dee221dca 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 95181e36891a..9c143739b8d7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -437,7 +437,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ 
arch_spinlock_t lock; struct lock_class_key lock_key; - unsigned int nr_pages; + unsigned long nr_pages; unsigned int current_context; struct list_head *pages; struct buffer_page *head_page; /* read from head */ @@ -458,7 +458,7 @@ struct ring_buffer_per_cpu { u64 write_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ - int nr_pages_to_update; + long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; @@ -1128,10 +1128,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) +static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) { - int i; struct buffer_page *bpage, *tmp; + long i; for (i = 0; i < nr_pages; i++) { struct page *page; @@ -1168,7 +1168,7 @@ free_pages: } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, - unsigned nr_pages) + unsigned long nr_pages) { LIST_HEAD(pages); @@ -1193,7 +1193,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, } static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; @@ -1293,8 +1293,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { struct ring_buffer *buffer; + long nr_pages; int bsize; - int cpu, nr_pages; + int cpu; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1420,12 +1421,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage) } static int -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { 
struct list_head *tail_page, *to_remove, *next_page; struct buffer_page *to_remove_page, *tmp_iter_page; struct buffer_page *last_page, *first_page; - unsigned int nr_removed; + unsigned long nr_removed; unsigned long head_bit; int page_entries; @@ -1642,7 +1643,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned nr_pages; + unsigned long nr_pages; int cpu, err = 0; /* @@ -1656,14 +1657,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return size; - size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - size *= BUF_PAGE_SIZE; + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); /* we need a minimum of two pages */ - if (size < BUF_PAGE_SIZE * 2) - size = BUF_PAGE_SIZE * 2; + if (nr_pages < 2) + nr_pages = 2; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size = nr_pages * BUF_PAGE_SIZE; /* * Don't succeed if resizing is disabled, as a reader might be @@ -4640,8 +4640,9 @@ static int rb_cpu_notify(struct notifier_block *self, struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); long cpu = (long)hcpu; - int cpu_i, nr_pages_same; - unsigned int nr_pages; + long nr_pages_same; + int cpu_i; + unsigned long nr_pages; switch (action) { case CPU_UP_PREPARE: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9293402ee68..8a4bd6b68a0b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -74,11 +74,6 @@ static struct tracer_opt dummy_tracer_opt[] = { { } }; -static struct tracer_flags dummy_tracer_flags = { - .val = 0, - .opts = dummy_tracer_opt -}; - static int dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -258,6 +253,9 @@ unsigned long long ns2usecs(cycle_t nsec) #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) +/* trace_flags that are default zero for instances */ +#define ZEROED_TRACE_FLAGS 
\ + TRACE_ITER_EVENT_FORK /* * The global_trace is the descriptor that holds the tracing @@ -308,33 +306,18 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_check_discard(struct trace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, rec)) { - ring_buffer_discard_commit(buffer, event); - return 1; - } - - return 0; -} -EXPORT_SYMBOL_GPL(filter_check_discard); - int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && !filter_match_preds(call->filter, rec)) { - ring_buffer_discard_commit(buffer, event); + __trace_event_discard_commit(buffer, event); return 1; } return 0; } -EXPORT_SYMBOL_GPL(call_filter_check_discard); static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { @@ -1258,12 +1241,22 @@ int __init register_tracer(struct tracer *type) if (!type->set_flag) type->set_flag = &dummy_set_flag; - if (!type->flags) - type->flags = &dummy_tracer_flags; - else + if (!type->flags) { + /*allocate a dummy tracer_flags*/ + type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); + if (!type->flags) { + ret = -ENOMEM; + goto out; + } + type->flags->val = 0; + type->flags->opts = dummy_tracer_opt; + } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; + /* store the tracer for __set_tracer_option */ + type->flags->trace = type; + ret = run_tracer_selftest(type); if (ret < 0) goto out; @@ -1659,6 +1652,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #else TRACE_FLAG_IRQS_NOSUPPORT | #endif + ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0) | @@ -1666,6 +1660,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned long flags, int pc) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; +} + struct ring_buffer_event * trace_buffer_lock_reserve(struct ring_buffer *buffer, int type, @@ -1675,34 +1679,137 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) { - struct trace_entry *ent = ring_buffer_event_data(event); + if (event != NULL) + trace_event_setup(event, type, flags, pc); + + return event; +} + +DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DEFINE_PER_CPU(int, trace_buffered_event_cnt); +static int trace_buffered_event_ref; + +/** + * trace_buffered_event_enable - enable buffering events + * + * When events are being filtered, it is quicker to use a temporary + * buffer to write the event data into if there's a likely chance + * that it will not be committed. The discard of the ring buffer + * is not as fast as committing, and is much slower than copying + * a commit. + * + * When an event is to be filtered, allocate per cpu buffers to + * write the event data into, and if the event is filtered and discarded + * it is simply dropped, otherwise, the entire data is to be committed + * in one shot. 
+ */ +void trace_buffered_event_enable(void) +{ + struct ring_buffer_event *event; + struct page *page; + int cpu; + + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; + if (trace_buffered_event_ref++) + return; + + for_each_tracing_cpu(cpu) { + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + if (!page) + goto failed; + + event = page_address(page); + memset(event, 0, sizeof(*event)); + + per_cpu(trace_buffered_event, cpu) = event; + + preempt_disable(); + if (cpu == smp_processor_id() && + this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + preempt_enable(); } - return event; + return; + failed: + trace_buffered_event_disable(); } -void -__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +static void enable_trace_buffered_event(void *data) { - __this_cpu_write(trace_cmdline_save, true); - ring_buffer_unlock_commit(buffer, event); + /* Probably not needed, but do it anyway */ + smp_rmb(); + this_cpu_dec(trace_buffered_event_cnt); } -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) +static void disable_trace_buffered_event(void *data) { - __buffer_unlock_commit(buffer, event); + this_cpu_inc(trace_buffered_event_cnt); +} - ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); - ftrace_trace_userstack(buffer, flags, pc); +/** + * trace_buffered_event_disable - disable buffering events + * + * When a filter is removed, it is faster to not use the buffered + * events, and to commit directly into the ring buffer. Free up + * the temp buffers when there are no more users. This requires + * special synchronization with current events. 
+ */ +void trace_buffered_event_disable(void) +{ + int cpu; + + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (WARN_ON_ONCE(!trace_buffered_event_ref)) + return; + + if (--trace_buffered_event_ref) + return; + + preempt_disable(); + /* For each CPU, set the buffer as used. */ + smp_call_function_many(tracing_buffer_mask, + disable_trace_buffered_event, NULL, 1); + preempt_enable(); + + /* Wait for all current users to finish */ + synchronize_sched(); + + for_each_tracing_cpu(cpu) { + free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); + per_cpu(trace_buffered_event, cpu) = NULL; + } + /* + * Make sure trace_buffered_event is NULL before clearing + * trace_buffered_event_cnt. + */ + smp_wmb(); + + preempt_disable(); + /* Do the work on each cpu */ + smp_call_function_many(tracing_buffer_mask, + enable_trace_buffered_event, NULL, 1); + preempt_enable(); +} + +void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + } else + ring_buffer_unlock_commit(buffer, event); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); static struct ring_buffer *temp_buffer; @@ -1713,8 +1820,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, unsigned long flags, int pc) { struct ring_buffer_event *entry; + int val; *current_rb = trace_file->tr->trace_buffer.buffer; + + if ((trace_file->flags & + (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && + (entry = this_cpu_read(trace_buffered_event))) { + /* Try to use the per cpu buffer first */ + val = this_cpu_inc_return(trace_buffered_event_cnt); + if (val == 1) { + trace_event_setup(entry, type, flags, pc); + entry->array[0] = 
len; + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); + } + entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); /* @@ -1732,17 +1854,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, - int type, unsigned long len, - unsigned long flags, int pc) -{ - *current_rb = global_trace.trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -1754,14 +1865,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); - -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - ring_buffer_discard_commit(buffer, event); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void trace_function(struct trace_array *tr, @@ -2071,20 +2174,20 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n"); - pr_warning("**********************************************************\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("** **\n"); - pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warning("** **\n"); - pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for production use. **\n"); - pr_warning("** **\n"); - pr_warning("** If you see this message and you are not debugging **\n"); - pr_warning("** the kernel, report this immediately to your vendor! 
**\n"); - pr_warning("** **\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("**********************************************************\n"); + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -3505,7 +3608,7 @@ static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { - struct tracer *trace = tr->current_trace; + struct tracer *trace = tracer_flags->trace; int ret; ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); @@ -3565,6 +3668,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + if (mask == TRACE_ITER_EVENT_FORK) + trace_event_follow_fork(tr, enabled); + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE @@ -3652,7 +3758,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -3798,12 +3904,19 @@ static const char readme_msg[] = "\t trigger: traceon, 
traceoff\n" "\t enable_event:<system>:<event>\n" "\t disable_event:<system>:<event>\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t enable_hist:<system>:<event>\n" + "\t disable_hist:<system>:<event>\n" +#endif #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" #endif +#ifdef CONFIG_HIST_TRIGGERS + "\t\t hist (see below)\n" +#endif "\t example: echo traceoff > events/block/block_unplug/trigger\n" "\t echo traceoff:3 > events/block/block_unplug/trigger\n" "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" @@ -3819,6 +3932,56 @@ static const char readme_msg[] = "\t To remove a trigger with a count:\n" "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" "\t Filters can be ignored when removing a trigger.\n" +#ifdef CONFIG_HIST_TRIGGERS + " hist trigger\t- If set, event hits are aggregated into a hash table\n" + "\t Format: hist:keys=<field1[,field2,...]>\n" + "\t [:values=<field1[,field2,...]>]\n" + "\t [:sort=<field1[,field2,...]>]\n" + "\t [:size=#entries]\n" + "\t [:pause][:continue][:clear]\n" + "\t [:name=histname1]\n" + "\t [if <filter>]\n\n" + "\t When a matching event is hit, an entry is added to a hash\n" + "\t table using the key(s) and value(s) named, and the value of a\n" + "\t sum called 'hitcount' is incremented. Keys and values\n" + "\t correspond to fields in the event's format description. Keys\n" + "\t can be any field, or the special string 'stacktrace'.\n" + "\t Compound keys consisting of up to two fields can be specified\n" + "\t by the 'keys' keyword. Values must correspond to numeric\n" + "\t fields. Sort keys consisting of up to two fields can be\n" + "\t specified using the 'sort' keyword. The sort direction can\n" + "\t be modified by appending '.descending' or '.ascending' to a\n" + "\t sort field. 
The 'size' parameter can be used to specify more\n" + "\t or fewer than the default 2048 entries for the hashtable size.\n" + "\t If a hist trigger is given a name using the 'name' parameter,\n" + "\t its histogram data will be shared with other triggers of the\n" + "\t same name, and trigger hits will update this common data.\n\n" + "\t Reading the 'hist' file for the event will dump the hash\n" + "\t table in its entirety to stdout. If there are multiple hist\n" + "\t triggers attached to an event, there will be a table for each\n" + "\t trigger in the output. The table displayed for a named\n" + "\t trigger will be the same as any other instance having the\n" + "\t same name. The default format used to display a given field\n" + "\t can be modified by appending any of the following modifiers\n" + "\t to the field name, as applicable:\n\n" + "\t .hex display a number as a hex value\n" + "\t .sym display an address as a symbol\n" + "\t .sym-offset display an address as a symbol and offset\n" + "\t .execname display a common_pid as a program name\n" + "\t .syscall display a syscall id as a syscall name\n\n" + "\t .log2 display log2 value rather than raw number\n\n" + "\t The 'pause' parameter can be used to pause an existing hist\n" + "\t trigger or to start a hist trigger but not log any events\n" + "\t until told to do so. 'continue' can be used to start or\n" + "\t restart a paused hist trigger.\n\n" + "\t The 'clear' parameter will clear the contents of a running\n" + "\t hist trigger and leave its current paused/active state\n" + "\t unchanged.\n\n" + "\t The enable_hist and disable_hist triggers can be used to\n" + "\t have one event conditionally start and stop another event's\n" + "\t already-attached hist trigger. 
The syntax is analagous to\n" + "\t the enable_event and disable_event triggers.\n" +#endif ; static ssize_t @@ -4101,7 +4264,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warning("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace enum mapping\n"); return; } @@ -4468,7 +4631,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (cnt > MAX_TRACER_SIZE) cnt = MAX_TRACER_SIZE; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -4949,7 +5112,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - ret = splice_to_pipe(pipe, &spd); + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; out: splice_shrink_spd(&spd); return ret; @@ -5255,7 +5421,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -6131,7 +6297,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6318,7 +6484,7 @@ struct dentry *trace_create_file(const char *name, ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create tracefs '%s' entry\n", name); + pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6337,7 +6503,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create tracefs directory 'options'\n"); + pr_warn("Could not 
create tracefs directory 'options'\n"); return NULL; } @@ -6391,11 +6557,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) return; for (i = 0; i < tr->nr_topts; i++) { - /* - * Check if these flags have already been added. - * Some tracers share flags. - */ - if (tr->topts[i].tracer->flags == tracer->flags) + /* Make sure there's no duplicate flags. */ + if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) return; } @@ -6644,7 +6807,7 @@ static int instance_mkdir(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; - tr->trace_flags = global_trace.trace_flags; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -6718,6 +6881,12 @@ static int instance_rmdir(const char *name) list_del(&tr->list); + /* Disable all the flags that were enabled coming in */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { + if ((1 << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1 << i, 0); + } + tracing_set_nop(tr); event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); @@ -7248,8 +7417,8 @@ __init static int tracer_alloc_buffers(void) if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) - pr_warning("Trace clock %s not defined, going back to default\n", - trace_boot_clock); + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8414fa40bf27..5167c366d6b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -125,6 +125,7 @@ enum trace_flag_type { TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, }; #define TRACE_BUF_SIZE 1024 @@ -176,9 +177,8 @@ struct trace_options { }; struct trace_pid_list { - unsigned int nr_pids; - int order; - pid_t *pids; + int pid_max; + unsigned long *pids; }; /* @@ -345,6 +345,7 @@ struct 
tracer_opt { struct tracer_flags { u32 val; struct tracer_opt *opts; + struct tracer *trace; }; /* Makes more easy to define a tracer opt */ @@ -654,6 +655,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; @@ -965,6 +967,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ + C(EVENT_FORK, "event-fork"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ @@ -1062,6 +1065,137 @@ struct trace_subsystem_dir { int nr_events; }; +extern int call_filter_check_discard(struct trace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); + +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); + +static inline void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); +} + +DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DECLARE_PER_CPU(int, trace_buffered_event_cnt); +void trace_buffered_event_disable(void); +void trace_buffered_event_enable(void); + +static inline void +__trace_event_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (this_cpu_read(trace_buffered_event) == event) { + /* Simply release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + return; + } + ring_buffer_discard_commit(buffer, event); +} + +/* + * Helper function for event_trigger_unlock_commit{_regs}(). 
+ * If there are event triggers attached to this event that require + * filtering against its fields, then they will be called as the + * entry already holds the field information of the current event. + * + * It also checks if the event should be discarded or not. + * It is to be discarded if the event is soft disabled and the + * event was only recorded to process triggers, or if the event + * filter is active and this event did not match the filters. + * + * Returns true if the event is discarded, false otherwise. + */ +static inline bool +__event_trigger_test_discard(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, + enum event_trigger_type *tt) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_COND) + *tt = event_triggers_call(file, entry); + + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || + (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && + !filter_match_preds(file->filter, entry))) { + __trace_event_discard_commit(buffer, event); + return true; + } + + return false; +} + +/** + * event_trigger_unlock_commit - handle triggers and finish event commit + * @file: The file pointer associated with the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded.
+ */ +static inline void +event_trigger_unlock_commit(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + +/** + * event_trigger_unlock_commit_regs - handle triggers and finish event commit + * @file: The file pointer associated with the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + * + * Same as event_trigger_unlock_commit() but calls + * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
+ */ +static inline void +event_trigger_unlock_commit_regs(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc, + struct pt_regs *regs) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(file->tr, buffer, event, + irq_flags, pc, regs); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) @@ -1111,6 +1245,18 @@ struct filter_pred { unsigned short right; }; +static inline bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; +} + +static inline bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct trace_event_file *file, @@ -1147,6 +1293,15 @@ extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; +extern const struct file_operations event_hist_fops; + +#ifdef CONFIG_HIST_TRIGGERS +extern int register_trigger_hist_cmd(void); +extern int register_trigger_hist_enable_disable_cmds(void); +#else +static inline int register_trigger_hist_cmd(void) { return 0; } +static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } +#endif extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); @@ -1159,9 +1314,67 @@ struct event_trigger_data { struct event_filter __rcu *filter; char *filter_str; void *private_data; + bool paused; + bool paused_tmp; struct list_head list; + char *name; + struct list_head 
named_list; + struct event_trigger_data *named_data; +}; + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" +#define ENABLE_HIST_STR "enable_hist" +#define DISABLE_HIST_STR "disable_hist" + +struct enable_trigger_data { + struct trace_event_file *file; + bool enable; + bool hist; }; +extern int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); +extern int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file); +extern void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); +extern void trigger_data_free(struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable); +extern void update_cond_flag(struct trace_event_file *file); +extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); +extern int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file); +extern struct event_trigger_data *find_named_trigger(const char *name); +extern bool is_named_trigger(struct event_trigger_data *test); +extern int save_named_trigger(const char *name, + struct event_trigger_data *data); +extern void del_named_trigger(struct event_trigger_data *data); +extern void pause_named_trigger(struct event_trigger_data *data); +extern void 
unpause_named_trigger(struct event_trigger_data *data); +extern void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data); +extern int register_event_command(struct event_command *cmd); +extern int unregister_event_command(struct event_command *cmd); +extern int register_trigger_hist_enable_disable_cmds(void); + /** * struct event_trigger_ops - callbacks for trace event triggers * @@ -1174,7 +1387,8 @@ struct event_trigger_data { * @func: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that - * registered the trigger (see struct event_command). + * registered the trigger (see struct event_command) along with + * the trace record, rec. * * @init: An optional initialization function called for the trigger * when the trigger is registered (via the event_command reg() @@ -1199,7 +1413,8 @@ struct event_trigger_data { * (see trace_event_triggers.c). */ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data); + void (*func)(struct event_trigger_data *data, + void *rec); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, @@ -1243,27 +1458,10 @@ struct event_trigger_ops { * values are defined by adding new values to the trigger_type * enum in include/linux/trace_events.h. * - * @post_trigger: A flag that says whether or not this command needs - * to have its action delayed until after the current event has - * been closed. Some triggers need to avoid being invoked while - * an event is currently in the process of being logged, since - * the trigger may itself log data into the trace buffer. Thus - * we make sure the current event is committed before invoking - * those triggers. 
To do that, the trigger invocation is split - * in two - the first part checks the filter using the current - * trace record; if a command has the @post_trigger flag set, it - * sets a bit for itself in the return value, otherwise it - * directly invokes the trigger. Once all commands have been - * either invoked or set their return flag, the current record is - * either committed or discarded. At that point, if any commands - * have deferred their triggers, those commands are finally - * invoked following the close of the current event. In other - * words, if the event_trigger_ops @func() probe implementation - * itself logs to the trace buffer, this flag should be set, - * otherwise it can be left unspecified. + * @flags: See the enum event_command_flags below. * - * All the methods below, except for @set_filter(), must be - * implemented. + * All the methods below, except for @set_filter() and @unreg_all(), + * must be implemented. * * @func: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the @@ -1288,6 +1486,10 @@ struct event_trigger_ops { * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * + * @unreg_all: An optional function called to remove all the triggers + * from the list of triggers associated with the event. Called + * when a trigger file is opened in truncate mode. + * * @set_filter: An optional function called to parse and set a filter * for the trigger. 
If no @set_filter() method is set for the * event command, filters set by the user for the command will be @@ -1301,7 +1503,7 @@ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; - bool post_trigger; + int flags; int (*func)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *params); @@ -1313,12 +1515,56 @@ struct event_command { struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); + void (*unreg_all)(struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; +/** + * enum event_command_flags - flags for struct event_command + * + * @POST_TRIGGER: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. 
+ * + * @NEEDS_REC: A flag that says whether or not this command needs + * access to the trace record in order to perform its function, + * regardless of whether or not it has a filter associated with + * it (filters make a trigger require access to the trace record + * but are not always present). + */ +enum event_command_flags { + EVENT_CMD_FL_POST_TRIGGER = 1, + EVENT_CMD_FL_NEEDS_REC = 2, +}; + +static inline bool event_command_post_trigger(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER; +} + +static inline bool event_command_needs_rec(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC; +} + extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); @@ -1365,8 +1611,13 @@ int perf_ftrace_event_register(struct trace_event_call *call, #ifdef CONFIG_FTRACE_SYSCALLS void init_ftrace_syscalls(void); +const char *get_syscall_name(int syscall); #else static inline void init_ftrace_syscalls(void) { } +static inline const char *get_syscall_name(int syscall) +{ + return NULL; +} #endif #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 00df25fd86ef..562fa69df5d3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!is_sampling_event(p_event)) + return 0; + /* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page @@ -260,42 +263,43 @@ void perf_trace_del(struct perf_event *p_event, int flags) tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } -void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs **regs, int *rctxp) +void *perf_trace_buf_alloc(int size, struct pt_regs 
**regs, int *rctxp) { - struct trace_entry *entry; - unsigned long flags; char *raw_data; - int pc; + int rctx; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) + "perf buffer not large enough")) return NULL; - pc = preempt_count(); - - *rctxp = perf_swevent_get_recursion_context(); - if (*rctxp < 0) + *rctxp = rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) return NULL; if (regs) - *regs = this_cpu_ptr(&__perf_regs[*rctxp]); - raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + *regs = this_cpu_ptr(&__perf_regs[rctx]); + raw_data = this_cpu_ptr(perf_trace_buf[rctx]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); +NOKPROBE_SYMBOL(perf_trace_buf_alloc); + +void perf_trace_buf_update(void *record, u16 type) +{ + struct trace_entry *entry = record; + int pc = preempt_count(); + unsigned long flags; - entry = (struct trace_entry *)raw_data; local_save_flags(flags); tracing_generic_entry_update(entry, flags, pc); entry->type = type; - - return raw_data; } -EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); -NOKPROBE_SYMBOL(perf_trace_buf_prepare); +NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void @@ -316,15 +320,16 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + memset(&regs, 0, sizeof(regs)); perf_fetch_caller_regs(&regs); - entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) return; entry->ip = ip; entry->parent_ip = parent_ip; - perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, &regs, head, NULL); #undef ENTRY_SIZE diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index
05ddc0820771..3d4155892a1e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,7 @@ #include <linux/kthread.h> #include <linux/tracefs.h> #include <linux/uaccess.h> -#include <linux/bsearch.h> +#include <linux/vmalloc.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/sort.h> @@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call) } } +/* + * run-time version of trace_event_get_offsets_<call>() that returns the last + * accessible offset of trace fields excluding __dynamic_array bytes + */ +int trace_event_get_offsets(struct trace_event_call *call) +{ + struct ftrace_event_field *tail; + struct list_head *head; + + head = trace_get_fields(call); + /* + * head->next points to the last field with the largest offset, + * since it was added last by trace_define_field() + */ + tail = list_first_entry(head, struct ftrace_event_field, link); + return tail->offset + tail->size; +} + int trace_event_raw_init(struct trace_event_call *call) { int id; @@ -363,6 +381,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + unsigned long file_flags = file->flags; int ret = 0; int disable; @@ -445,6 +464,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; } + /* Enable or disable use of trace_buffered_event */ + if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != + (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + trace_buffered_event_enable(); + else + trace_buffered_event_disable(); + } + return ret; } @@ -471,24 +499,26 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -static int cmp_pid(const void *key, const void *elt) +/* Shouldn't this be in a header? 
*/ +extern int pid_max; + +/* Returns true if found in filter */ +static bool +find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { - const pid_t *search_pid = key; - const pid_t *pid = elt; + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (search_pid >= filtered_pids->pid_max) + return false; - if (*search_pid == *pid) - return 0; - if (*search_pid < *pid) - return -1; - return 1; + return test_bit(search_pid, filtered_pids->pids); } static bool -check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) +ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { - pid_t search_pid; - pid_t *pid; - /* * Return false, because if filtered_pids does not exist, * all pids are good to trace. @@ -496,15 +526,68 @@ check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - search_pid = task->pid; + return !find_filtered_pid(filtered_pids, task->pid); +} + +static void filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!find_filtered_pid(pid_list, self->pid)) + return; + } - pid = bsearch(&search_pid, filtered_pids->pids, - filtered_pids->nr_pids, sizeof(pid_t), - cmp_pid); - if (!pid) - return true; + /* Sorry, but we don't support pid_max changing after setting */ + if (task->pid >= pid_list->pid_max) + return; - return false; + /* "self" is set for forks, and NULL for exits */ + if (self) + set_bit(task->pid, pid_list->pids); + else + clear_bit(task->pid, pid_list->pids); +} + +static void +event_filter_pid_sched_process_exit(void *data, struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = 
rcu_dereference_sched(tr->filtered_pids); + filter_add_remove_task(pid_list, NULL, task); +} + +static void +event_filter_pid_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + filter_add_remove_task(pid_list, self, task); +} + +void trace_event_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, + tr, INT_MIN); + register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit, + tr, INT_MAX); + } else { + unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, + tr); + unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit, + tr); + } } static void @@ -517,8 +600,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, prev) && - check_ignore_pid(pid_list, next)); + ignore_this_task(pid_list, prev) && + ignore_this_task(pid_list, next)); } static void @@ -531,7 +614,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, next)); + ignore_this_task(pid_list, next)); } static void @@ -547,7 +630,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, task)); + ignore_this_task(pid_list, task)); } static void @@ -564,7 +647,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) /* Set tracing if current is enabled */ this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, 
current)); + ignore_this_task(pid_list, current)); } static void __ftrace_clear_event_pids(struct trace_array *tr) @@ -602,7 +685,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - free_pages((unsigned long)pid_list->pids, pid_list->order); + vfree(pid_list->pids); kfree(pid_list); } @@ -946,11 +1029,32 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual previous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; + unsigned long pid; + loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -963,10 +1067,18 @@ static void *p_start(struct seq_file *m, loff_t *pos) pid_list = rcu_dereference_sched(tr->filtered_pids); - if (!pid_list || *pos >= pid_list->nr_pids) + if (!pid_list) return NULL; - return (void *)&pid_list->pids[*pos]; + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) + return NULL; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)p_next(m, (void *)pid, &l)) + ; + return (void *)pid; } static void p_stop(struct seq_file *m, void *p) @@ -976,25 +1088,11 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static void * -p_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct trace_array *tr = m->private;
- struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); - - (*pos)++; - - if (*pos >= pid_list->nr_pids) - return NULL; - - return (void *)&pid_list->pids[*pos]; -} - static int p_show(struct seq_file *m, void *v) { - pid_t *pid = v; + unsigned long pid = (unsigned long)v - 1; - seq_printf(m, "%d\n", *pid); + seq_printf(m, "%lu\n", pid); return 0; } @@ -1543,11 +1641,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } -static int max_pids(struct trace_pid_list *pid_list) -{ - return (PAGE_SIZE << pid_list->order) / sizeof(pid_t); -} - static void ignore_task_cpu(void *data) { struct trace_array *tr = data; @@ -1561,7 +1654,7 @@ static void ignore_task_cpu(void *data) mutex_is_locked(&event_mutex)); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, current)); + ignore_this_task(pid_list, current)); } static ssize_t @@ -1571,7 +1664,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; - struct trace_pid_list *pid_list = NULL; + struct trace_pid_list *pid_list; struct trace_event_file *file; struct trace_parser parser; unsigned long val; @@ -1579,7 +1672,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, ssize_t read = 0; ssize_t ret = 0; pid_t pid; - int i; + int nr_pids = 0; if (!cnt) return 0; @@ -1592,10 +1685,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, return -ENOMEM; mutex_lock(&event_mutex); + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + /* - * Load as many pids into the array before doing a - * swap from the tr->filtered_pids to the new list. + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. 
If the operation fails, then the current list is + * not modified. */ + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) { + read = -ENOMEM; + goto out; + } + pid_list->pid_max = READ_ONCE(pid_max); + /* Only truncating will shrink pid_max */ + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) + pid_list->pid_max = filtered_pids->pid_max; + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + read = -ENOMEM; + goto out; + } + if (filtered_pids) { + /* copy the current bits to the new max */ + pid = find_first_bit(filtered_pids->pids, + filtered_pids->pid_max); + while (pid < filtered_pids->pid_max) { + set_bit(pid, pid_list->pids); + pid = find_next_bit(filtered_pids->pids, + filtered_pids->pid_max, + pid + 1); + nr_pids++; + } + } + while (cnt > 0) { this_pos = 0; @@ -1613,92 +1739,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; - if (val > INT_MAX) + if (val >= pid_list->pid_max) break; pid = (pid_t)val; - ret = -ENOMEM; - if (!pid_list) { - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) - break; - - filtered_pids = rcu_dereference_protected(tr->filtered_pids, - lockdep_is_held(&event_mutex)); - if (filtered_pids) - pid_list->order = filtered_pids->order; - else - pid_list->order = 0; - - pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, - pid_list->order); - if (!pid_list->pids) - break; - - if (filtered_pids) { - pid_list->nr_pids = filtered_pids->nr_pids; - memcpy(pid_list->pids, filtered_pids->pids, - pid_list->nr_pids * sizeof(pid_t)); - } else - pid_list->nr_pids = 0; - } - - if (pid_list->nr_pids >= max_pids(pid_list)) { - pid_t *pid_page; - - pid_page = (void *)__get_free_pages(GFP_KERNEL, - pid_list->order + 1); - if (!pid_page) - break; - memcpy(pid_page, pid_list->pids, - pid_list->nr_pids * sizeof(pid_t)); - free_pages((unsigned long)pid_list->pids, 
pid_list->order); - - pid_list->order++; - pid_list->pids = pid_page; - } + set_bit(pid, pid_list->pids); + nr_pids++; - pid_list->pids[pid_list->nr_pids++] = pid; trace_parser_clear(&parser); ret = 0; } trace_parser_put(&parser); if (ret < 0) { - if (pid_list) - free_pages((unsigned long)pid_list->pids, pid_list->order); + vfree(pid_list->pids); kfree(pid_list); - mutex_unlock(&event_mutex); - return ret; - } - - if (!pid_list) { - mutex_unlock(&event_mutex); - return ret; + read = ret; + goto out; } - sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL); - - /* Remove duplicates */ - for (i = 1; i < pid_list->nr_pids; i++) { - int start = i; - - while (i < pid_list->nr_pids && - pid_list->pids[i - 1] == pid_list->pids[i]) - i++; - - if (start != i) { - if (i < pid_list->nr_pids) { - memmove(&pid_list->pids[start], &pid_list->pids[i], - (pid_list->nr_pids - i) * sizeof(pid_t)); - pid_list->nr_pids -= i - start; - i = start; - } else - pid_list->nr_pids = start; - } + if (!nr_pids) { + /* Cleared the list of pids */ + vfree(pid_list->pids); + kfree(pid_list); + read = ret; + if (!filtered_pids) + goto out; + pid_list = NULL; } - rcu_assign_pointer(tr->filtered_pids, pid_list); list_for_each_entry(file, &tr->events, list) { @@ -1708,7 +1777,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_sched(); - free_pages((unsigned long)filtered_pids->pids, filtered_pids->order); + vfree(filtered_pids->pids); kfree(filtered_pids); } else { /* @@ -1745,10 +1814,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, */ on_each_cpu(ignore_task_cpu, tr, 1); + out: mutex_unlock(&event_mutex); ret = read; - *ppos += read; + if (read > 0) + *ppos += read; return ret; } @@ -2095,9 +2166,18 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); - trace_create_file("trigger", 0644, file->dir, file, - 
&event_trigger_fops); + /* + * Only event directories that can be enabled should have + * triggers. + */ + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); +#ifdef CONFIG_HIST_TRIGGERS + trace_create_file("hist", 0444, file->dir, file, + &event_hist_fops); +#endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); @@ -3345,7 +3425,7 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); -static struct trace_array *event_tr; +static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, @@ -3369,17 +3449,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, local_save_flags(flags); - event = trace_current_buffer_lock_reserve(&buffer, - TRACE_FN, sizeof(*entry), - flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, + TRACE_FN, sizeof(*entry), + flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc); - + event_trigger_unlock_commit(&event_trace_file, buffer, event, + entry, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); @@ -3394,9 +3474,11 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { int ret; - event_tr = top_trace_array(); - if (WARN_ON(!event_tr)) + + event_trace_file.tr = top_trace_array(); + if (WARN_ON(!event_trace_file.tr)) return; + ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 6816302542b2..9daa9b3bc6d9 100644 --- 
a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps, static inline struct event_filter *event_filter(struct trace_event_file *file) { - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - return file->event_call->filter; - else - return file->filter; + return file->filter; } /* caller must hold event_mutex */ @@ -826,12 +823,12 @@ static void __free_preds(struct event_filter *filter) static void filter_disable(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; + unsigned long old_flags = file->flags; - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_FILTERED; - else - file->flags &= ~EVENT_FILE_FL_FILTERED; + file->flags &= ~EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_disable(); } static void __free_filter(struct event_filter *filter) @@ -883,13 +880,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) static inline void __remove_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - filter_disable(file); - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - remove_filter_string(call->filter); - else - remove_filter_string(file->filter); + remove_filter_string(file->filter); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -906,15 +898,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, static inline void __free_subsystem_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { - __free_filter(call->filter); - call->filter = NULL; - } else { - __free_filter(file->filter); - file->filter = NULL; - } + __free_filter(file->filter); + file->filter = NULL; } static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, @@ -961,18 +946,6 
@@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_function_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_TRACE_FN; -} - -static bool is_string_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; -} - static bool is_legal_op(struct ftrace_event_field *field, int op) { if (is_string_field(field) && @@ -1730,69 +1703,43 @@ fail: static inline void event_set_filtered_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; + unsigned long old_flags = file->flags; - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_FILTERED; - else - file->flags |= EVENT_FILE_FL_FILTERED; + file->flags |= EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_enable(); } static inline void event_set_filter(struct trace_event_file *file, struct event_filter *filter) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - rcu_assign_pointer(call->filter, filter); - else - rcu_assign_pointer(file->filter, filter); + rcu_assign_pointer(file->filter, filter); } static inline void event_clear_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - RCU_INIT_POINTER(call->filter, NULL); - else - RCU_INIT_POINTER(file->filter, NULL); + RCU_INIT_POINTER(file->filter, NULL); } static inline void event_set_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags |= EVENT_FILE_FL_NO_SET_FILTER; + file->flags |= EVENT_FILE_FL_NO_SET_FILTER; } static inline void event_clear_no_set_filter_flag(struct 
trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; + file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; } static inline bool event_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) return true; - if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && - (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) - return true; - return false; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c new file mode 100644 index 000000000000..0c05b8a99806 --- /dev/null +++ b/kernel/trace/trace_events_hist.c @@ -0,0 +1,1755 @@ +/* + * trace_events_hist - trace event hist triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> + +#include "tracing_map.h" +#include "trace.h" + +struct hist_field; + +typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); + +struct hist_field { + struct ftrace_event_field *field; + unsigned long flags; + hist_field_fn_t fn; + unsigned int size; + unsigned int offset; +}; + +static u64 hist_field_none(struct hist_field *field, void *event) +{ + return 0; +} + +static u64 hist_field_counter(struct hist_field *field, void *event) +{ + return 1; +} + +static u64 hist_field_string(struct hist_field *hist_field, void *event) +{ + char *addr = (char *)(event + hist_field->field->offset); + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_pstring(struct hist_field *hist_field, void *event) +{ + char **addr = (char **)(event + hist_field->field->offset); + + return (u64)(unsigned long)*addr; +} + +static u64 hist_field_log2(struct hist_field *hist_field, void *event) +{ + u64 val = *(u64 *)(event + hist_field->field->offset); + + return (u64) ilog2(roundup_pow_of_two(val)); +} + +#define DEFINE_HIST_FIELD_FN(type) \ +static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ +{ \ + type *addr = (type *)(event + hist_field->field->offset); \ + \ + return (u64)(unsigned long)*addr; \ +} + +DEFINE_HIST_FIELD_FN(s64); +DEFINE_HIST_FIELD_FN(u64); +DEFINE_HIST_FIELD_FN(s32); +DEFINE_HIST_FIELD_FN(u32); +DEFINE_HIST_FIELD_FN(s16); +DEFINE_HIST_FIELD_FN(u16); +DEFINE_HIST_FIELD_FN(s8); +DEFINE_HIST_FIELD_FN(u8); + +#define for_each_hist_field(i, hist_data) \ + 
for ((i) = 0; (i) < (hist_data)->n_fields; (i)++) + +#define for_each_hist_val_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_vals; (i)++) + +#define for_each_hist_key_field(i, hist_data) \ + for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) + +#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP 5 + +#define HITCOUNT_IDX 0 +#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) + +enum hist_field_flags { + HIST_FIELD_FL_HITCOUNT = 1, + HIST_FIELD_FL_KEY = 2, + HIST_FIELD_FL_STRING = 4, + HIST_FIELD_FL_HEX = 8, + HIST_FIELD_FL_SYM = 16, + HIST_FIELD_FL_SYM_OFFSET = 32, + HIST_FIELD_FL_EXECNAME = 64, + HIST_FIELD_FL_SYSCALL = 128, + HIST_FIELD_FL_STACKTRACE = 256, + HIST_FIELD_FL_LOG2 = 512, +}; + +struct hist_trigger_attrs { + char *keys_str; + char *vals_str; + char *sort_key_str; + char *name; + bool pause; + bool cont; + bool clear; + unsigned int map_bits; +}; + +struct hist_trigger_data { + struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_vals; + unsigned int n_keys; + unsigned int n_fields; + unsigned int key_size; + struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; + unsigned int n_sort_keys; + struct trace_event_file *event_file; + struct hist_trigger_attrs *attrs; + struct tracing_map *map; +}; + +static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +{ + hist_field_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = hist_field_s64; + else + fn = hist_field_u64; + break; + case 4: + if (field_is_signed) + fn = hist_field_s32; + else + fn = hist_field_u32; + break; + case 2: + if (field_is_signed) + fn = hist_field_s16; + else + fn = hist_field_u16; + break; + case 1: + if (field_is_signed) + fn = hist_field_s8; + else + fn = hist_field_u8; + break; + } + + return fn; +} + +static int parse_map_size(char *str) +{ + unsigned long size, 
map_bits; + int ret; + + strsep(&str, "="); + if (!str) { + ret = -EINVAL; + goto out; + } + + ret = kstrtoul(str, 0, &size); + if (ret) + goto out; + + map_bits = ilog2(roundup_pow_of_two(size)); + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + ret = -EINVAL; + else + ret = map_bits; + out: + return ret; +} + +static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) +{ + if (!attrs) + return; + + kfree(attrs->name); + kfree(attrs->sort_key_str); + kfree(attrs->keys_str); + kfree(attrs->vals_str); + kfree(attrs); +} + +static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +{ + struct hist_trigger_attrs *attrs; + int ret = 0; + + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return ERR_PTR(-ENOMEM); + + while (trigger_str) { + char *str = strsep(&trigger_str, ":"); + + if ((strncmp(str, "key=", strlen("key=")) == 0) || + (strncmp(str, "keys=", strlen("keys=")) == 0)) + attrs->keys_str = kstrdup(str, GFP_KERNEL); + else if ((strncmp(str, "val=", strlen("val=")) == 0) || + (strncmp(str, "vals=", strlen("vals=")) == 0) || + (strncmp(str, "values=", strlen("values=")) == 0)) + attrs->vals_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "sort=", strlen("sort=")) == 0) + attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "name=", strlen("name=")) == 0) + attrs->name = kstrdup(str, GFP_KERNEL); + else if (strcmp(str, "pause") == 0) + attrs->pause = true; + else if ((strcmp(str, "cont") == 0) || + (strcmp(str, "continue") == 0)) + attrs->cont = true; + else if (strcmp(str, "clear") == 0) + attrs->clear = true; + else if (strncmp(str, "size=", strlen("size=")) == 0) { + int map_bits = parse_map_size(str); + + if (map_bits < 0) { + ret = map_bits; + goto free; + } + attrs->map_bits = map_bits; + } else { + ret = -EINVAL; + goto free; + } + } + + if (!attrs->keys_str) { + ret = -EINVAL; + goto free; + } + + return attrs; + free: + destroy_hist_trigger_attrs(attrs); 
+ + return ERR_PTR(ret); +} + +static inline void save_comm(char *comm, struct task_struct *task) +{ + if (!task->pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(task->pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + + memcpy(comm, task->comm, TASK_COMM_LEN); +} + +static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) +{ + kfree((char *)elt->private_data); +} + +static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) +{ + struct hist_trigger_data *hist_data = elt->map->private_data; + struct hist_field *key_field; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + unsigned int size = TASK_COMM_LEN + 1; + + elt->private_data = kzalloc(size, GFP_KERNEL); + if (!elt->private_data) + return -ENOMEM; + break; + } + } + + return 0; +} + +static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, + struct tracing_map_elt *from) +{ + char *comm_from = from->private_data; + char *comm_to = to->private_data; + + if (comm_from) + memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); +} + +static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) +{ + char *comm = elt->private_data; + + if (comm) + save_comm(comm, current); +} + +static const struct tracing_map_ops hist_trigger_elt_comm_ops = { + .elt_alloc = hist_trigger_elt_comm_alloc, + .elt_copy = hist_trigger_elt_comm_copy, + .elt_free = hist_trigger_elt_comm_free, + .elt_init = hist_trigger_elt_comm_init, +}; + +static void destroy_hist_field(struct hist_field *hist_field) +{ + kfree(hist_field); +} + +static struct hist_field *create_hist_field(struct ftrace_event_field *field, + unsigned long flags) +{ + struct hist_field *hist_field; + + if (field && is_function_field(field)) + return NULL; + + hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!hist_field) + return NULL; + + if (flags & HIST_FIELD_FL_HITCOUNT) { + hist_field->fn = 
hist_field_counter; + goto out; + } + + if (flags & HIST_FIELD_FL_STACKTRACE) { + hist_field->fn = hist_field_none; + goto out; + } + + if (flags & HIST_FIELD_FL_LOG2) { + hist_field->fn = hist_field_log2; + goto out; + } + + if (WARN_ON_ONCE(!field)) + goto out; + + if (is_string_field(field)) { + flags |= HIST_FIELD_FL_STRING; + + if (field->filter_type == FILTER_STATIC_STRING) + hist_field->fn = hist_field_string; + else if (field->filter_type == FILTER_DYN_STRING) + hist_field->fn = hist_field_dynstring; + else + hist_field->fn = hist_field_pstring; + } else { + hist_field->fn = select_value_fn(field->size, + field->is_signed); + if (!hist_field->fn) { + destroy_hist_field(hist_field); + return NULL; + } + } + out: + hist_field->field = field; + hist_field->flags = flags; + + return hist_field; +} + +static void destroy_hist_fields(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { + if (hist_data->fields[i]) { + destroy_hist_field(hist_data->fields[i]); + hist_data->fields[i] = NULL; + } + } +} + +static int create_hitcount_val(struct hist_trigger_data *hist_data) +{ + hist_data->fields[HITCOUNT_IDX] = + create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); + if (!hist_data->fields[HITCOUNT_IDX]) + return -ENOMEM; + + hist_data->n_vals++; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return 0; +} + +static int create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + char *field_name; + int ret = 0; + + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + return -EINVAL; + + field_name = strsep(&field_str, "."); + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); + if (!field) { + ret = 
-EINVAL; + goto out; + } + + hist_data->fields[val_idx] = create_hist_field(field, flags); + if (!hist_data->fields[val_idx]) { + ret = -ENOMEM; + goto out; + } + + ++hist_data->n_vals; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + ret = -EINVAL; + out: + return ret; +} + +static int create_val_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + char *fields_str, *field_str; + unsigned int i, j; + int ret; + + ret = create_hitcount_val(hist_data); + if (ret) + goto out; + + fields_str = hist_data->attrs->vals_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX && + j < TRACING_MAP_VALS_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + if (strcmp(field_str, "hitcount") == 0) + continue; + ret = create_val_field(hist_data, j++, file, field_str); + if (ret) + goto out; + } + if (fields_str && (strcmp(fields_str, "hitcount") != 0)) + ret = -EINVAL; + out: + return ret; +} + +static int create_key_field(struct hist_trigger_data *hist_data, + unsigned int key_idx, + unsigned int key_offset, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + unsigned int key_size; + int ret = 0; + + if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; + + flags |= HIST_FIELD_FL_KEY; + + if (strcmp(field_str, "stacktrace") == 0) { + flags |= HIST_FIELD_FL_STACKTRACE; + key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; + } else { + char *field_name = strsep(&field_str, "."); + + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else if (strcmp(field_str, "sym") == 0) + flags |= HIST_FIELD_FL_SYM; + else if (strcmp(field_str, "sym-offset") == 0) + flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(field_str, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + flags |= 
HIST_FIELD_FL_EXECNAME; + else if (strcmp(field_str, "syscall") == 0) + flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(field_str, "log2") == 0) + flags |= HIST_FIELD_FL_LOG2; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); + if (!field) { + ret = -EINVAL; + goto out; + } + + if (is_string_field(field)) + key_size = MAX_FILTER_STR_VAL; + else + key_size = field->size; + } + + hist_data->fields[key_idx] = create_hist_field(field, flags); + if (!hist_data->fields[key_idx]) { + ret = -ENOMEM; + goto out; + } + + key_size = ALIGN(key_size, sizeof(u64)); + hist_data->fields[key_idx]->size = key_size; + hist_data->fields[key_idx]->offset = key_offset; + hist_data->key_size += key_size; + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { + ret = -EINVAL; + goto out; + } + + hist_data->n_keys++; + + if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) + return -EINVAL; + + ret = key_size; + out: + return ret; +} + +static int create_key_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, key_offset = 0, n_vals = hist_data->n_vals; + char *fields_str, *field_str; + int ret = -EINVAL; + + fields_str = hist_data->attrs->keys_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + ret = create_key_field(hist_data, i, key_offset, + file, field_str); + if (ret < 0) + goto out; + key_offset += ret; + } + if (fields_str) { + ret = -EINVAL; + goto out; + } + ret = 0; + out: + return ret; +} + +static int create_hist_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = create_val_fields(hist_data, file); + if (ret) + goto out; + + ret = create_key_fields(hist_data, file); + if (ret) + goto out; + + hist_data->n_fields = hist_data->n_vals + 
hist_data->n_keys; + out: + return ret; +} + +static int is_descending(const char *str) +{ + if (!str) + return 0; + + if (strcmp(str, "descending") == 0) + return 1; + + if (strcmp(str, "ascending") == 0) + return 0; + + return -EINVAL; +} + +static int create_sort_keys(struct hist_trigger_data *hist_data) +{ + char *fields_str = hist_data->attrs->sort_key_str; + struct ftrace_event_field *field = NULL; + struct tracing_map_sort_key *sort_key; + int descending, ret = 0; + unsigned int i, j; + + hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ + + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { + char *field_str, *field_name; + + sort_key = &hist_data->sort_keys[i]; + + field_str = strsep(&fields_str, ","); + if (!field_str) { + if (i == 0) + ret = -EINVAL; + break; + } + + if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) { + ret = -EINVAL; + break; + } + + field_name = strsep(&field_str, "."); + if (!field_name) { + ret = -EINVAL; + break; + } + + if (strcmp(field_name, "hitcount") == 0) { + descending = is_descending(field_str); + if (descending < 0) { + ret = descending; + break; + } + sort_key->descending = descending; + continue; + } + + for (j = 1; j < hist_data->n_fields; j++) { + field = hist_data->fields[j]->field; + if (field && (strcmp(field_name, field->name) == 0)) { + sort_key->field_idx = j; + descending = is_descending(field_str); + if (descending < 0) { + ret = descending; + goto out; + } + sort_key->descending = descending; + break; + } + } + if (j == hist_data->n_fields) { + ret = -EINVAL; + break; + } + } + hist_data->n_sort_keys = i; + out: + return ret; +} + +static void destroy_hist_data(struct hist_trigger_data *hist_data) +{ + destroy_hist_trigger_attrs(hist_data->attrs); + destroy_hist_fields(hist_data); + tracing_map_destroy(hist_data->map); + kfree(hist_data); +} + +static int 
create_tracing_map_fields(struct hist_trigger_data *hist_data) +{ + struct tracing_map *map = hist_data->map; + struct ftrace_event_field *field; + struct hist_field *hist_field; + int i, idx; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_KEY) { + tracing_map_cmp_fn_t cmp_fn; + + field = hist_field->field; + + if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + cmp_fn = tracing_map_cmp_none; + else if (is_string_field(field)) + cmp_fn = tracing_map_cmp_string; + else + cmp_fn = tracing_map_cmp_num(field->size, + field->is_signed); + idx = tracing_map_add_key_field(map, + hist_field->offset, + cmp_fn); + + } else + idx = tracing_map_add_sum_field(map); + + if (idx < 0) + return idx; + } + + return 0; +} + +static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) +{ + struct hist_field *key_field; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_EXECNAME) + return true; + } + + return false; +} + +static struct hist_trigger_data * +create_hist_data(unsigned int map_bits, + struct hist_trigger_attrs *attrs, + struct trace_event_file *file) +{ + const struct tracing_map_ops *map_ops = NULL; + struct hist_trigger_data *hist_data; + int ret = 0; + + hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL); + if (!hist_data) + return ERR_PTR(-ENOMEM); + + hist_data->attrs = attrs; + + ret = create_hist_fields(hist_data, file); + if (ret) + goto free; + + ret = create_sort_keys(hist_data); + if (ret) + goto free; + + if (need_tracing_map_ops(hist_data)) + map_ops = &hist_trigger_elt_comm_ops; + + hist_data->map = tracing_map_create(map_bits, hist_data->key_size, + map_ops, hist_data); + if (IS_ERR(hist_data->map)) { + ret = PTR_ERR(hist_data->map); + hist_data->map = NULL; + goto free; + } + + ret = create_tracing_map_fields(hist_data); + if (ret) + goto free; + + ret = tracing_map_init(hist_data->map); + 
if (ret) + goto free; + + hist_data->event_file = file; + out: + return hist_data; + free: + hist_data->attrs = NULL; + + destroy_hist_data(hist_data); + + hist_data = ERR_PTR(ret); + + goto out; +} + +static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + void *rec) +{ + struct hist_field *hist_field; + unsigned int i; + u64 hist_val; + + for_each_hist_val_field(i, hist_data) { + hist_field = hist_data->fields[i]; + hist_val = hist_field->fn(hist_field, rec); + tracing_map_update_sum(elt, i, hist_val); + } +} + +static inline void add_to_key(char *compound_key, void *key, + struct hist_field *key_field, void *rec) +{ + size_t size = key_field->size; + + if (key_field->flags & HIST_FIELD_FL_STRING) { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_PTR_STRING) + size = strlen(key); + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + + /* ensure NULL-termination */ + if (size > key_field->size - 1) + size = key_field->size - 1; + } + + memcpy(compound_key + key_field->offset, key, size); +} + +static void event_hist_trigger(struct event_trigger_data *data, void *rec) +{ + struct hist_trigger_data *hist_data = data->private_data; + bool use_compound_key = (hist_data->n_keys > 1); + unsigned long entries[HIST_STACKTRACE_DEPTH]; + char compound_key[HIST_KEY_SIZE_MAX]; + struct stack_trace stacktrace; + struct hist_field *key_field; + struct tracing_map_elt *elt; + u64 field_contents; + void *key = NULL; + unsigned int i; + + memset(compound_key, 0, hist_data->key_size); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + stacktrace.max_entries = HIST_STACKTRACE_DEPTH; + stacktrace.entries = entries; + stacktrace.nr_entries = 0; + stacktrace.skip = 
HIST_STACKTRACE_SKIP; + + memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); + save_stack_trace(&stacktrace); + + key = entries; + } else { + field_contents = key_field->fn(key_field, rec); + if (key_field->flags & HIST_FIELD_FL_STRING) { + key = (void *)(unsigned long)field_contents; + use_compound_key = true; + } else + key = (void *)&field_contents; + } + + if (use_compound_key) + add_to_key(compound_key, key, key_field, rec); + } + + if (use_compound_key) + key = compound_key; + + elt = tracing_map_insert(hist_data->map, key); + if (elt) + hist_trigger_elt_update(hist_data, elt, rec); +} + +static void hist_trigger_stacktrace_print(struct seq_file *m, + unsigned long *stacktrace_entries, + unsigned int max_entries) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned int spaces = 8; + unsigned int i; + + for (i = 0; i < max_entries; i++) { + if (stacktrace_entries[i] == ULONG_MAX) + return; + + seq_printf(m, "%*c", 1 + spaces, ' '); + sprint_symbol(str, stacktrace_entries[i]); + seq_printf(m, "%s\n", str); + } +} + +static void +hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, void *key, + struct tracing_map_elt *elt) +{ + struct hist_field *key_field; + char str[KSYM_SYMBOL_LEN]; + bool multiline = false; + unsigned int i; + u64 uval; + + seq_puts(m, "{ "); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ", "); + + if (key_field->flags & HIST_FIELD_FL_HEX) { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %llx", + key_field->field->name, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYM) { + uval = *(u64 *)(key + key_field->offset); + sprint_symbol_no_offset(str, uval); + seq_printf(m, "%s: [%llx] %-45s", + key_field->field->name, uval, str); + } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { + uval = *(u64 *)(key + key_field->offset); + sprint_symbol(str, uval); + seq_printf(m, "%s: [%llx] %-55s", + key_field->field->name, 
uval, str); + } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + char *comm = elt->private_data; + + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %-16s[%10llu]", + key_field->field->name, comm, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) { + const char *syscall_name; + + uval = *(u64 *)(key + key_field->offset); + syscall_name = get_syscall_name(uval); + if (!syscall_name) + syscall_name = "unknown_syscall"; + + seq_printf(m, "%s: %-30s[%3llu]", + key_field->field->name, syscall_name, uval); + } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + seq_puts(m, "stacktrace:\n"); + hist_trigger_stacktrace_print(m, + key + key_field->offset, + HIST_STACKTRACE_DEPTH); + multiline = true; + } else if (key_field->flags & HIST_FIELD_FL_LOG2) { + seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name, + *(u64 *)(key + key_field->offset)); + } else if (key_field->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, "%s: %-50s", key_field->field->name, + (char *)(key + key_field->offset)); + } else { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %10llu", key_field->field->name, + uval); + } + } + + if (!multiline) + seq_puts(m, " "); + + seq_puts(m, "}"); + + seq_printf(m, " hitcount: %10llu", + tracing_map_read_sum(elt, HITCOUNT_IDX)); + + for (i = 1; i < hist_data->n_vals; i++) { + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } else { + seq_printf(m, " %s: %10llu", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } + } + + seq_puts(m, "\n"); +} + +static int print_entries(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct tracing_map_sort_entry **sort_entries = NULL; + struct tracing_map *map = hist_data->map; + int i, n_entries; + + n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, + hist_data->n_sort_keys, + &sort_entries); + if (n_entries < 0) 
+ return n_entries; + + for (i = 0; i < n_entries; i++) + hist_trigger_entry_print(m, hist_data, + sort_entries[i]->key, + sort_entries[i]->elt); + + tracing_map_destroy_sort_entries(sort_entries, n_entries); + + return n_entries; +} + +static void hist_trigger_show(struct seq_file *m, + struct event_trigger_data *data, int n) +{ + struct hist_trigger_data *hist_data; + int n_entries, ret = 0; + + if (n > 0) + seq_puts(m, "\n\n"); + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "#\n\n"); + + hist_data = data->private_data; + n_entries = print_entries(m, hist_data); + if (n_entries < 0) { + ret = n_entries; + n_entries = 0; + } + + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", + (u64)atomic64_read(&hist_data->map->hits), + n_entries, (u64)atomic64_read(&hist_data->map->drops)); +} + +static int hist_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry_rcu(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_show(m, data, n++); + } + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_open(struct inode *inode, struct file *file) +{ + return single_open(file, hist_show, file); +} + +const struct file_operations event_hist_fops = { + .open = event_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const char *get_hist_field_flags(struct hist_field *hist_field) +{ + const char *flags_str = NULL; + + if (hist_field->flags & HIST_FIELD_FL_HEX) + flags_str = "hex"; + else if (hist_field->flags & HIST_FIELD_FL_SYM) + flags_str = "sym"; + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) + flags_str = 
"sym-offset"; + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) + flags_str = "execname"; + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) + flags_str = "syscall"; + else if (hist_field->flags & HIST_FIELD_FL_LOG2) + flags_str = "log2"; + + return flags_str; +} + +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) +{ + seq_printf(m, "%s", hist_field->field->name); + if (hist_field->flags) { + const char *flags_str = get_hist_field_flags(hist_field); + + if (flags_str) + seq_printf(m, ".%s", flags_str); + } +} + +static int event_hist_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *key_field; + unsigned int i; + + seq_puts(m, "hist:"); + + if (data->name) + seq_printf(m, "%s:", data->name); + + seq_puts(m, "keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ","); + + if (key_field->flags & HIST_FIELD_FL_STACKTRACE) + seq_puts(m, "stacktrace"); + else + hist_field_print(m, key_field); + } + + seq_puts(m, ":vals="); + + for_each_hist_val_field(i, hist_data) { + if (i == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + seq_puts(m, ","); + hist_field_print(m, hist_data->fields[i]); + } + } + + seq_puts(m, ":sort="); + + for (i = 0; i < hist_data->n_sort_keys; i++) { + struct tracing_map_sort_key *sort_key; + + sort_key = &hist_data->sort_keys[i]; + + if (i > 0) + seq_puts(m, ","); + + if (sort_key->field_idx == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + unsigned int idx = sort_key->field_idx; + + if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; + + hist_field_print(m, hist_data->fields[idx]); + } + + if (sort_key->descending) + seq_puts(m, ".descending"); + } + + seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + + if (data->filter_str) + seq_printf(m, " if %s", 
data->filter_str); + + if (data->paused) + seq_puts(m, " [paused]"); + else + seq_puts(m, " [active]"); + + seq_putc(m, '\n'); + + return 0; +} + +static int event_hist_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (!data->ref && hist_data->attrs->name) + save_named_trigger(hist_data->attrs->name, data); + + data->ref++; + + return 0; +} + +static void event_hist_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + if (data->name) + del_named_trigger(data); + trigger_data_free(data); + destroy_hist_data(hist_data); + } +} + +static struct event_trigger_ops event_hist_trigger_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_init, + .free = event_hist_trigger_free, +}; + +static int event_hist_trigger_named_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + + save_named_trigger(data->named_data->name, data); + + event_hist_trigger_init(ops, data->named_data); + + return 0; +} + +static void event_hist_trigger_named_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + event_hist_trigger_free(ops, data->named_data); + + data->ref--; + if (!data->ref) { + del_named_trigger(data); + trigger_data_free(data); + } +} + +static struct event_trigger_ops event_hist_trigger_named_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_named_init, + .free = event_hist_trigger_named_free, +}; + +static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) +{ + return &event_hist_trigger_ops; +} + +static void hist_clear(struct event_trigger_data *data) +{ + struct 
hist_trigger_data *hist_data = data->private_data; + + if (data->name) + pause_named_trigger(data); + + synchronize_sched(); + + tracing_map_clear(hist_data->map); + + if (data->name) + unpause_named_trigger(data); +} + +static bool compatible_field(struct ftrace_event_field *field, + struct ftrace_event_field *test_field) +{ + if (field == test_field) + return true; + if (field == NULL || test_field == NULL) + return false; + if (strcmp(field->name, test_field->name) != 0) + return false; + if (strcmp(field->type, test_field->type) != 0) + return false; + if (field->size != test_field->size) + return false; + if (field->is_signed != test_field->is_signed) + return false; + + return true; +} + +static bool hist_trigger_match(struct event_trigger_data *data, + struct event_trigger_data *data_test, + struct event_trigger_data *named_data, + bool ignore_filter) +{ + struct tracing_map_sort_key *sort_key, *sort_key_test; + struct hist_trigger_data *hist_data, *hist_data_test; + struct hist_field *key_field, *key_field_test; + unsigned int i; + + if (named_data && (named_data != data_test) && + (named_data != data_test->named_data)) + return false; + + if (!named_data && is_named_trigger(data_test)) + return false; + + hist_data = data->private_data; + hist_data_test = data_test->private_data; + + if (hist_data->n_vals != hist_data_test->n_vals || + hist_data->n_fields != hist_data_test->n_fields || + hist_data->n_sort_keys != hist_data_test->n_sort_keys) + return false; + + if (!ignore_filter) { + if ((data->filter_str && !data_test->filter_str) || + (!data->filter_str && data_test->filter_str)) + return false; + } + + for_each_hist_field(i, hist_data) { + key_field = hist_data->fields[i]; + key_field_test = hist_data_test->fields[i]; + + if (key_field->flags != key_field_test->flags) + return false; + if (!compatible_field(key_field->field, key_field_test->field)) + return false; + if (key_field->offset != key_field_test->offset) + return false; + } + + for (i = 0; i 
< hist_data->n_sort_keys; i++) { + sort_key = &hist_data->sort_keys[i]; + sort_key_test = &hist_data_test->sort_keys[i]; + + if (sort_key->field_idx != sort_key_test->field_idx || + sort_key->descending != sort_key_test->descending) + return false; + } + + if (!ignore_filter && data->filter_str && + (strcmp(data->filter_str, data_test->filter_str) != 0)) + return false; + + return true; +} + +static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + int ret = 0; + + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) { + ret = -EINVAL; + goto out; + } + } + } + + if (hist_data->attrs->name && !named_data) + goto new; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); + else + ret = -EEXIST; + goto out; + } + } + new: + if (hist_data->attrs->cont || hist_data->attrs->clear) { + ret = -ENOENT; + goto out; + } + + if (named_data) { + destroy_hist_data(data->private_data); + data->private_data = named_data->private_data; + set_named_trigger_data(data, named_data); + data->ops = &event_hist_trigger_named_ops; + } + + if (hist_data->attrs->pause) + data->paused = true; + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + update_cond_flag(file); + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + 
update_cond_flag(file); + ret--; + } + out: + return ret; +} + +static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool unregistered = false; + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + unregistered = true; + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + break; + } + } + + if (unregistered && test->ops->free) + test->ops->free(test->ops, test); +} + +static void hist_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + if (test->ops->free) + test->ops->free(test->ops, test); + } + } +} + +static int event_hist_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; + struct event_trigger_data *trigger_data; + struct hist_trigger_attrs *attrs; + struct event_trigger_ops *trigger_ops; + struct hist_trigger_data *hist_data; + char *trigger; + int ret = 0; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (k:v [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + attrs = parse_hist_trigger_attrs(trigger); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + + if (attrs->map_bits) + hist_trigger_bits = attrs->map_bits; + + hist_data = 
create_hist_data(hist_trigger_bits, attrs, file); + if (IS_ERR(hist_data)) { + destroy_hist_trigger_attrs(attrs); + return PTR_ERR(hist_data); + } + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out_free; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + trigger_data->private_data = hist_data; + + /* if param is non-empty, it's supposed to be a filter */ + if (param && cmd_ops->set_filter) { + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + } + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + ret = 0; + goto out_free; + } + + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of triggers registered, + * but if it didn't register any it returns zero. Consider no + * triggers registered a failure too. 
+ */ + if (!ret) { + if (!(attrs->pause || attrs->cont || attrs->clear)) + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + /* Just return zero, not the number of registered triggers */ + ret = 0; + out: + return ret; + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + + kfree(trigger_data); + + destroy_hist_data(hist_data); + goto out; +} + +static struct event_command trigger_hist_cmd = { + .name = "hist", + .trigger_type = ETT_EVENT_HIST, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = event_hist_trigger_func, + .reg = hist_register_trigger, + .unreg = hist_unregister_trigger, + .unreg_all = hist_unreg_all, + .get_trigger_ops = event_hist_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +__init int register_trigger_hist_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_cmd); + WARN_ON(ret < 0); + + return ret; +} + +static void +hist_enable_trigger(struct event_trigger_data *data, void *rec) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (enable_data->enable) + test->paused = false; + else + test->paused = true; + } + } +} + +static void +hist_enable_count_trigger(struct event_trigger_data *data, void *rec) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + hist_enable_trigger(data, rec); +} + +static struct event_trigger_ops hist_enable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_enable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops 
hist_disable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops * +hist_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); + + if (enable) + ops = param ? &hist_enable_count_trigger_ops : + &hist_enable_trigger_ops; + else + ops = param ? &hist_disable_count_trigger_ops : + &hist_disable_trigger_ops; + + return ops; +} + +static void hist_enable_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) { + list_del_rcu(&test->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + if (test->ops->free) + test->ops->free(test->ops, test); + } + } +} + +static struct event_command trigger_hist_enable_cmd = { + .name = ENABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_hist_disable_cmd = { + .name = DISABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_hist_enable_disable_cmds(void) +{ + 
unregister_event_command(&trigger_hist_enable_cmd); + unregister_event_command(&trigger_hist_disable_cmd); +} + +__init int register_trigger_hist_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_hist_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_hist_enable_disable_cmds(); + + return ret; +} diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b38f617b6181..a975571cde24 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -28,8 +28,7 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); -static void -trigger_data_free(struct event_trigger_data *data) +void trigger_data_free(struct event_trigger_data *data) { if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); @@ -73,18 +72,20 @@ event_triggers_call(struct trace_event_file *file, void *rec) return tt; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (!rec) { - data->ops->func(data); + data->ops->func(data, rec); continue; } filter = rcu_dereference_sched(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; - if (data->cmd_ops->post_trigger) { + if (event_command_post_trigger(data->cmd_ops)) { tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data); + data->ops->func(data, rec); } return tt; } @@ -94,6 +95,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * @rec: The trace entry for the event * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command, if the @@ -104,13 +106,16 @@ 
EXPORT_SYMBOL_GPL(event_triggers_call); */ void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt) + enum event_trigger_type tt, + void *rec) { struct event_trigger_data *data; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data); + data->ops->func(data, rec); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -188,6 +193,19 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) return -ENODEV; } + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + struct trace_event_file *event_file; + struct event_command *p; + + event_file = event_file_data(file); + + list_for_each_entry(p, &trigger_commands, list) { + if (p->unreg_all) + p->unreg_all(event_file); + } + } + if (file->f_mode & FMODE_READ) { ret = seq_open(file, &event_triggers_seq_ops); if (!ret) { @@ -306,7 +324,7 @@ const struct file_operations event_trigger_fops = { * Currently we only register event commands from __init, so mark this * __init too. */ -static __init int register_event_command(struct event_command *cmd) +__init int register_event_command(struct event_command *cmd) { struct event_command *p; int ret = 0; @@ -329,7 +347,7 @@ static __init int register_event_command(struct event_command *cmd) * Currently we only unregister event commands from __init, so mark * this __init too. 
*/ -static __init int unregister_event_command(struct event_command *cmd) +__init int unregister_event_command(struct event_command *cmd) { struct event_command *p, *n; int ret = -ENODEV; @@ -395,9 +413,8 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -static int -event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) { data->ref++; return 0; @@ -425,8 +442,8 @@ event_trigger_free(struct event_trigger_ops *ops, trigger_data_free(data); } -static int trace_event_trigger_enable_disable(struct trace_event_file *file, - int trigger_enable) +int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable) { int ret = 0; @@ -483,13 +500,14 @@ clear_event_triggers(struct trace_array *tr) * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be * cleared. */ -static void update_cond_flag(struct trace_event_file *file) +void update_cond_flag(struct trace_event_file *file) { struct event_trigger_data *data; bool set_cond = false; list_for_each_entry_rcu(data, &file->triggers, list) { - if (data->filter || data->cmd_ops->post_trigger) { + if (data->filter || event_command_post_trigger(data->cmd_ops) || + event_command_needs_rec(data->cmd_ops)) { set_cond = true; break; } @@ -560,9 +578,9 @@ out: * Usually used directly as the @unreg method in event command * implementations. 
*/ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -623,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops, trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); @@ -696,9 +715,9 @@ event_trigger_callback(struct event_command *cmd_ops, * * Return: 0 on success, errno otherwise */ -static int set_trigger_filter(char *filter_str, - struct event_trigger_data *trigger_data, - struct trace_event_file *file) +int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file) { struct event_trigger_data *data = trigger_data; struct event_filter *filter = NULL, *tmp; @@ -746,8 +765,150 @@ static int set_trigger_filter(char *filter_str, return ret; } +static LIST_HEAD(named_triggers); + +/** + * find_named_trigger - Find the common named trigger associated with @name + * @name: The name of the set of named triggers to find the common data for + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * returns the common trigger data associated with that first + * registered instance. + * + * Return: the common trigger data for the given named trigger on + * success, NULL otherwise. 
+ */ +struct event_trigger_data *find_named_trigger(const char *name) +{ + struct event_trigger_data *data; + + if (!name) + return NULL; + + list_for_each_entry(data, &named_triggers, named_list) { + if (data->named_data) + continue; + if (strcmp(data->name, name) == 0) + return data; + } + + return NULL; +} + +/** + * is_named_trigger - determine if a given trigger is a named trigger + * @test: The trigger data to test + * + * Return: true if 'test' is a named trigger, false otherwise. + */ +bool is_named_trigger(struct event_trigger_data *test) +{ + struct event_trigger_data *data; + + list_for_each_entry(data, &named_triggers, named_list) { + if (test == data) + return true; + } + + return false; +} + +/** + * save_named_trigger - save the trigger in the named trigger list + * @name: The name of the named trigger set + * @data: The trigger data to save + * + * Return: 0 if successful, negative error otherwise. + */ +int save_named_trigger(const char *name, struct event_trigger_data *data) +{ + data->name = kstrdup(name, GFP_KERNEL); + if (!data->name) + return -ENOMEM; + + list_add(&data->named_list, &named_triggers); + + return 0; +} + +/** + * del_named_trigger - delete a trigger from the named trigger list + * @data: The trigger data to delete + */ +void del_named_trigger(struct event_trigger_data *data) +{ + kfree(data->name); + data->name = NULL; + + list_del(&data->named_list); +} + +static void __pause_named_trigger(struct event_trigger_data *data, bool pause) +{ + struct event_trigger_data *test; + + list_for_each_entry(test, &named_triggers, named_list) { + if (strcmp(test->name, data->name) == 0) { + if (pause) { + test->paused_tmp = test->paused; + test->paused = true; + } else { + test->paused = test->paused_tmp; + } + } + } +} + +/** + * pause_named_trigger - Pause all named triggers with the same name + * @data: The trigger data of a named trigger to pause + * + * Pauses a named trigger along with all other triggers having the + * same name. 
Because named triggers share a common set of data, + * pausing only one is meaningless, so pausing one named trigger needs + * to pause all triggers with the same name. + */ +void pause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, true); +} + +/** + * unpause_named_trigger - Un-pause all named triggers with the same name + * @data: The trigger data of a named trigger to unpause + * + * Un-pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * unpausing only one is meaningless, so unpausing one named trigger + * needs to unpause all triggers with the same name. + */ +void unpause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, false); +} + +/** + * set_named_trigger_data - Associate common named trigger data + * @data: The trigger data to associate + * @named_data: The common named trigger to be associated + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * associates the common trigger data from the first trigger with the + * given trigger. 
+ */ +void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data) +{ + data->named_data = named_data; +} + static void -traceon_trigger(struct event_trigger_data *data) +traceon_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -756,7 +917,7 @@ traceon_trigger(struct event_trigger_data *data) } static void -traceon_count_trigger(struct event_trigger_data *data) +traceon_count_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -771,7 +932,7 @@ traceon_count_trigger(struct event_trigger_data *data) } static void -traceoff_trigger(struct event_trigger_data *data) +traceoff_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -780,7 +941,7 @@ traceoff_trigger(struct event_trigger_data *data) } static void -traceoff_count_trigger(struct event_trigger_data *data) +traceoff_count_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -876,13 +1037,13 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data) +snapshot_trigger(struct event_trigger_data *data, void *rec) { tracing_snapshot(); } static void -snapshot_count_trigger(struct event_trigger_data *data) +snapshot_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -890,7 +1051,7 @@ snapshot_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - snapshot_trigger(data); + snapshot_trigger(data, rec); } static int @@ -969,13 +1130,13 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #define STACK_SKIP 3 static void -stacktrace_trigger(struct event_trigger_data *data) +stacktrace_trigger(struct event_trigger_data *data, void *rec) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data) 
+stacktrace_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -983,7 +1144,7 @@ stacktrace_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - stacktrace_trigger(data); + stacktrace_trigger(data, rec); } static int @@ -1017,7 +1178,7 @@ stacktrace_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, - .post_trigger = true, + .flags = EVENT_CMD_FL_POST_TRIGGER, .func = event_trigger_callback, .reg = register_trigger, .unreg = unregister_trigger, @@ -1044,17 +1205,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) unregister_event_command(&trigger_traceoff_cmd); } -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - -struct enable_trigger_data { - struct trace_event_file *file; - bool enable; -}; - static void -event_enable_trigger(struct event_trigger_data *data) +event_enable_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1065,7 +1217,7 @@ event_enable_trigger(struct event_trigger_data *data) } static void -event_enable_count_trigger(struct event_trigger_data *data) +event_enable_count_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1079,17 +1231,19 @@ event_enable_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - event_enable_trigger(data); + event_enable_trigger(data, rec); } -static int -event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; seq_printf(m, "%s:%s:%s", - enable_data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->hist ? + (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) : + (enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR), enable_data->file->event_call->class->system, trace_event_name(enable_data->file->event_call)); @@ -1106,9 +1260,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, return 0; } -static void -event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1153,10 +1306,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { .free = event_enable_trigger_free, }; -static int -event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1165,6 +1317,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, struct trace_array *tr = file->tr; const char *system; const char *event; + bool hist = false; char *trigger; char *number; bool enable; @@ -1189,8 +1342,15 @@ event_enable_trigger_func(struct event_command *cmd_ops, if (!event_enable_file) goto out; - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#ifdef CONFIG_HIST_TRIGGERS + hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || + (strcmp(cmd, DISABLE_HIST_STR) == 0)); + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#endif trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); ret = -ENOMEM; @@ -1210,6 +1370,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->list); 
RCU_INIT_POINTER(trigger_data->filter, NULL); + enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; trigger_data->private_data = enable_data; @@ -1287,10 +1448,10 @@ event_enable_trigger_func(struct event_command *cmd_ops, goto out; } -static int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) { struct enable_trigger_data *enable_data = data->private_data; struct enable_trigger_data *test_enable_data; @@ -1300,6 +1461,8 @@ static int event_enable_register_trigger(char *glob, list_for_each_entry_rcu(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && + (test->cmd_ops->trigger_type == + data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { ret = -EEXIST; goto out; @@ -1325,10 +1488,10 @@ out: return ret; } -static void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; struct enable_trigger_data *enable_data; @@ -1338,6 +1501,8 @@ static void event_enable_unregister_trigger(char *glob, list_for_each_entry_rcu(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && + (data->cmd_ops->trigger_type == + test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); @@ -1357,8 +1522,12 @@ event_enable_get_trigger_ops(char *cmd, char *param) struct event_trigger_ops *ops; bool enable; +#ifdef CONFIG_HIST_TRIGGERS + enable = 
((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; - +#endif if (enable) ops = param ? &event_enable_count_trigger_ops : &event_enable_trigger_ops; @@ -1429,6 +1598,8 @@ __init int register_trigger_cmds(void) register_trigger_snapshot_cmd(); register_trigger_stacktrace_cmd(); register_trigger_enable_disable_cmds(); + register_trigger_hist_enable_disable_cmds(); + register_trigger_hist_cmd(); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index fcd41a166405..5a095c2e4b69 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -219,6 +219,8 @@ static void tracing_stop_function_trace(struct trace_array *tr) unregister_ftrace_function(tr->ops); } +static struct tracer function_trace; + static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -228,6 +230,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + /* We can change this flag when not running. 
*/ + if (tr->current_trace != &function_trace) + break; + unregister_ftrace_function(tr->ops); if (set) { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..3a0244ff7ea8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -8,6 +8,7 @@ */ #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/fs.h> @@ -1350,7 +1351,7 @@ void graph_trace_open(struct trace_iterator *iter) out_err_free: kfree(data); out_err: - pr_warning("function graph tracer: not enough memory\n"); + pr_warn("function graph tracer: not enough memory\n"); } void graph_trace_close(struct trace_iterator *iter) @@ -1468,12 +1469,12 @@ static __init int init_graph_trace(void) max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index e4e56589ec1d..03cdff84d026 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -109,8 +109,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; local_save_flags(*flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(*flags)) + /* + * Slight chance to get a false positive on tracing_cpu, + * although I'm starting to think there isn't a chance. + * Leave this for now just to be paranoid. 
+ */ + if (!irqs_disabled_flags(*flags) && !preempt_count()) return 0; *data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -622,7 +626,6 @@ static int __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(&tr->trace_buffer); ftrace_init_array_ops(tr, irqsoff_tracer_call); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9956440d0e6..5546eec0505f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ struct trace_kprobe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ - unsigned long nhit; + unsigned long __percpu *nhit; const char *symbol; /* symbol name */ struct trace_probe tp; }; @@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk) return ERR_PTR(ret); + tk->nhit = alloc_percpu(unsigned long); + if (!tk->nhit) + goto error; + if (symbol) { tk->symbol = kstrdup(symbol, GFP_KERNEL); if (!tk->symbol) @@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, error: kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); return ERR_PTR(ret); } @@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk) kfree(tk->tp.call.class->system); kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); } @@ -453,16 +459,14 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + pr_warn("Could not insert probe at %s+%lu: %d\n", + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warning("This probe might be able to register after" - "target module is loaded. 
Continue.\n"); + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tk->rp.kp.addr); + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); ret = -EINVAL; } } @@ -523,7 +527,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } @@ -558,10 +562,9 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, __unregister_trace_kprobe(tk); ret = __register_trace_kprobe(tk); if (ret) - pr_warning("Failed to re-register probe %s on" - "%s: %d\n", - trace_event_name(&tk->tp.call), - mod->name, ret); + pr_warn("Failed to re-register probe %s on %s: %d\n", + trace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -874,9 +877,14 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), tk->nhit, + trace_event_name(&tk->tp.call), nhit, tk->rp.kp.nmissed); return 0; @@ -1141,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) return; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + 
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1176,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) return; entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ @@ -1225,7 +1235,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); @@ -1242,7 +1252,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, regs); @@ -1325,16 +1335,14 @@ static __init int init_kprobe_trace(void) /* Event list interface */ if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_events' entry\n"); + pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_profile' entry\n"); + pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); return 0; } fs_initcall(init_kprobe_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 
2be8c4f2403d..68f376ca6d3f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -146,7 +146,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, /* XXX: This is later than where events were lost. */ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); + pr_warn("mmiotrace has lost events\n"); overrun_detected = true; goto print_out; } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 8bb2071474dd..49f61fe96a6b 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -56,7 +56,7 @@ static void nop_trace_reset(struct trace_array *tr) } /* It only serves as a signal handler and a callback to - * accept or refuse tthe setting of a flag. + * accept or refuse the setting of a flag. * If you don't implement it, then the flag setting will be * automatically accepted. */ @@ -75,7 +75,7 @@ static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_NOP_OPT_REFUSE) { printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." - "Now cat trace_options to see the result\n", + " Now cat trace_options to see the result\n", set); return -EINVAL; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 282982195e09..0bb9cf2d53e6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,7 +389,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int nmi; + nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -415,10 +417,12 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) } hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : (hardirq && softirq) ? 'H' : - hardirq ? 'h' : - softirq ? 's' : - '.'; + hardirq ? 'h' : + softirq ? 's' : + '.' 
; trace_seq_printf(s, "%c%c%c", irqs_off, need_resched, hardsoft_irq); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 060df67dbdd1..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt { static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { if (!strcmp(pos->fmt, fmt)) return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); if (tb_fmt) { - *iter = tb_fmt->fmt; + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; continue; } @@ -296,6 +301,9 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; + if (!*fmt) + return 0; + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1769a81da8a7..1d372fa6fefb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -636,8 +636,8 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, *tmp = '\0'; size = tmp - kbuf + 1; } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6cf935316769..413ff108fbd0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -281,8 +281,7 @@ static int tracing_stat_init(void) stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create tracefs " - "'trace_stat' entry\n"); + pr_warn("Could not create tracefs 'trace_stat' entry\n"); return 0; } diff --git 
a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afbea83f..b2b6efc083a4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -106,6 +106,17 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } +const char *get_syscall_name(int syscall) +{ + struct syscall_metadata *entry; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + return NULL; + + return entry->name; +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -186,11 +197,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, extern char *__bad_type_size(void); -#define SYSCALL_FIELD(type, name) \ - sizeof(type) != sizeof(trace.name) ? \ +#define SYSCALL_FIELD(type, field, name) \ + sizeof(type) != sizeof(trace.field) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), \ - sizeof(trace.name), is_signed_type(type) + #type, #name, offsetof(typeof(trace), field), \ + sizeof(trace.field), is_signed_type(type) static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) @@ -261,7 +272,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) int i; int offset = offsetof(typeof(trace), args); - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; @@ -281,11 +293,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), + ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret), FILTER_OTHER); return ret; @@ -574,15 +587,16 @@ static void 
perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, NULL, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(rec, size, rctx, + sys_data->enter_event->event.type, 1, regs, + head, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -647,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, NULL, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, + 1, regs, head, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d2f6d0be3503..c53485441c88 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -334,7 +334,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } @@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); + entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) goto 
out; @@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, memset(data + len, 0, size - esize - len); } - perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); out: preempt_enable(); } diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c new file mode 100644 index 000000000000..0a689bbb78ef --- /dev/null +++ b/kernel/trace/tracing_map.c @@ -0,0 +1,1062 @@ +/* + * tracing_map - lock-free map for tracing + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> + * + * tracing_map implementation inspired by lock-free map algorithms + * originated by Dr. Cliff Click: + * + * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable + * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf + */ + +#include <linux/vmalloc.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/sort.h> + +#include "tracing_map.h" +#include "trace.h" + +/* + * NOTE: For a detailed description of the data structures used by + * these functions (such as tracing_map_elt) please see the overview + * of tracing_map data structures at the beginning of tracing_map.h. 
+ */ + +/** + * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * @n: The value to add to the sum + * + * Add n to sum i associated with the specified tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_sum_field() when the tracing map was set up. + */ +void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_add(n, &elt->fields[i].sum); +} + +/** + * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * + * Retrieve the value of the sum i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_sum_field() when the tracing map was set + * up. + * + * Return: The sum associated with field i for elt. + */ +u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->fields[i].sum); +} + +int tracing_map_cmp_string(void *val_a, void *val_b) +{ + char *a = val_a; + char *b = val_b; + + return strcmp(a, b); +} + +int tracing_map_cmp_none(void *val_a, void *val_b) +{ + return 0; +} + +static int tracing_map_cmp_atomic64(void *val_a, void *val_b) +{ + u64 a = atomic64_read((atomic64_t *)val_a); + u64 b = atomic64_read((atomic64_t *)val_b); + + return (a > b) ? 1 : ((a < b) ? -1 : 0); +} + +#define DEFINE_TRACING_MAP_CMP_FN(type) \ +static int tracing_map_cmp_##type(void *val_a, void *val_b) \ +{ \ + type a = *(type *)val_a; \ + type b = *(type *)val_b; \ + \ + return (a > b) ? 1 : ((a < b) ? 
-1 : 0); \ +} + +DEFINE_TRACING_MAP_CMP_FN(s64); +DEFINE_TRACING_MAP_CMP_FN(u64); +DEFINE_TRACING_MAP_CMP_FN(s32); +DEFINE_TRACING_MAP_CMP_FN(u32); +DEFINE_TRACING_MAP_CMP_FN(s16); +DEFINE_TRACING_MAP_CMP_FN(u16); +DEFINE_TRACING_MAP_CMP_FN(s8); +DEFINE_TRACING_MAP_CMP_FN(u8); + +tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed) +{ + tracing_map_cmp_fn_t fn = tracing_map_cmp_none; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = tracing_map_cmp_s64; + else + fn = tracing_map_cmp_u64; + break; + case 4: + if (field_is_signed) + fn = tracing_map_cmp_s32; + else + fn = tracing_map_cmp_u32; + break; + case 2: + if (field_is_signed) + fn = tracing_map_cmp_s16; + else + fn = tracing_map_cmp_u16; + break; + case 1: + if (field_is_signed) + fn = tracing_map_cmp_s8; + else + fn = tracing_map_cmp_u8; + break; + } + + return fn; +} + +static int tracing_map_add_field(struct tracing_map *map, + tracing_map_cmp_fn_t cmp_fn) +{ + int ret = -EINVAL; + + if (map->n_fields < TRACING_MAP_FIELDS_MAX) { + ret = map->n_fields; + map->fields[map->n_fields++].cmp_fn = cmp_fn; + } + + return ret; +} + +/** + * tracing_map_add_sum_field - Add a field describing a tracing_map sum + * @map: The tracing_map + * + * Add a sum field to the key and return the index identifying it in + * the map and associated tracing_map_elts. This is the index used + * for instance to update a sum for a particular tracing_map_elt using + * tracing_map_update_sum() or reading it via tracing_map_read_sum(). + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts, or -EINVAL on error. 
+ */ +int tracing_map_add_sum_field(struct tracing_map *map) +{ + return tracing_map_add_field(map, tracing_map_cmp_atomic64); +} + +/** + * tracing_map_add_key_field - Add a field describing a tracing_map key + * @map: The tracing_map + * @offset: The offset within the key + * @cmp_fn: The comparison function that will be used to sort on the key + * + * Let the map know there is a key and that if it's used as a sort key + * to use cmp_fn. + * + * A key can be a subset of a compound key; for that purpose, the + * offset param is used to describe where within the compound key + * the key referenced by this key field resides. + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn) + +{ + int idx = tracing_map_add_field(map, cmp_fn); + + if (idx < 0) + return idx; + + map->fields[idx].offset = offset; + + map->key_idx[map->n_keys++] = idx; + + return idx; +} + +void tracing_map_array_clear(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a->pages) + return; + + for (i = 0; i < a->n_pages; i++) + memset(a->pages[i], 0, PAGE_SIZE); +} + +/** + * tracing_map_array_free - Free a tracing_map_array and everything it owns + * @a: The tracing_map_array to free; may be NULL or only partially allocated + * + * Frees each allocated page (stopping at the first NULL slot, which marks + * where a partial allocation stopped), then the page-pointer table, and + * finally the array descriptor itself. + */ +void tracing_map_array_free(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a) + return; + + if (!a->pages) + goto free; + + for (i = 0; i < a->n_pages; i++) { + if (!a->pages[i]) + break; + free_page((unsigned long)a->pages[i]); + } + + /* The pointer table is a separate allocation from the pages. */ + kfree(a->pages); + + free: + kfree(a); +} + +struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, + unsigned int entry_size) +{ + struct tracing_map_array *a; + unsigned int i; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return NULL; + + a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1); + a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift); + a->n_pages = n_elts / a->entries_per_page; + if (!a->n_pages) + a->n_pages = 1; + a->entry_shift = fls(a->entries_per_page) - 1; + a->entry_mask 
= (1 << a->entry_shift) - 1; + + a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL); + if (!a->pages) + goto free; + + for (i = 0; i < a->n_pages; i++) { + a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!a->pages[i]) + goto free; + } + out: + return a; + free: + tracing_map_array_free(a); + a = NULL; + + goto out; +} + +static void tracing_map_elt_clear(struct tracing_map_elt *elt) +{ + unsigned i; + + for (i = 0; i < elt->map->n_fields; i++) + if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) + atomic64_set(&elt->fields[i].sum, 0); + + if (elt->map->ops && elt->map->ops->elt_clear) + elt->map->ops->elt_clear(elt); +} + +static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) +{ + unsigned int i; + + tracing_map_elt_clear(elt); + + for (i = 0; i < elt->map->n_fields; i++) { + elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn; + + if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64) + elt->fields[i].offset = elt->map->fields[i].offset; + } +} + +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + kfree(elt->fields); + kfree(elt->key); + kfree(elt); +} + +static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) +{ + struct tracing_map_elt *elt; + int err = 0; + + elt = kzalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return ERR_PTR(-ENOMEM); + + elt->map = map; + + elt->key = kzalloc(map->key_size, GFP_KERNEL); + if (!elt->key) { + err = -ENOMEM; + goto free; + } + + elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL); + if (!elt->fields) { + err = -ENOMEM; + goto free; + } + + tracing_map_elt_init_fields(elt); + + if (map->ops && map->ops->elt_alloc) { + err = map->ops->elt_alloc(elt); + if (err) + goto free; + } + return elt; + free: + tracing_map_elt_free(elt); + + return ERR_PTR(err); +} + +static struct tracing_map_elt *get_free_elt(struct tracing_map *map) +{ + 
/*
 * Report whether the first key_size bytes of key and test_key are
 * byte-for-byte identical.
 */
static inline bool keys_match(void *key, void *test_key, unsigned key_size)
{
	return memcmp(key, test_key, key_size) == 0;
}
(!elt) { + atomic64_inc(&map->drops); + entry->key = 0; + break; + } + + memcpy(elt->key, key, map->key_size); + entry->val = elt; + atomic64_inc(&map->hits); + + return entry->val; + } + } + + idx++; + } + + return NULL; +} + +/** + * tracing_map_insert - Insert key and/or retrieve val from a tracing_map + * @map: The tracing_map to insert into + * @key: The key to insert + * + * Inserts a key into a tracing_map and creates and returns a new + * tracing_map_elt for it, or if the key has already been inserted by + * a previous call, returns the tracing_map_elt already associated + * with it. When the map was created, the number of elements to be + * allocated for the map was specified (internally maintained as + * 'max_elts' in struct tracing_map), and that number of + * tracing_map_elts was created by tracing_map_init(). This is the + * pre-allocated pool of tracing_map_elts that tracing_map_insert() + * will allocate from when adding new keys. Once that pool is + * exhausted, tracing_map_insert() is useless and will return NULL to + * signal that state. There are two user-visible tracing_map + * variables, 'hits' and 'drops', which are updated by this function. + * Every time an element is either successfully inserted or retrieved, + * the 'hits' value is incrememented. Every time an element insertion + * fails, the 'drops' value is incremented. + * + * This is a lock-free tracing map insertion function implementing a + * modified form of Cliff Click's basic insertion algorithm. It + * requires the table size be a power of two. To prevent any + * possibility of an infinite loop we always make the internal table + * size double the size of the requested table size (max_elts * 2). + * Likewise, we never reuse a slot or resize or delete elements - when + * we've reached max_elts entries, we simply return NULL once we've + * run out of entries. Readers can at any point in time traverse the + * tracing map and safely access the key/val pairs. 
/**
 * tracing_map_lookup - Retrieve val from a tracing_map
 * @map: The tracing_map to perform the lookup on
 * @key: The key to look up
 *
 * Looks up key in tracing_map and if found returns the matching
 * tracing_map_elt.  This is a lock-free lookup; see
 * tracing_map_insert() for details on tracing_map and how it works.
 *
 * This runs __tracing_map_insert() in lookup-only mode, so a key that
 * is not present is never added to the map.  Of the two user-visible
 * tracing_map variables only 'hits' is updated by this function: it
 * is incremented every time an element is successfully retrieved.
 * The 'drops' value is never updated by this function.
 *
 * Return: the tracing_map_elt pointer val associated with the key.
 * If the key wasn't found, NULL is returned.
 */
struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key)
{
	return __tracing_map_insert(map, key, true);
}
+ */ +void tracing_map_destroy(struct tracing_map *map) +{ + if (!map) + return; + + tracing_map_free_elts(map); + + tracing_map_array_free(map->map); + kfree(map); +} + +/** + * tracing_map_clear - Clear a tracing_map + * @map: The tracing_map to clear + * + * Resets the tracing map to a cleared or initial state. The + * tracing_map_elts are all cleared, and the array of struct + * tracing_map_entry is reset to an initialized state. + * + * Callers should make sure there are no writers actively inserting + * into the map before calling this. + */ +void tracing_map_clear(struct tracing_map *map) +{ + unsigned int i; + + atomic_set(&map->next_elt, -1); + atomic64_set(&map->hits, 0); + atomic64_set(&map->drops, 0); + + tracing_map_array_clear(map->map); + + for (i = 0; i < map->max_elts; i++) + tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i))); +} + +static void set_sort_key(struct tracing_map *map, + struct tracing_map_sort_key *sort_key) +{ + map->sort_key = *sort_key; +} + +/** + * tracing_map_create - Create a lock-free map and element pool + * @map_bits: The size of the map (2 ** map_bits) + * @key_size: The size of the key for the map in bytes + * @ops: Optional client-defined tracing_map_ops instance + * @private_data: Client data associated with the map + * + * Creates and sets up a map to contain 2 ** map_bits number of + * elements (internally maintained as 'max_elts' in struct + * tracing_map). Before using, map fields should be added to the map + * with tracing_map_add_sum_field() and tracing_map_add_key_field(). + * tracing_map_init() should then be called to allocate the array of + * tracing_map_elts, in order to avoid allocating anything in the map + * insertion path. The user-specified map size reflects the maximum + * number of elements that can be contained in the table requested by + * the user - internally we double that in order to keep the table + * sparse and keep collisions manageable. 
+ * + * A tracing_map is a special-purpose map designed to aggregate or + * 'sum' one or more values associated with a specific object of type + * tracing_map_elt, which is attached by the map to a given key. + * + * tracing_map_create() sets up the map itself, and provides + * operations for inserting tracing_map_elts, but doesn't allocate the + * tracing_map_elts themselves, or provide a means for describing the + * keys or sums associated with the tracing_map_elts. All + * tracing_map_elts for a given map have the same set of sums and + * keys, which are defined by the client using the functions + * tracing_map_add_key_field() and tracing_map_add_sum_field(). Once + * the fields are defined, the pool of elements allocated for the map + * can be created, which occurs when the client code calls + * tracing_map_init(). + * + * When tracing_map_init() returns, tracing_map_elt elements can be + * inserted into the map using tracing_map_insert(). When called, + * tracing_map_insert() grabs a free tracing_map_elt from the pool, or + * finds an existing match in the map and in either case returns it. + * The client can then use tracing_map_update_sum() and + * tracing_map_read_sum() to update or read a given sum field for the + * tracing_map_elt. + * + * The client can at any point retrieve and traverse the current set + * of inserted tracing_map_elts in a tracing_map, via + * tracing_map_sort_entries(). Sorting can be done on any field, + * including keys. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. 
+ */ +struct tracing_map *tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data) +{ + struct tracing_map *map; + unsigned int i; + + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + return ERR_PTR(-EINVAL); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map_bits = map_bits; + map->max_elts = (1 << map_bits); + atomic_set(&map->next_elt, -1); + + map->map_size = (1 << (map_bits + 1)); + map->ops = ops; + + map->private_data = private_data; + + map->map = tracing_map_array_alloc(map->map_size, + sizeof(struct tracing_map_entry)); + if (!map->map) + goto free; + + map->key_size = key_size; + for (i = 0; i < TRACING_MAP_KEYS_MAX; i++) + map->key_idx[i] = -1; + out: + return map; + free: + tracing_map_destroy(map); + map = ERR_PTR(-ENOMEM); + + goto out; +} + +/** + * tracing_map_init - Allocate and clear a map's tracing_map_elts + * @map: The tracing_map to initialize + * + * Allocates a clears a pool of tracing_map_elts equal to the + * user-specified size of 2 ** map_bits (internally maintained as + * 'max_elts' in struct tracing_map). Before using, the map fields + * should be added to the map with tracing_map_add_sum_field() and + * tracing_map_add_key_field(). tracing_map_init() should then be + * called to allocate the array of tracing_map_elts, in order to avoid + * allocating anything in the map insertion path. The user-specified + * map size reflects the max number of elements requested by the user + * - internally we double that in order to keep the table sparse and + * keep collisions manageable. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. 
+ */ +int tracing_map_init(struct tracing_map *map) +{ + int err; + + if (map->n_fields < 2) + return -EINVAL; /* need at least 1 key and 1 val */ + + err = tracing_map_alloc_elts(map); + if (err) + return err; + + tracing_map_clear(map); + + return err; +} + +static int cmp_entries_dup(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + int ret = 0; + + if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size)) + ret = 1; + + return ret; +} + +static int cmp_entries_sum(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + const struct tracing_map_elt *elt_a, *elt_b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + elt_a = (*a)->elt; + elt_b = (*b)->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + cmp_fn = field->cmp_fn; + + val_a = &elt_a->fields[sort_key->field_idx].sum; + val_b = &elt_b->fields[sort_key->field_idx].sum; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static int cmp_entries_key(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + const struct tracing_map_elt *elt_a, *elt_b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + elt_a = (*a)->elt; + elt_b = (*b)->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + + cmp_fn = field->cmp_fn; + + val_a = elt_a->key + field->offset; + val_b = elt_b->key + field->offset; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static void destroy_sort_entry(struct tracing_map_sort_entry *entry) +{ + if (!entry) + return; + + if (entry->elt_copied) + tracing_map_elt_free(entry->elt); + + kfree(entry); +} + +/** + * 
tracing_map_destroy_sort_entries - Destroy an array of sort entries + * @entries: The entries to destroy + * @n_entries: The number of entries in the array + * + * Destroy the elements returned by a tracing_map_sort_entries() call. + */ +void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries) +{ + unsigned int i; + + for (i = 0; i < n_entries; i++) + destroy_sort_entry(entries[i]); + + vfree(entries); +} + +static struct tracing_map_sort_entry * +create_sort_entry(void *key, struct tracing_map_elt *elt) +{ + struct tracing_map_sort_entry *sort_entry; + + sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL); + if (!sort_entry) + return NULL; + + sort_entry->key = key; + sort_entry->elt = elt; + + return sort_entry; +} + +static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) +{ + struct tracing_map_elt *dup_elt; + unsigned int i; + + dup_elt = tracing_map_elt_alloc(elt->map); + if (IS_ERR(dup_elt)) + return NULL; + + if (elt->map->ops && elt->map->ops->elt_copy) + elt->map->ops->elt_copy(dup_elt, elt); + + dup_elt->private_data = elt->private_data; + memcpy(dup_elt->key, elt->key, elt->map->key_size); + + for (i = 0; i < elt->map->n_fields; i++) { + atomic64_set(&dup_elt->fields[i].sum, + atomic64_read(&elt->fields[i].sum)); + dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn; + } + + return dup_elt; +} + +static int merge_dup(struct tracing_map_sort_entry **sort_entries, + unsigned int target, unsigned int dup) +{ + struct tracing_map_elt *target_elt, *elt; + bool first_dup = (target - dup) == 1; + int i; + + if (first_dup) { + elt = sort_entries[target]->elt; + target_elt = copy_elt(elt); + if (!target_elt) + return -ENOMEM; + sort_entries[target]->elt = target_elt; + sort_entries[target]->elt_copied = true; + } else + target_elt = sort_entries[target]->elt; + + elt = sort_entries[dup]->elt; + + for (i = 0; i < elt->map->n_fields; i++) + atomic64_add(atomic64_read(&elt->fields[i].sum), + 
&target_elt->fields[i].sum); + + sort_entries[dup]->dup = true; + + return 0; +} + +static int merge_dups(struct tracing_map_sort_entry **sort_entries, + int n_entries, unsigned int key_size) +{ + unsigned int dups = 0, total_dups = 0; + int err, i, j; + void *key; + + if (n_entries < 2) + return total_dups; + + sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_dup, NULL); + + key = sort_entries[0]->key; + for (i = 1; i < n_entries; i++) { + if (!memcmp(sort_entries[i]->key, key, key_size)) { + dups++; total_dups++; + err = merge_dup(sort_entries, i - dups, i); + if (err) + return err; + continue; + } + key = sort_entries[i]->key; + dups = 0; + } + + if (!total_dups) + return total_dups; + + for (i = 0, j = 0; i < n_entries; i++) { + if (!sort_entries[i]->dup) { + sort_entries[j] = sort_entries[i]; + if (j++ != i) + sort_entries[i] = NULL; + } else { + destroy_sort_entry(sort_entries[i]); + sort_entries[i] = NULL; + } + } + + return total_dups; +} + +static bool is_key(struct tracing_map *map, unsigned int field_idx) +{ + unsigned int i; + + for (i = 0; i < map->n_keys; i++) + if (map->key_idx[i] == field_idx) + return true; + return false; +} + +static void sort_secondary(struct tracing_map *map, + const struct tracing_map_sort_entry **entries, + unsigned int n_entries, + struct tracing_map_sort_key *primary_key, + struct tracing_map_sort_key *secondary_key) +{ + int (*primary_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + int (*secondary_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + unsigned i, start = 0, n_sub = 1; + + if (is_key(map, primary_key->field_idx)) + primary_fn = cmp_entries_key; + else + primary_fn = cmp_entries_sum; + + if (is_key(map, secondary_key->field_idx)) + secondary_fn = cmp_entries_key; + else + secondary_fn = cmp_entries_sum; + + for (i = 0; i < n_entries - 1; i++) { + const struct 
tracing_map_sort_entry **a = &entries[i]; + const struct tracing_map_sort_entry **b = &entries[i + 1]; + + if (primary_fn(a, b) == 0) { + n_sub++; + if (i < n_entries - 2) + continue; + } + + if (n_sub < 2) { + start = i + 1; + n_sub = 1; + continue; + } + + set_sort_key(map, secondary_key); + sort(&entries[start], n_sub, + sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))secondary_fn, NULL); + set_sort_key(map, primary_key); + + start = i + 1; + n_sub = 1; + } +} + +/** + * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map + * @map: The tracing_map + * @sort_key: The sort key to use for sorting + * @sort_entries: outval: pointer to allocated and sorted array of entries + * + * tracing_map_sort_entries() sorts the current set of entries in the + * map and returns the list of tracing_map_sort_entries containing + * them to the client in the sort_entries param. The client can + * access the struct tracing_map_elt element of interest directly as + * the 'elt' field of a returned struct tracing_map_sort_entry object. + * + * The sort_key has only two fields: idx and descending. 'idx' refers + * to the index of the field added via tracing_map_add_sum_field() or + * tracing_map_add_key_field() when the tracing_map was initialized. + * 'descending' is a flag that if set reverses the sort order, which + * by default is ascending. + * + * The client should not hold on to the returned array but should use + * it and call tracing_map_destroy_sort_entries() when done. 
+ * + * Return: the number of sort_entries in the struct tracing_map_sort_entry + * array, negative on error + */ +int tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries) +{ + int (*cmp_entries_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + struct tracing_map_sort_entry *sort_entry, **entries; + int i, n_entries, ret; + + entries = vmalloc(map->max_elts * sizeof(sort_entry)); + if (!entries) + return -ENOMEM; + + for (i = 0, n_entries = 0; i < map->map_size; i++) { + struct tracing_map_entry *entry; + + entry = TRACING_MAP_ENTRY(map->map, i); + + if (!entry->key || !entry->val) + continue; + + entries[n_entries] = create_sort_entry(entry->val->key, + entry->val); + if (!entries[n_entries++]) { + ret = -ENOMEM; + goto free; + } + } + + if (n_entries == 0) { + ret = 0; + goto free; + } + + if (n_entries == 1) { + *sort_entries = entries; + return 1; + } + + ret = merge_dups(entries, n_entries, map->key_size); + if (ret < 0) + goto free; + n_entries -= ret; + + if (is_key(map, sort_keys[0].field_idx)) + cmp_entries_fn = cmp_entries_key; + else + cmp_entries_fn = cmp_entries_sum; + + set_sort_key(map, &sort_keys[0]); + + sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_fn, NULL); + + if (n_sort_keys > 1) + sort_secondary(map, + (const struct tracing_map_sort_entry **)entries, + n_entries, + &sort_keys[0], + &sort_keys[1]); + + *sort_entries = entries; + + return n_entries; + free: + tracing_map_destroy_sort_entries(entries, n_entries); + + return ret; +} diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h new file mode 100644 index 000000000000..618838f5f30a --- /dev/null +++ b/kernel/trace/tracing_map.h @@ -0,0 +1,283 @@ +#ifndef __TRACING_MAP_H +#define __TRACING_MAP_H + +#define TRACING_MAP_BITS_DEFAULT 11 +#define 
TRACING_MAP_BITS_MAX 17 +#define TRACING_MAP_BITS_MIN 7 + +#define TRACING_MAP_KEYS_MAX 2 +#define TRACING_MAP_VALS_MAX 3 +#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ + TRACING_MAP_VALS_MAX) +#define TRACING_MAP_SORT_KEYS_MAX 2 + +typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); + +/* + * This is an overview of the tracing_map data structures and how they + * relate to the tracing_map API. The details of the algorithms + * aren't discussed here - this is just a general overview of the data + * structures and how they interact with the API. + * + * The central data structure of the tracing_map is an initially + * zeroed array of struct tracing_map_entry (stored in the map field + * of struct tracing_map). tracing_map_entry is a very simple data + * structure containing only two fields: a 32-bit unsigned 'key' + * variable and a pointer named 'val'. This array of struct + * tracing_map_entry is essentially a hash table which will be + * modified by a single function, tracing_map_insert(), but which can + * be traversed and read by a user at any time (though the user does + * this indirectly via an array of tracing_map_sort_entry - see the + * explanation of that data structure in the discussion of the + * sorting-related data structures below). + * + * The central function of the tracing_map API is + * tracing_map_insert(). tracing_map_insert() hashes the + * arbitrarily-sized key passed into it into a 32-bit unsigned key. + * It then uses this key, truncated to the array size, as an index + * into the array of tracing_map_entries. If the value of the 'key' + * field of the tracing_map_entry found at that location is 0, then + * that entry is considered to be free and can be claimed, by + * replacing the 0 in the 'key' field of the tracing_map_entry with + * the new 32-bit hashed key. 
Once claimed, that tracing_map_entry's + * 'val' field is then used to store a unique element which will be + * forever associated with that 32-bit hashed key in the + * tracing_map_entry. + * + * That unique element now in the tracing_map_entry's 'val' field is + * an instance of tracing_map_elt, where 'elt' in the latter part of + * that variable name is short for 'element'. The purpose of a + * tracing_map_elt is to hold values specific to the particular + * 32-bit hashed key it's associated with. Things such as the unique + * set of aggregated sums associated with the 32-bit hashed key, along + * with a copy of the full key associated with the entry, and which + * was used to produce the 32-bit hashed key. + * + * When tracing_map_create() is called to create the tracing map, the + * user specifies (indirectly via the map_bits param, the details are + * unimportant for this discussion) the maximum number of elements + * that the map can hold (stored in the max_elts field of struct + * tracing_map). This is the maximum possible number of + * tracing_map_entries in the tracing_map_entry array which can be + * 'claimed' as described in the above discussion, and therefore is + * also the maximum number of tracing_map_elts that can be associated + * with the tracing_map_entry array in the tracing_map. Because of + * the way the insertion algorithm works, the size of the allocated + * tracing_map_entry array is always twice the maximum number of + * elements (2 * max_elts). This value is stored in the map_size + * field of struct tracing_map. + * + * Because tracing_map_insert() needs to work from any context, + * including from within the memory allocation functions themselves, + * both the tracing_map_entry array and a pool of max_elts + * tracing_map_elts are pre-allocated before any call is made to + * tracing_map_insert(). + * + * The tracing_map_entry array is allocated as a single block by + * tracing_map_create(). 
+ * + * Because the tracing_map_elts are much larger objects and can't + * generally be allocated together as a single large array without + * failure, they're allocated individually, by tracing_map_init(). + * + * The pool of tracing_map_elts is allocated by tracing_map_init() + * rather than by tracing_map_create() because at the time + * tracing_map_create() is called, there isn't enough information to + * create the tracing_map_elts. Specifically, the user first needs to + * tell the tracing_map implementation how many fields the + * tracing_map_elts contain, and which types of fields they are (key + * or sum). The user does this via the tracing_map_add_sum_field() + * and tracing_map_add_key_field() functions, following which the user + * calls tracing_map_init() to finish up the tracing map setup. The + * array holding the pointers which make up the pre-allocated pool of + * tracing_map_elts is allocated as a single block and is stored in + * the elts field of struct tracing_map. + * + * There is also a set of structures used for sorting that might + * benefit from some minimal explanation. + * + * struct tracing_map_sort_key is used to drive the sort at any given + * time. By 'any given time' we mean that a different + * tracing_map_sort_key will be used at different times depending on + * whether the sort currently being performed is a primary or a + * secondary sort. + * + * The sort key is very simple, consisting of the field index of the + * tracing_map_elt field to sort on (which the user saved when adding + * the field), and whether the sort should be done in an ascending or + * descending order. + * + * For the convenience of the sorting code, a tracing_map_sort_entry + * is created for each tracing_map_elt, again individually allocated + * to avoid failures that might be expected if allocated as a single + * large array of struct tracing_map_sort_entry. 
/*
 * Locate element idx of a paged tracing_map_array: the high bits of
 * idx (idx >> entry_shift) select the page, the low bits
 * (idx & entry_mask) select the slot within that page, and the slot
 * number is turned into a byte offset by shifting it left by
 * entry_size_shift.
 *
 * Every use of an argument is parenthesized so that expressions such
 * as '&array' or 'a | b' expand correctly.  Both arguments are still
 * evaluated more than once, so they must be free of side effects.
 */
#define TRACING_MAP_ARRAY_ELT(array, idx)				\
	((array)->pages[(idx) >> (array)->entry_shift] +		\
	 (((idx) & (array)->entry_mask) << (array)->entry_size_shift))
struct tracing_map_array *elts; + struct tracing_map_array *map; + const struct tracing_map_ops *ops; + void *private_data; + struct tracing_map_field fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_fields; + int key_idx[TRACING_MAP_KEYS_MAX]; + unsigned int n_keys; + struct tracing_map_sort_key sort_key; + atomic64_t hits; + atomic64_t drops; +}; + +/** + * struct tracing_map_ops - callbacks for tracing_map + * + * The methods in this structure define callback functions for various + * operations on a tracing_map or objects related to a tracing_map. + * + * For a detailed description of tracing_map_elt objects please see + * the overview of tracing_map data structures at the beginning of + * this file. + * + * All the methods below are optional. + * + * @elt_alloc: When a tracing_map_elt is allocated, this function, if + * defined, will be called and gives clients the opportunity to + * allocate additional data and attach it to the element + * (tracing_map_elt->private_data is meant for that purpose). + * Element allocation occurs before tracing begins, when the + * tracing_map_init() call is made by client code. + * + * @elt_copy: At certain points in the lifetime of an element, it may + * need to be copied. The copy should include a copy of the + * client-allocated data, which can be copied into the 'to' + * element from the 'from' element. + * + * @elt_free: When a tracing_map_elt is freed, this function is called + * and allows client-allocated per-element data to be freed. + * + * @elt_clear: This callback allows per-element client-defined data to + * be cleared, if applicable. + * + * @elt_init: This callback allows per-element client-defined data to + * be initialized when used i.e. when the element is actually + * claimed by tracing_map_insert() in the context of the map + * insertion. 
+ */ +struct tracing_map_ops { + int (*elt_alloc)(struct tracing_map_elt *elt); + void (*elt_copy)(struct tracing_map_elt *to, + struct tracing_map_elt *from); + void (*elt_free)(struct tracing_map_elt *elt); + void (*elt_clear)(struct tracing_map_elt *elt); + void (*elt_init)(struct tracing_map_elt *elt); +}; + +extern struct tracing_map * +tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data); +extern int tracing_map_init(struct tracing_map *map); + +extern int tracing_map_add_sum_field(struct tracing_map *map); +extern int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn); + +extern void tracing_map_destroy(struct tracing_map *map); +extern void tracing_map_clear(struct tracing_map *map); + +extern struct tracing_map_elt * +tracing_map_insert(struct tracing_map *map, void *key); +extern struct tracing_map_elt * +tracing_map_lookup(struct tracing_map *map, void *key); + +extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed); +extern int tracing_map_cmp_string(void *val_a, void *val_b); +extern int tracing_map_cmp_none(void *val_a, void *val_b); + +extern void tracing_map_update_sum(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); +extern void tracing_map_set_field_descr(struct tracing_map *map, + unsigned int i, + unsigned int key_offset, + tracing_map_cmp_fn_t cmp_fn); +extern int +tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries); + +extern void +tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries); +#endif /* __TRACING_MAP_H */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ecd536de603a..d0639d917899 100644 --- a/kernel/tracepoint.c +++ 
b/kernel/tracepoint.c @@ -491,7 +491,7 @@ static __init int init_tracepoints(void) ret = register_module_notifier(&tracepoint_module_nb); if (ret) - pr_warning("Failed to register tracepoint module enter notifier\n"); + pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e32bf..f8e26ab963ed 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; + do_div(stats->coremem, 1000 * KB); + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; + do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { - if (likely(tsk->mm)) { - cputime_t time, dtime; - struct timeval value; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = stime + utime; - dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; - - if (delta == 0) - goto out; - tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - out: - local_irq_restore(flags); - } + cputime_t time, dtime; + u64 delta; + + if (!likely(tsk->mm)) + return; + + time = stime + utime; + dtime = time - tsk->acct_timexpd; + /* Avoid division: cputime_t is often in nanoseconds already. 
*/ + delta = cputime_to_nsecs(dtime); + + if (delta < TICK_NSEC) + return; + + tsk->acct_timexpd = time; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. + */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; } /** @@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk, void acct_update_integrals(struct task_struct *tsk) { cputime_t utime, stime; + unsigned long flags; + local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); + local_irq_restore(flags); } /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b3ace6ebbba3..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -923,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * both lockup detectors are disabled if proc_watchdog_update() * returns an error. */ + if (old == new) + goto out; + err = proc_watchdog_update(); } out: @@ -967,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old; + int err, old, new; get_online_cpus(); mutex_lock(&watchdog_proc_mutex); @@ -987,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, /* * Update the sample period. Restore on failure. 
*/ + new = ACCESS_ONCE(watchdog_thresh); + if (old == new) + goto out; + set_sample_period(); err = proc_watchdog_update(); if (err) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ff5dc7d2ac5..e1c0e996b5ae 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -320,8 +320,7 @@ static bool wq_debug_force_rr_cpu = false; module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ @@ -434,54 +433,28 @@ static void *work_debug_hint(void *addr) return ((struct work_struct *) addr)->func; } -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int work_fixup_init(void *addr, enum debug_obj_state state) +static bool work_is_static_object(void *addr) { struct work_struct *work = addr; - switch (state) { - case ODEBUG_STATE_ACTIVE: - cancel_work_sync(work); - debug_object_init(work, &work_debug_descr); - return 1; - default: - return 0; - } + return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); } /* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * fixup_init is called when: + * - an active object is initialized */ -static int work_fixup_activate(void *addr, enum debug_obj_state state) +static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The work struct was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { - debug_object_init(work, &work_debug_descr); - debug_object_activate(work, &work_debug_descr); - return 0; - } - WARN_ON_ONCE(1); - return 0; - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - + cancel_work_sync(work); + debug_object_init(work, &work_debug_descr); + return true; default: - return 0; + return false; } } @@ -489,7 +462,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int work_fixup_free(void *addr, enum debug_obj_state state) +static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -497,17 +470,17 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); - return 1; + return true; default: - return 0; + return false; } } static struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, + .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, - .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, }; @@ -667,6 +640,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, */ smp_wmb(); set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + /* + * The following mb guarantees that previous clear of a PENDING bit + * will not be reordered with any speculative LOADS or STORES from + * work->current_func, which is executed afterwards. This possible + * reordering can lead to a missed execution on attempt to qeueue + * the same @work. E.g. 
consider this case: + * + * CPU#0 CPU#1 + * ---------------------------- -------------------------------- + * + * 1 STORE event_indicated + * 2 queue_work_on() { + * 3 test_and_set_bit(PENDING) + * 4 } set_..._and_clear_pending() { + * 5 set_work_data() # clear bit + * 6 smp_mb() + * 7 work->current_func() { + * 8 LOAD event_indicated + * } + * + * Without an explicit full barrier speculative LOAD on line 8 can + * be executed before CPU#0 does STORE on line 1. If that happens, + * CPU#0 observes the PENDING bit is still set and new execution of + * a @work is not queued in a hope, that CPU#1 will eventually + * finish the queued @work. Meanwhile CPU#1 does not see + * event_indicated is set, because speculative LOAD was executed + * before actual STORE. + */ + smp_mb(); } static void clear_work_data(struct work_struct *work) @@ -858,7 +860,6 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number * * This function is called during schedule() when a busy worker is * going to sleep. Worker on the same cpu can be woken up by @@ -870,7 +871,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) * Return: * Worker task on @cpu to wake up, %NULL if none. 
*/ -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -886,7 +887,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) pool = worker->pool; /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) + if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) return NULL; /* @@ -4527,6 +4528,17 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + + /* + * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED + * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is + * being reworked and this can go away in time. + */ + if (!(pool->flags & POOL_DISASSOCIATED)) { + spin_unlock_irq(&pool->lock); + return; + } + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { @@ -4696,7 +4708,7 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in user context on a particular cpu + * work_on_cpu - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg @@ -5222,8 +5234,8 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; wq_dev->dev.release = wq_device_release; + dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 45215870ac6c..8635417c587b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -69,6 +69,6 @@ static inline struct worker *current_wq_worker(void) * sched/core.c and workqueue.c. 
*/ void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |