From 133e1e5acd4a63c4a0dcc413e90d5decdbce9c4a Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 25 Jan 2016 18:04:15 -0500 Subject: audit: stop an old auditd being starved out by a new auditd Nothing prevents a new auditd starting up and replacing a valid audit_pid when an old auditd is still running, effectively starving out the old auditd since audit_pid no longer points to the old valid auditd. If no message to auditd has been attempted since auditd died unnaturally or got killed, audit_pid will still indicate it is alive. There isn't an easy way to detect if an old auditd is still running on the existing audit_pid other than attempting to send a message to see if it fails. An -ECONNREFUSED almost certainly means it disappeared and can be replaced. Other errors are not so straightforward and may indicate transient problems that will resolve themselves and the old auditd will recover. Yet others will likely need manual intervention for which a new auditd will not solve the problem. Send a new message type (AUDIT_REPLACE) to the old auditd containing a u32 with the PID of the new auditd. If the audit replace message succeeds (or doesn't fail with certainty), fail to register the new auditd and return an error (-EEXIST). This is expected to make the patch preventing an old auditd orphaning a new auditd redundant. V3: Switch audit message type from 1000 to 1300 block. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d6dd95cc59e6..2fd63d6879c5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -809,6 +809,16 @@ static int audit_set_feature(struct sk_buff *skb) return 0; } +static int audit_replace(pid_t pid) +{ + struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, + &pid, sizeof(pid)); + + if (!skb) + return -ENOMEM; + return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); +} + static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -870,9 +880,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (s.mask & AUDIT_STATUS_PID) { int new_pid = s.pid; + pid_t requesting_pid = task_tgid_vnr(current); - if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) + if ((!new_pid) && (requesting_pid != audit_pid)) return -EACCES; + if (audit_pid && new_pid && + audit_replace(requesting_pid) != -ECONNREFUSED) + return -EEXIST; if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; -- cgit v1.2.3 From 935c9e7ff06abf12c45155f75ec2f712d3768095 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 25 Jan 2016 18:04:15 -0500 Subject: audit: log failed attempts to change audit_pid configuration Failed attempts to change the audit_pid configuration are not presently logged. One case is an attempt to starve an old auditd by starting up a new auditd when the old one is still alive and active. The other case is an attempt to orphan a new auditd when an old auditd shuts down. Log both as AUDIT_CONFIG_CHANGE messages with failure result. 
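The AUDIT_REPLACE handshake above is driven entirely from the kernel; a user-space daemon only observes the outcome of its AUDIT_SET request. Below is a hedged, minimal sketch of such a registration attempt over NETLINK_AUDIT; the helper name register_as_auditd() and the omitted reply parsing are illustrative assumptions, not code from the real auditd.

/*
 * Hypothetical sketch: ask the kernel to make this process the audit
 * daemon.  With the patch above, the kernel first unicasts an
 * AUDIT_REPLACE message (carrying our PID) to the currently registered
 * auditd; unless that send fails with -ECONNREFUSED, this request is
 * answered with -EEXIST and we must not take over.
 */
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/audit.h>

static int register_as_auditd(void)
{
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct audit_status s;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (fd < 0)
		return -errno;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.s));
	req.nlh.nlmsg_type = AUDIT_SET;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
	req.s.mask = AUDIT_STATUS_PID;
	req.s.pid = getpid();

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -errno;
	}
	/* ...read the NLMSG_ERROR reply and give up if it carries -EEXIST... */
	return fd;
}

With the follow-up patch whose changelog appears above, both the -EACCES and the -EEXIST refusals are also recorded as AUDIT_CONFIG_CHANGE events with a failure result, so the losing daemon's attempt remains visible in the audit trail.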
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2fd63d6879c5..8fa7533bf106 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -882,11 +882,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int new_pid = s.pid; pid_t requesting_pid = task_tgid_vnr(current); - if ((!new_pid) && (requesting_pid != audit_pid)) + if ((!new_pid) && (requesting_pid != audit_pid)) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); return -EACCES; + } if (audit_pid && new_pid && - audit_replace(requesting_pid) != -ECONNREFUSED) + audit_replace(requesting_pid) != -ECONNREFUSED) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); return -EEXIST; + } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; -- cgit v1.2.3 From 37282a77954aa2dbb339d15902290f25b868d2e8 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Jan 2016 22:55:31 -0800 Subject: tty: audit: Combine push functions tty_audit_push() and tty_audit_push_current() perform identical tasks; eliminate the tty_audit_push() implementation and the tty_audit_push_current() name. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3a3e5deeda8d..610f221df069 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -920,7 +920,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_current(); + err = tty_audit_push(); if (err) break; } -- cgit v1.2.3 From 2e28d38ae1d9ced6ac2deb4001aca3f267304cdb Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sat, 9 Jan 2016 22:55:33 -0800 Subject: tty: audit: Handle tty audit enable atomically The audit_tty and audit_tty_log_passwd fields are actually bool values, so merge into single memory location to access atomically. 
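Before the diff below, a hedged standalone sketch of the flag-packing idiom it relies on; TTY_ENABLE and TTY_LOG_PASSWD are illustrative stand-ins for the kernel's AUDIT_TTY_* bits, and this is ordinary user-space C, not kernel code.

/*
 * Both booleans live in one word, so a single READ_ONCE()/xchg() in the
 * kernel can fetch or replace them atomically.  The "-flag & MASK" trick
 * turns a 0/1 boolean into 0 or an all-ones mask before the AND.
 */
#include <stdio.h>

#define TTY_ENABLE      0x1u
#define TTY_LOG_PASSWD  0x2u

int main(void)
{
	unsigned int enabled = 1, log_passwd = 0;
	unsigned int packed = enabled | (-log_passwd & TTY_LOG_PASSWD);

	printf("enabled=%d log_passwd=%d\n",
	       !!(packed & TTY_ENABLE), !!(packed & TTY_LOG_PASSWD));
	return 0;
}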
NB: audit log operations may still occur after tty audit is disabled which is consistent with the existing functionality Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- kernel/audit.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 610f221df069..2651e423b2dc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1030,20 +1030,19 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TTY_GET: { struct audit_tty_status s; - struct task_struct *tsk = current; + unsigned int t; - spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty; - s.log_passwd = tsk->signal->audit_tty_log_passwd; - spin_unlock(&tsk->sighand->siglock); + t = READ_ONCE(current->signal->audit_tty); + s.enabled = t & AUDIT_TTY_ENABLE; + s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; - struct task_struct *tsk = current; struct audit_buffer *ab; + unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ @@ -1053,14 +1052,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; - spin_lock(&tsk->sighand->siglock); - old.enabled = tsk->signal->audit_tty; - old.log_passwd = tsk->signal->audit_tty_log_passwd; - if (!err) { - tsk->signal->audit_tty = s.enabled; - tsk->signal->audit_tty_log_passwd = s.log_passwd; + if (err) + t = READ_ONCE(current->signal->audit_tty); + else { + t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); + t = xchg(¤t->signal->audit_tty, t); } - spin_unlock(&tsk->sighand->siglock); + old.enabled = t & AUDIT_TTY_ENABLE; + old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" -- cgit v1.2.3 From e11b956e9ebef098cc4f81964a1f57e40fe75cd4 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Sat, 30 Jan 2016 11:54:03 +0800 Subject: kernel/Makefile: remove the useless CFLAGS_REMOVE_cgroup-debug.o The file cgroup-debug.c had been removed from commit fe6934354f8e (cgroups: move the cgroup debug subsys into cgroup.c to access internal state). Remain the CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) useless in kernel/Makefile. Signed-off-by: Li Bin Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 53abf008ecb3..baa55e50a315 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -14,8 +14,7 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER -# Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +# Do not trace internal ftrace files CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif -- cgit v1.2.3 From 0306e481d479a58eff17c27adf213fbb5822946b Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Wed, 3 Feb 2016 01:11:40 +0530 Subject: cpufreq: powernv/tracing: Add powernv_throttle tracepoint This patch adds the powernv_throttle tracepoint to trace the CPU frequency throttling event, which is used by the powernv-cpufreq driver in POWER8. Signed-off-by: Shilpasri G Bhat Reviewed-by: Gautham R. Shenoy Signed-off-by: Rafael J. 
Wysocki --- kernel/trace/power-traces.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a132ec..81b87451c0ea 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- cgit v1.2.3 From 824bd0ce6c7c43a9e1e210abf124958e54d88342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:53 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do accurate counters without need to use BPF_XADD instruction which turned out to be too costly for high-performance network monitoring. In the typical use case the 'key' is the flow tuple or other long living object that sees a lot of events per second. bpf_map_lookup_elem() returns per-cpu area. Example: struct { u32 packets; u32 bytes; } * ptr = bpf_map_lookup_elem(&map, &key); /* ptr points to this_cpu area of the value, so the following * increments will not collide with other cpus */ ptr->packets ++; ptr->bytes += skb->len; bpf_update_elem() atomically creates a new element where all per-cpu values are zero initialized and this_cpu value is populated with given 'value'. Note that non-per-cpu hash map always allocates new element and then deletes old after rcu grace period to maintain atomicity of update. Per-cpu hash map updates element values in-place. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 275 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 228 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5b30fd8a315..2be5f6e8bb04 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -31,21 +31,27 @@ struct bpf_htab { struct htab_elem { struct hlist_node hash_node; struct rcu_head rcu; - u32 hash; + union { + u32 hash; + u32 key_size; + }; char key[0] __aligned(8); }; /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct bpf_htab *htab; int err, i; + u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); /* mandatory map attributes */ + htab->map.map_type = attr->map_type; htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; @@ -77,24 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; + if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) + /* make sure the size for pcpu_alloc() is reasonable */ + goto free_htab; + htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; + round_up(htab->map.key_size, 8); + if (percpu) + htab->elem_size += sizeof(void *); + else + htab->elem_size += htab->map.value_size; /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries >= - U32_MAX - PAGE_SIZE) + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (percpu) + cost += (u64) round_up(htab->map.value_size, 8) * + 
num_possible_cpus() * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), @@ -148,7 +164,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, } /* Called from syscall or from eBPF program */ -static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; @@ -166,6 +182,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); + return l; +} + +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + if (l) return l->key + round_up(map->key_size, 8); @@ -230,65 +253,139 @@ find_first_elem: return -ENOENT; } + +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static void htab_percpu_elem_free(struct htab_elem *l) +{ + free_percpu(htab_elem_get_ptr(l, l->key_size)); + kfree(l); +} + +static void htab_percpu_elem_free_rcu(struct rcu_head *head) +{ + struct htab_elem *l = container_of(head, struct htab_elem, rcu); + + htab_percpu_elem_free(l); +} + +static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) +{ + if (percpu) { + l->key_size = key_size; + call_rcu(&l->rcu, htab_percpu_elem_free_rcu); + } else { + kfree_rcu(l, rcu); + } +} + +static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, + void *value, u32 key_size, u32 hash, + bool percpu) +{ + u32 size = htab->map.value_size; + struct htab_elem *l_new; + void __percpu *pptr; + + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return NULL; + + memcpy(l_new->key, key, key_size); + if (percpu) { + /* round up value_size to 8 bytes */ + size = round_up(size, 8); + + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return NULL; + } + + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + htab_elem_set_ptr(l_new, key_size, pptr); + } else { + memcpy(l_new->key + round_up(key_size, 8), value, size); + } + + l_new->hash = hash; + return l_new; +} + +static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, + u64 map_flags) +{ + if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries)) + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + return -E2BIG; + + if (l_old && map_flags == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!l_old && map_flags == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + /* Called from syscall or from eBPF program */ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, 
map); - struct htab_elem *l_new, *l_old; + struct htab_elem *l_new = NULL, *l_old; struct hlist_head *head; - struct bucket *b; unsigned long flags; - u32 key_size; + struct bucket *b; + u32 key_size, hash; int ret; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held()); - /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return -ENOMEM; - key_size = map->key_size; - memcpy(l_new->key, key, key_size); - memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + hash = htab_map_hash(key, key_size); + + /* allocate new element outside of the lock, since + * we're most likley going to insert it + */ + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false); + if (!l_new) + return -ENOMEM; - l_new->hash = htab_map_hash(l_new->key, key_size); - b = __select_bucket(htab, l_new->hash); + b = __select_bucket(htab, hash); head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size); - if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - ret = -E2BIG; + ret = check_flags(htab, l_old, map_flags); + if (ret) goto err; - } - if (l_old && map_flags == BPF_NOEXIST) { - /* elem already exists */ - ret = -EEXIST; - goto err; - } - - if (!l_old && map_flags == BPF_EXIST) { - /* elem doesn't exist, cannot update it */ - ret = -ENOENT; - goto err; - } - - /* add new element to the head of the list, so that concurrent - * search will find it before old elem + /* add new element to the head of the list, so that + * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { @@ -298,7 +395,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, atomic_inc(&htab->count); } raw_spin_unlock_irqrestore(&b->lock, flags); - return 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); @@ -306,10 +402,64 @@ err: return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + /* per-cpu hash map can update value in-place */ + memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)), + value, htab->map.value_size); + } else { + l_new = alloc_htab_elem(htab, key, value, key_size, + hash, true); + if (!l_new) { + ret = -ENOMEM; + goto err; + } + hlist_add_head_rcu(&l_new->hash_node, head); + atomic_inc(&htab->count); + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + return ret; +} + /* Called from syscall or from eBPF 
program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct hlist_head *head; struct bucket *b; struct htab_elem *l; @@ -332,7 +482,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); atomic_dec(&htab->count); - kfree_rcu(l, rcu); + free_htab_elem(l, percpu, key_size); ret = 0; } @@ -352,7 +502,12 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); atomic_dec(&htab->count); - kfree(l); + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { + l->key_size = htab->map.key_size; + htab_percpu_elem_free(l); + } else { + kfree(l); + } } } } @@ -391,9 +546,35 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +/* Called from eBPF program */ +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + else + return NULL; +} + +static const struct bpf_map_ops htab_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_update_elem = htab_percpu_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_percpu_type __read_mostly = { + .ops = &htab_percpu_ops, + .type = BPF_MAP_TYPE_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); + bpf_register_map_type(&htab_percpu_type); return 0; } late_initcall(register_htab_map); -- cgit v1.2.3 From a10423b87a7eae75da79ce80a8d9475047a674ee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:54 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map Primary use case is a histogram array of latency where bpf program computes the latency of block requests or other events and stores histogram of latency into array of 64 elements. All cpus are constantly running, so normal increment is not accurate, bpf_xadd causes cache ping-pong and this per-cpu approach allows fastest collision-free counters. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 102 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 91 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 89ebbc4d1164..b9bf1d7949ca 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,11 +17,39 @@ #include #include +static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} + /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -67,12 +112,24 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -99,19 +156,24 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_array *array = 
container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, map->value_size); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); return 0; } @@ -133,6 +195,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -150,9 +215,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); -- cgit v1.2.3 From 15a07b33814d14ca817887dbea8530728dc0fbe4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:55 -0800 Subject: bpf: add lookup/update support for per-cpu hash and array maps The functions bpf_map_lookup_elem(map, key, value) and bpf_map_update_elem(map, key, value, flags) need to get/set values from all-cpus for per-cpu hash and array maps, so that user space can aggregate/update them as necessary. Example of single counter aggregation in user space: unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); long values[nr_cpus]; long value = 0; bpf_lookup_elem(fd, key, values); for (i = 0; i < nr_cpus; i++) value += values[i]; The user space must provide round_up(value_size, 8) * nr_cpus array to get/set values, since kernel will use 'long' copy of per-cpu values to try to copy good counters atomically. It's a best-effort, since bpf programs and user space are racing to access the same memory. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 64 +++++++++++++++++++++++++++++++++++++++ kernel/bpf/hashtab.c | 83 +++++++++++++++++++++++++++++++++++++++++++++------ kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++----------- 3 files changed, 178 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b9bf1d7949ca..bd3bdf2486a7 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -130,6 +130,32 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) return this_cpu_ptr(array->pptrs[index]); } +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -177,6 +203,44 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. 
During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall or from eBPF program */ static int array_map_delete_elem(struct bpf_map *map, void *key) { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2be5f6e8bb04..fd5db8fe9360 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -290,7 +290,7 @@ static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, - bool percpu) + bool percpu, bool onallcpus) { u32 size = htab->map.value_size; struct htab_elem *l_new; @@ -312,8 +312,18 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, return NULL; } - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } htab_elem_set_ptr(l_new, key_size, pptr); } else { memcpy(l_new->key + round_up(key_size, 8), value, size); @@ -368,7 +378,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* allocate new element outside of the lock, since * we're most likley going to insert it */ - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false); + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); if (!l_new) return -ENOMEM; @@ -402,8 +412,9 @@ err: return ret; } -static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -436,12 +447,25 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + /* per-cpu hash map can update value in-place */ - memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)), - value, htab->map.value_size); + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } } else { l_new = alloc_htab_elem(htab, key, value, key_size, - hash, true); + hash, true, onallcpus); if (!l_new) { ret = -ENOMEM; goto err; @@ -455,6 +479,12 @@ err: return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -557,6 +587,41 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + 
void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, true); +} + static const struct bpf_map_ops htab_percpu_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 637397059f76..c95a753c2007 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -239,6 +239,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -259,23 +260,35 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 
0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -298,6 +311,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -318,21 +332,30 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here - */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } free_value: kfree(value); -- cgit v1.2.3 From fd97646b05957348e01be3d9de5c3d979b25c819 Mon Sep 17 00:00:00 2001 From: Wei Yuan Date: Sat, 6 Feb 2016 15:39:47 +0800 Subject: audit: Fix typo in comment Signed-off-by: Weiyuan Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- kernel/auditfilter.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..0348b12b5a4d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -185,7 +185,7 @@ static struct audit_watch *audit_init_watch(char *path) return watch; } -/* Translate a watch string to kernel respresentation. */ +/* Translate a watch string to kernel representation. */ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b8ff9e193753..94ca7b1e5e7e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -158,7 +158,7 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } -/* Translate an inode field to kernel respresentation. */ +/* Translate an inode field to kernel representation. */ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { @@ -415,7 +415,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return 0; } -/* Translate struct audit_rule_data to kernel's rule respresentation. */ +/* Translate struct audit_rule_data to kernel's rule representation. */ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t datasz) { @@ -593,7 +593,7 @@ static inline size_t audit_pack_string(void **bufp, const char *str) return len; } -/* Translate kernel rule respresentation to struct audit_rule_data. */ +/* Translate kernel rule representation to struct audit_rule_data. 
*/ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { struct audit_rule_data *data; -- cgit v1.2.3 From f7b382b988233b5851eddf4531651ffe4133e88c Mon Sep 17 00:00:00 2001 From: Abhilash Jindal Date: Sun, 31 Jan 2016 14:29:01 -0500 Subject: PM/freezer: y2038, use boottime to compare tstamps Wall time obtained from do_gettimeofday gives 32 bit timeval which can only represent time until January 2038. This patch moves to ktime_t, a 64-bit time. Also, wall time is susceptible to sudden jumps due to user setting the time or due to NTP. Boot time is constantly increasing time better suited for subtracting two timestamps. Signed-off-by: Abhilash Jindal Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..df058bed53ce 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only) unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); -- cgit v1.2.3 From 22e09b333f0b395b3eb6ab6efa4b3284e2c06810 Mon Sep 17 00:00:00 2001 From: saurabh Date: Wed, 28 Oct 2015 08:54:01 +0530 Subject: PM / suspend: replacing printk replacing printk(s) with appropriate pr_info and pr_err in order to fix checkpatch.pl warnings Signed-off-by: Saurabh Sengar Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133c13e2..230a77225e2e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,7 @@ static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); return 1; @@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: late suspend of devices failed\n"); + pr_err("PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + pr_err("PM: noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); -- cgit v1.2.3 From 223ffb29f9723a4b485cacf6dc7e6d639fffc322 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 11 Feb 2016 13:34:49 -0500 Subject: cgroup: provide cgroup_nov1= to disable controllers in v1 mounts Testing cgroup2 can be painful with system software automatically mounting and populating all cgroup controllers in v1 mode. Sometimes they can be unmounted from rc.local, sometimes even that is too late. Provide a commandline option to disable certain controllers in v1 mounts, so that they remain available for cgroup2 mounts. Example use: cgroup_no_v1=memory,cpu cgroup_no_v1=all Disabling will be confirmed at boot-time as such: [ 0.013770] Disabling cpu control group subsystem in v1 mounts [ 0.016004] Disabling memory control group subsystem in v1 mounts Signed-off-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d27904c193da..7ad61915967f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -180,6 +180,9 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); */ static bool cgrp_dfl_root_visible; +/* Controllers blocked by the commandline in v1 */ +static unsigned long cgroup_no_v1_mask; + /* some controllers are not supported in the default hierarchy */ static unsigned long cgrp_dfl_root_inhibit_ss_mask; @@ -241,6 +244,11 @@ static bool cgroup_ssid_enabled(int ssid) return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -1678,6 +1686,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; + if (cgroup_ssid_no_v1(i)) + continue; /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) @@ -1698,7 +1708,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) opts->subsys_mask |= (1 << i); /* @@ -5324,6 +5334,10 @@ int __init cgroup_init(void) continue; } + if 
(cgroup_ssid_no_v1(ssid)) + printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", + ss->name); + cgrp_dfl_root.subsys_mask |= 1 << ss->id; if (!ss->dfl_cftypes) @@ -5750,6 +5764,33 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = ~0UL; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest -- cgit v1.2.3 From fc4fa6e112c0f999fab022a4eb7f6614bb47c7ab Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Sun, 13 Dec 2015 15:26:11 +0900 Subject: treewide: Fix typo in printk This patch fix spelling typos found in printk and Kconfig. Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b1356b7ae570..0d4cc7601df7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,7 +131,7 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { - printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", offset, name, max_cycles >> 1); printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } -- cgit v1.2.3 From 1e9877902dc7e11d2be038371c6fbf2dfcd469d7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:01:54 -0800 Subject: mm/gup: Introduce get_user_pages_remote() For protection keys, we need to understand whether protections should be enforced in software or not. In general, we enforce protections when working on our own task, but not when on others. We call these "current" and "remote" operations. This patch introduces a new get_user_pages() variant: get_user_pages_remote() Which is a replacement for when get_user_pages() is called on non-current tsk/mm. We also introduce a new gup flag: FOLL_REMOTE which can be used for the "__" gup variants to get this new behavior. The uprobes is_trap_at_addr() location holds mmap_sem and calls get_user_pages(current->mm) on an instruction address. This makes it a pretty unique gup caller. Being an instruction access and also really originating from the kernel (vs. the app), I opted to consider this a 'remote' access where protection keys will not be enforced. Without protection keys, this patch should not change any behavior. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vlastimil Babka Cc: jack@suse.cz Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210154.3F0E51EA@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0167679182c0..8eef5f55d3f0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -299,7 +299,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); if (ret <= 0) return ret; @@ -1700,7 +1700,13 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (likely(result == 0)) goto out; - result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + /* + * The NULL 'tsk' here ensures that any faults that occur here + * will not be accounted to the task. 'mm' *is* current->mm, + * but we treat this as a 'remote' access since it is + * essentially a kernel access to the memory. + */ + result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (result < 0) return result; -- cgit v1.2.3 From a79a908fd2b080977b45bf103184b81c9d11ad07 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 29 Jan 2016 02:54:06 -0600 Subject: cgroup: introduce cgroup namespaces Introduce the ability to create new cgroup namespace. The newly created cgroup namespace remembers the cgroup of the process at the point of creation of the cgroup namespace (referred as cgroupns-root). The main purpose of cgroup namespace is to virtualize the contents of /proc/self/cgroup file. Processes inside a cgroup namespace are only able to see paths relative to their namespace root (unless they are moved outside of their cgroupns-root, at which point they will see a relative path from their cgroupns-root). For a correctly setup container this enables container-tools (like libcontainer, lxc, lmctfy, etc.) to create completely virtualized containers without leaking system level cgroup hierarchy to the task. This patch only implements the 'unshare' part of the cgroupns. Signed-off-by: Aditya Kali Signed-off-by: Serge Hallyn Signed-off-by: Tejun Heo --- kernel/cgroup.c | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/cpuset.c | 8 +-- kernel/fork.c | 2 +- kernel/nsproxy.c | 19 +++++- 4 files changed, 192 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7ad61915967f..b001c5d36bec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -59,6 +59,9 @@ #include #include #include +#include +#include +#include #include /* @@ -212,6 +215,15 @@ static unsigned long have_fork_callback __read_mostly; static unsigned long have_exit_callback __read_mostly; static unsigned long have_free_callback __read_mostly; +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; + /* Ditto for the can_fork callback. 
*/ static unsigned long have_canfork_callback __read_mostly; @@ -2177,6 +2189,35 @@ static struct file_system_type cgroup2_fs_type = { .kill_sb = cgroup_kill_sb, }; +static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + int ret; + + ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); + if (ret < 0 || ret >= buflen) + return NULL; + return buf; +} + +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + char *ret; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(cgroup_path_ns); + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2204,7 +2245,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path(cgrp, buf, buflen); + path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ if (strlcpy(buf, "/", buflen) < buflen) @@ -5297,6 +5338,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + get_user_ns(init_cgroup_ns.user_ns); + mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ @@ -5438,7 +5481,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path(cgrp, buf, PATH_MAX); + path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + current->nsproxy->cgroup_ns); if (!path) { retval = -ENAMETOOLONG; goto out_unlock; @@ -5720,7 +5764,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); + spin_lock_bh(&css_set_lock); + path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_bh(&css_set_lock); if (!path) goto out; @@ -5931,6 +5977,127 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ +/* cgroup namespaces */ + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns = NULL; + struct css_set *cset = NULL; + int err; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. 
*/ + err = -EPERM; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + goto err_out; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cset = task_css_set(current); + get_css_set(cset); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + err = -ENOMEM; + new_ns = alloc_cgroup_ns(); + if (!new_ns) + goto err_out; + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->root_cset = cset; + + return new_ns; + +err_out: + if (cset) + put_css_set(cset); + kfree(new_ns); + return ERR_PTR(err); +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, void *ns) +{ + pr_info("setns not supported for cgroup namespace"); + return -EINVAL; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 41989ab4db57..d393125b228c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, goto out; retval = -ENAMETOOLONG; - rcu_read_lock(); - css = task_css(tsk, cpuset_cgrp_id); - p = cgroup_path(css->cgroup, buf, PATH_MAX); - rcu_read_unlock(); + css = task_get_css(tsk, cpuset_cgrp_id); + p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); if (!p) goto out_free; seq_puts(m, p); diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..6611a6267949 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1884,7 +1884,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 49746c81ad8d..782102e59eed 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -25,6 +25,7 @@ #include #include #include +#include static struct kmem_cache *nsproxy_cachep; @@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_NET .net_ns = &init_net, #endif +#ifdef CONFIG_CGROUPS + .cgroup_ns = &init_cgroup_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } + new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns, + tsk->nsproxy->cgroup_ns); + if (IS_ERR(new_nsp->cgroup_ns)) { + err = PTR_ERR(new_nsp->cgroup_ns); + goto out_cgroup; + } + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -101,6 
+112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: + put_cgroup_ns(new_nsp->cgroup_ns); +out_cgroup: if (new_nsp->pid_ns_for_children) put_pid_ns(new_nsp->pid_ns_for_children); out_pid: @@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *new_ns; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET)))) { + CLONE_NEWPID | CLONE_NEWNET | + CLONE_NEWCGROUP)))) { get_nsproxy(old_ns); return 0; } @@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); -- cgit v1.2.3 From a0530e087e648263f81a81d62ca020f66b54bcb0 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 29 Jan 2016 02:54:07 -0600 Subject: cgroup: cgroup namespace setns support setns on a cgroup namespace is allowed only if task has CAP_SYS_ADMIN in its current user-namespace and over the user-namespace associated with target cgroupns. No implicit cgroup changes happen with attaching to another cgroupns. It is expected that someone moves the attaching process under the target cgroupns-root. Signed-off-by: Aditya Kali Signed-off-by: Serge E. Hallyn Signed-off-by: Tejun Heo --- kernel/cgroup.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b001c5d36bec..b086a461be23 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6057,10 +6057,23 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) return container_of(ns, struct cgroup_namespace, ns); } -static int cgroupns_install(struct nsproxy *nsproxy, void *ns) +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) { - pr_info("setns not supported for cgroup namespace"); - return -EINVAL; + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; } static struct ns_common *cgroupns_get(struct task_struct *task) -- cgit v1.2.3 From ed82571b1a14ab2bfbede2bb2c209700495749fc Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Fri, 29 Jan 2016 02:54:09 -0600 Subject: cgroup: mount cgroupns-root when inside non-init cgroupns This patch enables cgroup mounting inside userns when a process has appropriate privileges. The cgroup filesystem mounted is rooted at the cgroupns-root. Thus, in a container-setup, only the hierarchy under the cgroupns-root is exposed inside the container. This allows container management tools to run inside the containers without depending on any global state.
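As an illustration of the resulting behaviour, a minimal userspace sketch (not part of the patch; it assumes a kernel with CLONE_NEWCGROUP and cgroup2 support, and the mount point is an arbitrary example that must already exist):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

#ifndef CLONE_NEWCGROUP
#define CLONE_NEWCGROUP 0x02000000
#endif

int main(void)
{
	/* Requires CAP_SYS_ADMIN in the caller's user namespace. */
	if (unshare(CLONE_NEWCGROUP)) {
		perror("unshare(CLONE_NEWCGROUP)");
		return 1;
	}

	/* A fresh mount is now rooted at the cgroupns-root, so only the
	 * subtree the caller was in at unshare time is visible. */
	if (mount("none", "/tmp/cgroup2", "cgroup2", 0, NULL)) {
		perror("mount cgroup2");
		return 1;
	}

	printf("cgroup2 mounted relative to the cgroup namespace root\n");
	return 0;
}

The same scoping applies to v1 hierarchies mounted from inside the namespace.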
Signed-off-by: Serge Hallyn Signed-off-by: Tejun Heo --- kernel/cgroup.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b086a461be23..24989022ff62 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1994,6 +1994,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; @@ -2002,6 +2003,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int i; bool new_sb; + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + /* * The first time anyone tries to mount a cgroup, enable the list * linking each css_set to its tasks and fix up all existing tasks. @@ -2012,6 +2021,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (is_v2) { if (data) { pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); return ERR_PTR(-EINVAL); } cgrp_dfl_root_visible = true; @@ -2117,6 +2127,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* + * We know this subsystem has not yet been bound. Users in a non-init + * user namespace may only mount hierarchies with no bound subsystems, + * i.e. 'none,name=user1' + */ + if (!opts.none && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_unlock; + } + root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) { ret = -ENOMEM; @@ -2135,12 +2155,37 @@ out_free: kfree(opts.release_agent); kfree(opts.name); - if (ret) + if (ret) { + put_cgroup_ns(ns); return ERR_PTR(ret); + } out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. + */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2153,6 +2198,7 @@ out_mount: deactivate_super(pinned_sb); } + put_cgroup_ns(ns); return dentry; } -- cgit v1.2.3 From 1c53753e0df1ae4d21661053459e7c024a43f1d3 Mon Sep 17 00:00:00 2001 From: Serge Hallyn Date: Fri, 29 Jan 2016 02:54:11 -0600 Subject: Add FS_USERNS_FLAG to cgroup fs allowing root in a non-init user namespace to mount it. This should now be safe, because 1. non-init-root cannot mount a previously unbound subsystem 2. the task doing the mount must be privileged with respect to the user namespace owning the cgroup namespace 3. the mounted subsystem will have its current cgroup as the root dentry. The permissions will be unchanged, so tasks will receive no new privilege over the cgroups which they did not have on the original mounts.
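A hedged sketch of the case the three points above are meant to cover (not part of the patch; the hierarchy name and mount point are arbitrary examples): root in a freshly created user namespace mounts a named, controller-less v1 hierarchy:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

#ifndef CLONE_NEWCGROUP
#define CLONE_NEWCGROUP 0x02000000
#endif

int main(void)
{
	/* Unshare the cgroup namespace together with the user namespace so
	 * that it is owned by the new user namespace; the mount-time
	 * ns_capable() checks are made against that owner. */
	if (unshare(CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWCGROUP)) {
		perror("unshare");
		return 1;
	}

	/* Allowed: a named hierarchy with no bound controllers. Binding a
	 * previously unbound controller from here would still be refused,
	 * since that needs CAP_SYS_ADMIN in the initial user namespace. */
	if (mount("none", "/tmp/cg_user1", "cgroup", 0, "none,name=user1")) {
		perror("mount named hierarchy");
		return 1;
	}
	return 0;
}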
Signed-off-by: Serge Hallyn --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 24989022ff62..afb1205fc789 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2227,12 +2227,14 @@ static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, -- cgit v1.2.3 From 23217b443b4b0439c8b55d3be0482d3cd7fbc5ac Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 17 Feb 2016 21:04:41 +0100 Subject: workqueue: Replace usage of init_name with dev_set_name() The init_name property of the device struct is sort of a hack and should only be used for statically allocated devices. Since the device is dynamically allocated here it is safe to use the proper way to set a device's name by calling dev_set_name(). Signed-off-by: Lars-Peter Clausen Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ff5dc7d2ac5..3a1c99b0c1b3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5222,8 +5222,8 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; wq_dev->dev.release = wq_device_release; + dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until -- cgit v1.2.3 From cd0ea35ff5511cde299a61c21a95889b4a71464e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Feb 2016 13:02:12 -0800 Subject: signals, pkeys: Notify userspace about protection key faults A protection key fault is very similar to any other access error. There must be a VMA, etc... We even want to take the same action (SIGSEGV) that we do with a normal access fault. However, we do need to let userspace know that something is different. We do this the same way we did with SEGV_BNDERR with Memory Protection eXtensions (MPX): define a new SEGV code: SEGV_PKUERR. We add a siginfo field: si_pkey that reveals to userspace which protection key was set on the PTE that we faulted on. There is no other easy way for userspace to figure this out. They could parse smaps but that would be a bit cruel. We share space in siginfo with _addr_bnd. #BR faults from MPX are completely separate from page faults (#PF) that trigger from protection key violations, so we never need both at the same time. Note that _pkey is a 64-bit value. The current hardware only supports 4-bit protection keys. We do this because there is _plenty_ of space in _sigfault and it is possible that future processors would support more than 4 bits of protection keys. The x86 code to actually fill in the siginfo is in the next patch. Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Cc: Al Viro Cc: Amanieu d'Antras Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Rik van Riel Cc: Sasha Levin Cc: Vegard Nossum Cc: Vladimir Davydov Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160212210212.3A9B83AC@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 0508544c8ced..fe8ed298373c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2708,6 +2708,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_lower, &to->si_lower); err |= __put_user(from->si_upper, &to->si_upper); } +#endif +#ifdef SEGV_PKUERR + if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR) + err |= __put_user(from->si_pkey, &to->si_pkey); #endif break; case __SI_CHLD: -- cgit v1.2.3 From d22025570e2ebfc68819b35c5d457e53d9337217 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Feb 2016 11:44:24 -0500 Subject: cgroup: fix alloc_cgroup_ns() error handling in copy_cgroup_ns() alloc_cgroup_ns() returns an ERR_PTR value on error but copy_cgroup_ns() was checking for NULL for error. Fix it. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index afb1205fc789..d92d91a4bb3e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6083,10 +6083,11 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, spin_unlock_bh(&css_set_lock); mutex_unlock(&cgroup_mutex); - err = -ENOMEM; new_ns = alloc_cgroup_ns(); - if (!new_ns) + if (IS_ERR(new_ns)) { + err = PTR_ERR(new_ns); goto err_out; + } new_ns->user_ns = get_user_ns(user_ns); new_ns->root_cset = cset; -- cgit v1.2.3 From 6bbd9a05a1f9839873a9290b5b7c6fafde8447ba Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 19 Feb 2016 13:53:10 -0500 Subject: bpf: grab rcu read lock for bpf_percpu_hash_update bpf_percpu_hash_update() expects rcu lock to be held and warns if it's not, which pointed out a missing rcu read lock. Fixes: 15a07b338 ("bpf: add lookup/update support for per-cpu hash and array maps") Signed-off-by: Sasha Levin Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fd5db8fe9360..a68e95133fcd 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -619,7 +619,13 @@ out: int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { - return __htab_percpu_map_update_elem(map, key, value, map_flags, true); + int ret; + + rcu_read_lock(); + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + rcu_read_unlock(); + + return ret; } static const struct bpf_map_ops htab_percpu_ops = { -- cgit v1.2.3 From 568b329a02f75ed3aaae5eb2cca384cb9e09cb29 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:57 -0800 Subject: perf: generalize perf_callchain . avoid walking the stack when there is no room left in the buffer . generalize get_perf_callchain() to be called from bpf helper Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/events/callchain.c | 32 ++++++++++++++++++++------------ kernel/events/internal.h | 2 -- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 9c418002b8c1..343c22f5e867 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -159,15 +159,24 @@ put_callchain_entry(int rctx) struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { - int rctx; - struct perf_callchain_entry *entry; - - int kernel = !event->attr.exclude_callchain_kernel; - int user = !event->attr.exclude_callchain_user; + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user; + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; if (!kernel && !user) return NULL; + return get_perf_callchain(regs, 0, kernel, user, crosstask, true); +} + +struct perf_callchain_entry * +get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + bool crosstask, bool add_mark) +{ + struct perf_callchain_entry *entry; + int rctx; + entry = get_callchain_entry(&rctx); if (rctx == -1) return NULL; @@ -175,10 +184,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!entry) goto exit_put; - entry->nr = 0; + entry->nr = init_nr; if (kernel && !user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + if (add_mark) + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); } @@ -191,13 +201,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) + if (crosstask) goto exit_put; - perf_callchain_store(entry, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2bbad9c1274c..4199b6d193f5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -182,8 +182,6 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); -extern int get_callchain_buffers(void); -extern void put_callchain_buffers(void); static inline int get_recursion_context(int *recursion) { -- cgit v1.2.3 From d5a3b1f691865be576c2bffa708549b8cdccda19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:58 -0800 Subject: bpf: introduce BPF_MAP_TYPE_STACK_TRACE add new map type to store stack traces and corresponding helper bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id @ctx: struct pt_regs* @map: pointer to stack_trace map @flags: bits 0-7 - number of stack frames to skip bit 8 - collect user stack instead of kernel bit 9 - compare stacks by hash only bit 10 - if two different stacks hash into the same stackid discard old other bits - reserved Return: >= 0 stackid on success or negative error stackid is a 32-bit integer handle that can be further combined with other data (including other stackid) and used as a key into maps. Userspace will access stackmap using standard lookup/delete syscall commands to retrieve full stack trace for given stackid. Signed-off-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/Makefile | 3 + kernel/bpf/stackmap.c | 237 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 6 +- kernel/trace/bpf_trace.c | 2 + 4 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/stackmap.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 13272582eee0..8a932d079c24 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,3 +2,6 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c new file mode 100644 index 000000000000..8a60ee14a977 --- /dev/null +++ b/kernel/bpf/stackmap.c @@ -0,0 +1,237 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include + +struct stack_map_bucket { + struct rcu_head rcu; + u32 hash; + u32 nr; + u64 ip[]; +}; + +struct bpf_stack_map { + struct bpf_map map; + u32 n_buckets; + struct stack_map_bucket __rcu *buckets[]; +}; + +/* Called from syscall */ +static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +{ + u32 value_size = attr->value_size; + struct bpf_stack_map *smap; + u64 cost, n_buckets; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + value_size < 8 || value_size % 8 || + value_size / 8 > PERF_MAX_STACK_DEPTH) + return ERR_PTR(-EINVAL); + + /* hash table size must be power of 2 */ + n_buckets = roundup_pow_of_two(attr->max_entries); + + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); + if (!smap) { + smap = vzalloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); + } + + err = -E2BIG; + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_smap; + + smap->map.map_type = attr->map_type; + smap->map.key_size = attr->key_size; + smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->n_buckets = n_buckets; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = get_callchain_buffers(); + if (err) + goto free_smap; + + return &smap->map; + +free_smap: + kvfree(smap); + return ERR_PTR(err); +} + +static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct perf_callchain_entry *trace; + struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 max_depth = map->value_size / 8; + /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ + u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 hash, id, trace_nr, trace_len; + bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, 
kernel, user, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + /* get_perf_callchain() guarantees that trace->nr >= init_nr + * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + */ + trace_nr = trace->nr - init_nr; + + if (trace_nr <= skip) + /* skipping more than usable stack trace */ + return -EFAULT; + + trace_nr -= skip; + trace_len = trace_nr * sizeof(u64); + ips = trace->ip + skip + init_nr; + hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); + id = hash & (smap->n_buckets - 1); + bucket = rcu_dereference(smap->buckets[id]); + + if (bucket && bucket->hash == hash) { + if (flags & BPF_F_FAST_STACK_CMP) + return id; + if (bucket->nr == trace_nr && + memcmp(bucket->ip, ips, trace_len) == 0) + return id; + } + + /* this call stack is not in the map, try to add it */ + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!new_bucket)) + return -ENOMEM; + + memcpy(new_bucket->ip, ips, trace_len); + memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len); + new_bucket->hash = hash; + new_bucket->nr = trace_nr; + + old_bucket = xchg(&smap->buckets[id], new_bucket); + if (old_bucket) + kfree_rcu(old_bucket, rcu); + return id; +} + +const struct bpf_func_proto bpf_get_stackid_proto = { + .func = bpf_get_stackid, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* Called from syscall or from eBPF program */ +static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return NULL; + bucket = rcu_dereference(smap->buckets[id]); + return bucket ? 
bucket->ip : NULL; +} + +static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EINVAL; +} + +static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int stack_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *old_bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return -E2BIG; + + old_bucket = xchg(&smap->buckets[id], NULL); + if (old_bucket) { + kfree_rcu(old_bucket, rcu); + return 0; + } else { + return -ENOENT; + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void stack_map_free(struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + int i; + + synchronize_rcu(); + + for (i = 0; i < smap->n_buckets; i++) + if (smap->buckets[i]) + kfree_rcu(smap->buckets[i], rcu); + kvfree(smap); + put_callchain_buffers(); +} + +static const struct bpf_map_ops stack_map_ops = { + .map_alloc = stack_map_alloc, + .map_free = stack_map_free, + .map_get_next_key = stack_map_get_next_key, + .map_lookup_elem = stack_map_lookup_elem, + .map_update_elem = stack_map_update_elem, + .map_delete_elem = stack_map_delete_elem, +}; + +static struct bpf_map_type_list stack_map_type __read_mostly = { + .ops = &stack_map_ops, + .type = BPF_MAP_TYPE_STACK_TRACE, +}; + +static int __init register_stack_map(void) +{ + bpf_register_map_type(&stack_map_type); + return 0; +} +late_initcall(register_stack_map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1d3e8f57de9..42ba4ccc020b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -246,6 +246,7 @@ static const struct { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, + {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid}, }; static void print_verifier_state(struct verifier_env *env) @@ -911,8 +912,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_func && bool_map != bool_func) + if (bool_func && bool_map != bool_func) { + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); return -EINVAL; + } } return 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 326a75e884db..4b8caa392b86 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -299,6 +299,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_read_proto; case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto; default: return NULL; } -- cgit v1.2.3 From a1db74209483a24c861c848b4bb79a4d945ef6fa Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 30 Dec 2015 07:35:30 -0500 Subject: module: replace copy_module_from_fd with kernel version Replace copy_module_from_fd() with kernel_read_file_from_fd(). Although none of the upstreamed LSMs define a kernel_module_from_file hook, IMA is called, based on policy, to prevent unsigned kernel modules from being loaded by the original kernel module syscall and to measure/appraise signed kernel modules. 
The security function security_kernel_module_from_file() was called prior to reading a kernel module. Preventing unsigned kernel modules from being loaded by the original kernel module syscall remains on the pre-read kernel_read_file() security hook. Instead of reading the kernel module twice, once for measuring/appraising and again for loading the kernel module, the signature validation is moved to the kernel_post_read_file() security hook. This patch removes the security_kernel_module_from_file() hook and security call. Signed-off-by: Mimi Zohar Acked-by: Kees Cook Acked-by: Luis R. Rodriguez Cc: Rusty Russell --- kernel/module.c | 68 +++++++-------------------------------------------------- 1 file changed, 8 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8358f4697c0c..955410928696 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2654,7 +2654,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_module_from_file(NULL); + err = security_kernel_read_file(NULL, READING_MODULE); if (err) return err; @@ -2672,63 +2672,6 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return 0; } -/* Sets info->hdr and info->len. */ -static int copy_module_from_fd(int fd, struct load_info *info) -{ - struct fd f = fdget(fd); - int err; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -ENOEXEC; - - err = security_kernel_module_from_file(f.file); - if (err) - goto out; - - err = vfs_getattr(&f.file->f_path, &stat); - if (err) - goto out; - - if (stat.size > INT_MAX) { - err = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - err = -EINVAL; - goto out; - } - - info->hdr = vmalloc(stat.size); - if (!info->hdr) { - err = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(info->hdr); - err = bytes; - goto out; - } - if (bytes == 0) - break; - pos += bytes; - } - info->len = pos; - -out: - fdput(f); - return err; -} - static void free_copy(struct load_info *info) { vfree(info->hdr); @@ -3589,8 +3532,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { - int err; struct load_info info = { }; + loff_t size; + void *hdr; + int err; err = may_init_module(); if (err) @@ -3602,9 +3547,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = copy_module_from_fd(fd, &info); + err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX, + READING_MODULE); if (err) return err; + info.hdr = hdr; + info.len = size; return load_module(&info, uargs, flags); } -- cgit v1.2.3 From b804defe4297157a9ff45863769efe9a01953398 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Thu, 14 Jan 2016 20:59:14 -0500 Subject: kexec: replace call to copy_file_from_fd() with kernel version Replace copy_file_from_fd() with kernel_read_file_from_fd(). Two new identifiers named READING_KEXEC_IMAGE and READING_KEXEC_INITRAMFS are defined for measuring, appraising or auditing the kexec image and initramfs. 
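For context, the userspace entry point that reaches this code is kexec_file_load(2). A hedged sketch of a caller (not part of the patch; the paths are illustrative and it assumes the libc headers expose SYS_kexec_file_load); with this change both descriptors are read through kernel_read_file_from_fd(), so IMA can measure or appraise them before loading:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);
	const char *cmdline = "root=/dev/sda1 ro";

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* cmdline length must include the trailing NUL. */
	if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline, 0UL)) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}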
Changelog v3: - return -EBADF, not -ENOEXEC - identifier change - split patch, moving copy_file_from_fd() to a separate patch - split patch, moving IMA changes to a separate patch v0: - use kstat file size type loff_t, not size_t - Calculate the file hash from the in memory buffer - Dave Young Signed-off-by: Mimi Zohar Acked-by: Kees Cook Acked-by: Luis R. Rodriguez Cc: Eric Biederman Acked-by: Dave Young --- kernel/kexec_file.c | 73 +++++++---------------------------------------------- 1 file changed, 9 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 007b791f676d..b696c3f3708f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -33,65 +34,6 @@ size_t __weak kexec_purgatory_size = 0; static int kexec_calculate_store_digests(struct kimage *image); -static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) -{ - struct fd f = fdget(fd); - int ret; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -EBADF; - - ret = vfs_getattr(&f.file->f_path, &stat); - if (ret) - goto out; - - if (stat.size > INT_MAX) { - ret = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - ret = -EINVAL; - goto out; - } - - *buf = vmalloc(stat.size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(*buf); - ret = bytes; - goto out; - } - - if (bytes == 0) - break; - pos += bytes; - } - - if (pos != stat.size) { - ret = -EBADF; - vfree(*buf); - goto out; - } - - *buf_len = pos; -out: - fdput(f); - return ret; -} - /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) @@ -182,16 +124,17 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, { int ret = 0; void *ldata; + loff_t size; - ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, - &image->kernel_buf_len); + ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, + &size, INT_MAX, READING_KEXEC_IMAGE); if (ret) return ret; + image->kernel_buf_len = size; /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); - if (ret) goto out; @@ -206,10 +149,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, - &image->initrd_buf_len); + ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, + &size, INT_MAX, + READING_KEXEC_INITRAMFS); if (ret) goto out; + image->initrd_buf_len = size; } if (cmdline_len) { -- cgit v1.2.3 From 8e2fe1d9f1a20924f98ea46931a1d7fb092aa876 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:22 +0100 Subject: bpf: add new arg_type that allows for 0 sized stack buffer Currently, when we pass a buffer from the eBPF stack into a helper function, the function proto indicates argument types as ARG_PTR_TO_STACK and ARG_CONST_STACK_SIZE pair. If R contains the former, then R must be of the latter type. Then, verifier checks whether the buffer points into eBPF stack, is initialized, etc. 
The verifier also guarantees that the constant value passed in R is greater than 0, so helper functions don't need to test for it and can always assume a non-NULL initialized buffer as well as non-0 buffer size. This patch adds a new argument type, ARG_CONST_STACK_SIZE_OR_ZERO, that also allows NULL to be passed as R and 0 as R into the helper function. Such helper functions, of course, need to be able to handle these cases internally. Verifier guarantees that either R == NULL && R == 0 or R != NULL && R != 0 (like the case of ARG_CONST_STACK_SIZE), any other combinations are not possible to load. I went through various options of extending the verifier, and introducing the type ARG_CONST_STACK_SIZE_OR_ZERO seems to require the most minimal changes to the verifier. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 42 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 42ba4ccc020b..36dc497deaa3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -779,15 +779,24 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, - int regno, int access_size) +static int check_stack_boundary(struct verifier_env *env, int regno, + int access_size, bool zero_size_allowed) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; int off, i; - if (regs[regno].type != PTR_TO_STACK) + if (regs[regno].type != PTR_TO_STACK) { + if (zero_size_allowed && access_size == 0 && + regs[regno].type == CONST_IMM && + regs[regno].imm == 0) + return 0; + + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[regs[regno].type], + reg_type_str[PTR_TO_STACK]); return -EACCES; + } off = regs[regno].imm; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -830,15 +839,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - } else if (arg_type == ARG_CONST_STACK_SIZE) { + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + } else if (arg_type == ARG_PTR_TO_STACK) { + expected_type = PTR_TO_STACK; + /* One exception here. In case function allows for NULL to be + * passed in as argument, it's a CONST_IMM type. Final test + * happens during stack boundary checking.
+ */ + if (reg->type == CONST_IMM && reg->imm == 0) + expected_type = CONST_IMM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -868,8 +886,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size); - + err = check_stack_boundary(env, regno, (*mapp)->key_size, + false); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -879,9 +897,12 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size); + err = check_stack_boundary(env, regno, (*mapp)->value_size, + false); + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); - } else if (arg_type == ARG_CONST_STACK_SIZE) { /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it * note: regno == len, regno - 1 == buf @@ -891,7 +912,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm); + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed); } return err; -- cgit v1.2.3 From b598dde354de22d87f664a7b99b8c21437da8efb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:45 -0500 Subject: cgroup: fix error return value of cgroup_addrm_files() cgroup_addrm_files() incorrectly returned 0 after add failure. Fix it. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7ad61915967f..68b032df77f5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3369,7 +3369,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, bool is_add) { struct cftype *cft, *cft_end = NULL; - int ret; + int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -3398,7 +3398,7 @@ restart: cgroup_rm_file(cgrp, cft); } } - return 0; + return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) -- cgit v1.2.3 From 5eb385cc5ae1b31fbcdd727854a00c5a083f6b9b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:46 -0500 Subject: Revert "cgroup: add cgroup_subsys->css_e_css_changed()" This reverts commit 56c807ba4e91f0980567b6a69de239677879b17f. cgroup_subsys->css_e_css_changed() was supposed to be used by cgroup writeback support; however, the change to per-inode cgroup association made it unnecessary and the callback doesn't have any user. Remove it. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- kernel/cgroup.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 68b032df77f5..7727b6e43e10 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3127,24 +3127,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } } - /* - * The effective csses of all the descendants (excluding @cgrp) may - * have changed. 
Subsystems can optionally subscribe to this event - * by implementing ->css_e_css_changed() which is invoked if any of - * the effective csses seen from the css's cgroup may have changed. - */ - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); - struct cgroup_subsys_state *css; - - if (!ss->css_e_css_changed || !this_css) - continue; - - css_for_each_descendant_pre(css, this_css) - if (css != this_css) - ss->css_e_css_changed(css); - } - kernfs_activate(cgrp->kn); ret = 0; out_unlock: -- cgit v1.2.3 From 8699b7762a623c46ced891b3cf490058b56cf99c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:46 -0500 Subject: cgroup: s/child_subsys_mask/subtree_ss_mask/ For consistency with cgroup->subtree_control. * cgroup->child_subsys_mask -> cgroup->subtree_ss_mask * cgroup_calc_child_subsys_mask() -> cgroup_calc_subtree_ss_mask() * cgroup_refresh_child_subsys_mask() -> cgroup_refresh_subtree_ss_mask() No functional changes. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- kernel/cgroup.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7727b6e43e10..f3cd67bfe6c0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -391,10 +391,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, /* * This function is used while updating css associations and thus - * can't test the csses directly. Use ->child_subsys_mask. + * can't test the csses directly. Use ->subtree_ss_mask. */ while (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + !(cgroup_parent(cgrp)->subtree_ss_mask & (1 << ss->id))) cgrp = cgroup_parent(cgrp); return cgroup_css(cgrp, ss); @@ -1256,7 +1256,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } /** - * cgroup_calc_child_subsys_mask - calculate child_subsys_mask + * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @cgrp: the target cgroup * @subtree_control: the new subtree_control mask to consider * @@ -1268,8 +1268,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * @subtree_control is to be applied to @cgrp. The returned mask is always * a superset of @subtree_control and follows the usual hierarchy rules. */ -static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, - unsigned long subtree_control) +static unsigned long cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, + unsigned long subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); unsigned long cur_ss_mask = subtree_control; @@ -1293,7 +1293,7 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, * to non-default hierarchies. */ if (parent) - new_ss_mask &= parent->child_subsys_mask; + new_ss_mask &= parent->subtree_ss_mask; else new_ss_mask &= cgrp->root->subsys_mask; @@ -1306,16 +1306,16 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * cgroup_refresh_subtree_ss_mask - update subtree_ss_mask * @cgrp: the target cgroup * - * Update @cgrp->child_subsys_mask according to the current - * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). + * Update @cgrp->subtree_ss_mask according to the current + * @cgrp->subtree_control using cgroup_calc_subtree_ss_mask(). 
*/ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +static void cgroup_refresh_subtree_ss_mask(struct cgroup *cgrp) { - cgrp->child_subsys_mask = - cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); + cgrp->subtree_ss_mask = + cgroup_calc_subtree_ss_mask(cgrp, cgrp->subtree_control); } /** @@ -1542,7 +1542,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, src_root->subsys_mask &= ~(1 << ssid); scgrp->subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(scgrp); + cgroup_refresh_subtree_ss_mask(scgrp); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; @@ -1550,7 +1550,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(dcgrp); + cgroup_refresh_subtree_ss_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } @@ -2523,11 +2523,11 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); /* - * Except for the root, child_subsys_mask must be zero for a cgroup + * Except for the root, subtree_ss_mask must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->child_subsys_mask) + dst_cgrp->subtree_ss_mask) return -EBUSY; /* look up the dst cset for each src cset and link it to src */ @@ -2880,7 +2880,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * - * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * @cgrp's subtree_ss_mask has changed and its subtree's (self excluded) * css associations need to be updated accordingly. This function looks up * all css_sets which are attached to the subtree, creates the matching * updated css_sets and migrates the tasks to the new ones. @@ -2902,7 +2902,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { struct cgrp_cset_link *link; - /* self is not affected by child_subsys_mask change */ + /* self is not affected by subtree_ss_mask change */ if (css->cgroup == cgrp) continue; @@ -3034,9 +3034,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * depending on subsystem dependencies. */ old_sc = cgrp->subtree_control; - old_ss = cgrp->child_subsys_mask; + old_ss = cgrp->subtree_ss_mask; new_sc = (old_sc | enable) & ~disable; - new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); + new_ss = cgroup_calc_subtree_ss_mask(cgrp, new_sc); css_enable = ~old_ss & new_ss; css_disable = old_ss & ~new_ss; @@ -3069,7 +3069,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } cgrp->subtree_control = new_sc; - cgrp->child_subsys_mask = new_ss; + cgrp->subtree_ss_mask = new_ss; /* * Create new csses or make the existing ones visible. 
A css is @@ -3135,7 +3135,7 @@ out_unlock: err_undo_css: cgrp->subtree_control = old_sc; - cgrp->child_subsys_mask = old_ss; + cgrp->subtree_ss_mask = old_ss; for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -4969,7 +4969,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (parent->child_subsys_mask & (1 << ssid)) { + if (parent->subtree_ss_mask & (1 << ssid)) { ret = create_css(cgrp, ss, parent->subtree_control & (1 << ssid)); if (ret) @@ -4983,7 +4983,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ if (!cgroup_on_dfl(cgrp)) { cgrp->subtree_control = parent->subtree_control; - cgroup_refresh_child_subsys_mask(cgrp); + cgroup_refresh_subtree_ss_mask(cgrp); } kernfs_activate(kn); -- cgit v1.2.3 From b4e0eeafba61b141c3af22d6636be3f477c5d3bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:46 -0500 Subject: cgroup: convert for_each_subsys_which() to do-while style for_each_subsys_which() allows iterating subsystems specified in a subsystem bitmask; unfortunately, it requires the mask to be an unsigned long l-value which can be inconvenient and makes it awkward to use a smaller type for subsystem masks. This patch converts for_each_subsy_which() to do-while style which allows it to drop the l-value requirement. The new iterator is named do_each_subsys_mask() / while_each_subsys_mask(). Signed-off-by: Tejun Heo Cc: Aleksa Sarai Acked-by: Johannes Weiner --- kernel/cgroup.c | 72 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f3cd67bfe6c0..5d102980dc27 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -514,22 +514,28 @@ static int notify_on_release(const struct cgroup *cgrp) (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /** - * for_each_subsys_which - filter for_each_subsys with a bitmask + * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * @ss_maskp: a pointer to the bitmask + * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of - * mask is set to 1. + * @ss_mask is set. */ -#define for_each_subsys_which(ss, ssid, ss_maskp) \ - if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ +#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ + unsigned long __ss_mask = (ss_mask); \ + if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ (ssid) = 0; \ - else \ - for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ - if (((ss) = cgroup_subsys[ssid]) && false) \ - break; \ - else + break; \ + } \ + for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ + (ss) = cgroup_subsys[ssid]; \ + { + +#define while_each_subsys_mask() \ + } \ + } \ +} while (false) /* iterate across the hierarchies */ #define for_each_root(root) \ @@ -1284,8 +1290,9 @@ static unsigned long cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, while (true) { unsigned long new_ss_mask = cur_ss_mask; - for_each_subsys_which(ss, ssid, &cur_ss_mask) + do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; + } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. 
This can @@ -1469,7 +1476,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, lockdep_assert_held(&cgroup_mutex); - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { /* if @ss has non-root csses attached to it, can't move */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; @@ -1477,14 +1484,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; - } + } while_each_subsys_mask(); /* skip creating root files on dfl_root for inhibited subsystems */ tmp_ss_mask = ss_mask; if (dst_root == &cgrp_dfl_root) tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, ssid, tmp_ss_mask) { struct cgroup *scgrp = &ss->root->cgrp; int tssid; @@ -1507,19 +1514,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root, continue; } - for_each_subsys_which(ss, tssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, tssid, tmp_ss_mask) { if (tssid == ssid) break; css_clear_dir(cgroup_css(scgrp, ss), dcgrp); - } + } while_each_subsys_mask(); return ret; - } + } while_each_subsys_mask(); /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); @@ -1556,7 +1563,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (ss->bind) ss->bind(css); - } + } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; @@ -2838,12 +2845,12 @@ static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) bool printed = false; int ssid; - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_printf(seq, "%s", ss->name); printed = true; - } + } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } @@ -2956,11 +2963,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { - unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; - if (tok[0] == '\0') continue; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_root_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; @@ -2975,7 +2980,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } break; - } + } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } @@ -3049,7 +3054,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * still around. In such cases, wait till it's gone using * offline_waitq. 
*/ - for_each_subsys_which(ss, ssid, &css_enable) { + do_each_subsys_mask(ss, ssid, css_enable) { cgroup_for_each_live_child(child, cgrp) { DEFINE_WAIT(wait); @@ -3066,7 +3071,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return restart_syscall(); } - } + } while_each_subsys_mask(); cgrp->subtree_control = new_sc; cgrp->subtree_ss_mask = new_ss; @@ -5509,11 +5514,11 @@ int cgroup_can_fork(struct task_struct *child) struct cgroup_subsys *ss; int i, j, ret; - for_each_subsys_which(ss, i, &have_canfork_callback) { + do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child); if (ret) goto out_revert; - } + } while_each_subsys_mask(); return 0; @@ -5598,8 +5603,9 @@ void cgroup_post_fork(struct task_struct *child) * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ - for_each_subsys_which(ss, i, &have_fork_callback) + do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); + } while_each_subsys_mask(); } /** @@ -5642,8 +5648,9 @@ void cgroup_exit(struct task_struct *tsk) } /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) + do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); + } while_each_subsys_mask(); } void cgroup_free(struct task_struct *task) @@ -5652,8 +5659,9 @@ void cgroup_free(struct task_struct *task) struct cgroup_subsys *ss; int ssid; - for_each_subsys_which(ss, ssid, &have_free_callback) + do_each_subsys_mask(ss, ssid, have_free_callback) { ss->free(task); + } while_each_subsys_mask(); put_css_set(cset); } -- cgit v1.2.3 From 996cd1fb7383cf087496e8a441bb10b9873b1eb6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:46 -0500 Subject: cgroup: use do_each_subsys_mask() where applicable There are several places in cgroup_subtree_control_write() which can use do_each_subsys_mask() instead of manual mask testing. Use it. No functional changes. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- kernel/cgroup.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5d102980dc27..1e561bd990b9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3082,10 +3082,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * dependency. An invisible css is made visible when the userland * explicitly enables it. */ - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - + do_each_subsys_mask(ss, ssid, enable) { cgroup_for_each_live_child(child, cgrp) { if (css_enable & (1 << ssid)) ret = create_css(child, ss, @@ -3096,7 +3093,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (ret) goto err_undo_css; } - } + } while_each_subsys_mask(); /* * At this point, cgroup_e_css() results reflect the new csses @@ -3115,10 +3112,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * state if it's made visible again later. Controllers which may * be depended upon should provide ->css_reset() for this purpose. 
*/ - for_each_subsys(ss, ssid) { - if (!(disable & (1 << ssid))) - continue; - + do_each_subsys_mask(ss, ssid, disable) { cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); @@ -3130,7 +3124,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ss->css_reset(css); } } - } + } while_each_subsys_mask(); kernfs_activate(cgrp->kn); ret = 0; @@ -3142,10 +3136,7 @@ err_undo_css: cgrp->subtree_control = old_sc; cgrp->subtree_ss_mask = old_ss; - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - + do_each_subsys_mask(ss, ssid, enable) { cgroup_for_each_live_child(child, cgrp) { struct cgroup_subsys_state *css = cgroup_css(child, ss); @@ -3157,7 +3148,7 @@ err_undo_css: else css_clear_dir(css, NULL); } - } + } while_each_subsys_mask(); goto out_unlock; } @@ -4973,14 +4964,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_destroy; /* let's create and online css's */ - for_each_subsys(ss, ssid) { - if (parent->subtree_ss_mask & (1 << ssid)) { - ret = create_css(cgrp, ss, - parent->subtree_control & (1 << ssid)); - if (ret) - goto out_destroy; - } - } + do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) { + ret = create_css(cgrp, ss, + parent->subtree_control & (1 << ssid)); + if (ret) + goto out_destroy; + } while_each_subsys_mask(); /* * On the default hierarchy, a child doesn't automatically inherit -- cgit v1.2.3 From 6e5c830770f9045a17b1b931c3e11fbd5591e630 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 22 Feb 2016 22:25:47 -0500 Subject: cgroup: make cgroup subsystem masks u16 After the recent do_each_subsys_mask() conversion, there's no reason to use ulong for subsystem masks. We'll be adding more subsystem masks to persistent data structures, let's reduce its size to u16 which should be enough for now and the foreseeable future. This doesn't create any noticeable behavior differences. v2: Johannes spotted that the initial patch missed cgroup_no_v1_mask. Converted. Signed-off-by: Tejun Heo Acked-by: Johannes Weiner --- kernel/cgroup.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1e561bd990b9..7669f68077b8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -181,10 +181,10 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); static bool cgrp_dfl_root_visible; /* Controllers blocked by the commandline in v1 */ -static unsigned long cgroup_no_v1_mask; +static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ -static unsigned long cgrp_dfl_root_inhibit_ss_mask; +static u16 cgrp_dfl_root_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -208,19 +208,18 @@ static u64 css_serial_nr_next = 1; * fork/exit handlers to call. This avoids us having to do extra work in the * fork/exit path to check which subsystems have fork/exit callbacks. */ -static unsigned long have_fork_callback __read_mostly; -static unsigned long have_exit_callback __read_mostly; -static unsigned long have_free_callback __read_mostly; +static u16 have_fork_callback __read_mostly; +static u16 have_exit_callback __read_mostly; +static u16 have_free_callback __read_mostly; /* Ditto for the can_fork callback. 
*/ -static unsigned long have_canfork_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, @@ -1274,11 +1273,10 @@ static umode_t cgroup_file_mode(const struct cftype *cft) * @subtree_control is to be applied to @cgrp. The returned mask is always * a superset of @subtree_control and follows the usual hierarchy rules. */ -static unsigned long cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, - unsigned long subtree_control) +static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); - unsigned long cur_ss_mask = subtree_control; + u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1288,7 +1286,7 @@ static unsigned long cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, return cur_ss_mask; while (true) { - unsigned long new_ss_mask = cur_ss_mask; + u16 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; @@ -1466,12 +1464,11 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - unsigned long tmp_ss_mask; + u16 tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); @@ -1507,7 +1504,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, */ if (dst_root == &cgrp_dfl_root) { if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", + pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); } @@ -1599,7 +1596,7 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; + u16 subsys_mask; unsigned int flags; char *release_agent; bool cpuset_clone_children; @@ -1612,13 +1609,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = -1UL; + u16 mask = U16_MAX; struct cgroup_subsys *ss; int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS - mask = ~(1U << cpuset_cgrp_id); + mask = ~((u16)1 << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1745,7 +1742,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + u16 added_mask, removed_mask; if (root == &cgrp_dfl_root) { pr_err("remount is not allowed\n"); @@ -1893,7 +1890,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -2839,7 
+2836,7 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; @@ -2950,8 +2947,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - unsigned long enable = 0, disable = 0; - unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; + u16 enable = 0, disable = 0; + u16 css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -5255,7 +5252,7 @@ int __init cgroup_init_early(void) return 0; } -static unsigned long cgroup_disable_mask __initdata; +static u16 cgroup_disable_mask __initdata; /** * cgroup_init - cgroup initialization @@ -5269,6 +5266,7 @@ int __init cgroup_init(void) unsigned long key; int ssid; + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); @@ -5754,7 +5752,7 @@ static int __init cgroup_no_v1(char *str) continue; if (!strcmp(token, "all")) { - cgroup_no_v1_mask = ~0UL; + cgroup_no_v1_mask = U16_MAX; break; } -- cgit v1.2.3 From a7165264429b7b0d95557f306f310e77407fc2ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Feb 2016 10:00:50 -0500 Subject: cgroup: s/cgrp_dfl_root_/cgrp_dfl_/ These var names are unnecessarily unwiedly and another similar variable will be added. Let's shorten them. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7669f68077b8..afbed523b22f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -178,13 +178,13 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_root_visible; +static bool cgrp_dfl_visible; /* Controllers blocked by the commandline in v1 */ static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ -static u16 cgrp_dfl_root_inhibit_ss_mask; +static u16 cgrp_dfl_inhibit_ss_mask; /* The list of hierarchy roots */ @@ -1486,7 +1486,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* skip creating root files on dfl_root for inhibited subsystems */ tmp_ss_mask = ss_mask; if (dst_root == &cgrp_dfl_root) - tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + tmp_ss_mask &= ~cgrp_dfl_inhibit_ss_mask; do_each_subsys_mask(ss, ssid, tmp_ss_mask) { struct cgroup *scgrp = &ss->root->cgrp; @@ -1503,7 +1503,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) * Just warn about it and continue. 
*/ if (dst_root == &cgrp_dfl_root) { - if (cgrp_dfl_root_visible) { + if (cgrp_dfl_visible) { pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", ret, ss_mask); pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); @@ -2006,7 +2006,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); return ERR_PTR(-EINVAL); } - cgrp_dfl_root_visible = true; + cgrp_dfl_visible = true; root = &cgrp_dfl_root; cgroup_get(&root->cgrp); goto out_mount; @@ -2858,7 +2858,7 @@ static int cgroup_root_controllers_show(struct seq_file *seq, void *v) struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & - ~cgrp_dfl_root_inhibit_ss_mask); + ~cgrp_dfl_inhibit_ss_mask); return 0; } @@ -2962,7 +2962,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; - do_each_subsys_mask(ss, ssid, ~cgrp_dfl_root_inhibit_ss_mask) { + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; @@ -5315,7 +5315,7 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; if (!ss->dfl_cftypes) - cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); @@ -5386,7 +5386,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) + if (root == &cgrp_dfl_root && !cgrp_dfl_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); -- cgit v1.2.3 From b38e42e962dbc2fbc3839ce70750881db7c9277e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Feb 2016 10:00:50 -0500 Subject: cgroup: convert cgroup_subsys flag fields to bool bitfields Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/cpuset.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/cpuacct.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 41989ab4db57..90899837ea78 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2089,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .bind = cpuset_bind, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44253adb3c36..0f5abc6e4ff3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8706,7 +8706,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_files, - .early_init = 1, + .early_init = true, }; #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd7cbb55bbf2..2ddaebf7469a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -279,5 +279,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; -- cgit v1.2.3 From f17fc25f2b4f4bd8edafe36af6d7379eb9db27a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Feb 2016 10:00:51 -0500 Subject: cgroup: make css_tryget_online_from_dir() also recognize cgroup2 fs The function currently returns -EBADF for a directory on the 
default hierarchy. Make it also recognize cgroup2_fs_type. This will be used for perf_event cgroup2 support. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index afbed523b22f..2b114368666c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5781,12 +5781,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) + if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || + !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); -- cgit v1.2.3 From 62716ea0f2ea4253984008fd4a96a532674ac58f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Feb 2016 10:00:51 -0500 Subject: cgroup: use ->subtree_control when testing no internal process rule No internal process rule is enforced by cgroup_migrate_prepare_dst() during process migration. It tests whether the target cgroup's ->child_subsys_mask is zero which is different from "subtree_control" write path which tests ->subtree_control. This hasn't mattered because up until now, both ->child_subsys_mask and ->subtree_control are zero or non-zero at the same time. However, with the planned addition of implicit controllers, this will no longer be true. This patch prepares for the change by making cgorup_migrate_prepare_dst() test ->subtree_control instead. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2b114368666c..ac5451e7c458 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2527,11 +2527,11 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); /* - * Except for the root, subtree_ss_mask must be zero for a cgroup + * Except for the root, subtree_control must be zero for a cgroup * with tasks so that child cgroups don't compete against tasks. */ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->subtree_ss_mask) + dst_cgrp->subtree_control) return -EBUSY; /* look up the dst cset for each src cset and link it to src */ -- cgit v1.2.3 From 63253ad814db726d43c04011c752d83b7aaca998 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Fri, 26 Feb 2016 13:07:38 +0800 Subject: cgroup: fix a mistake in warning message There is a mistake about the print format name:id <--> %d:%s, which the name is 'char *' type and id is 'int' type. Change "name:id" to "id:name" instead to be consistent with "cgroup_subsys %d:%s". 
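For illustration only (not taken from the patch): with a "%d:%s" conversion the integer argument prints first, so the label in front of the placeholders has to read "id:name" for the output to match.

  /* hypothetical subsystem with id 3 and name "cpu" */
  pr_warn("name:id=%d:%s\n", ss->id, ss->name);   /* prints "name:id=3:cpu" - label and output disagree */
  pr_warn("id:name=%d:%s\n",  ss->id, ss->name);   /* prints "id:name=3:cpu"  - consistent */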
Signed-off-by: Xiubo Li Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ac5451e7c458..fcfad82149b1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5235,7 +5235,7 @@ int __init cgroup_init_early(void) for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, - "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, -- cgit v1.2.3 From 39853cc0cdcf1b11f00f7f81e2f515a4d68ed209 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:37 -0600 Subject: bpf: Mark __bpf_prog_run() stack frame as non-standard objtool reports the following false positive warnings: kernel/bpf/core.o: warning: objtool: __bpf_prog_run()+0x5c: sibling call from callable instruction with changed frame pointer kernel/bpf/core.o: warning: objtool: __bpf_prog_run()+0x60: function has unreachable instruction kernel/bpf/core.o: warning: objtool: __bpf_prog_run()+0x64: function has unreachable instruction [...] It's confused by the following dynamic jump instruction in __bpf_prog_run():: jmp *(%r12,%rax,8) which corresponds to the following line in the C code: goto *jumptable[insn->code]; There's no way for objtool to deterministically find all possible branch targets for a dynamic jump, so it can't verify this code. In this case the jumps all stay within the function, and there's nothing unusual going on related to the stack, so we can whitelist the function. Signed-off-by: Josh Poimboeuf Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/b90e6bf3fdbfb5c4cc1b164b965502e53cf48935.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/bpf/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 972d9a8e4ac4..be0abf669ced 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -649,6 +650,7 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } +STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) -- cgit v1.2.3 From 8e05e96ac949c80704d0a38420bf60dcf18c938f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:38 -0600 Subject: sched: Mark __schedule() stack frame as non-standard objtool reports the following warnings for __schedule(): kernel/sched/core.o: warning: objtool:__schedule()+0x3c0: duplicate frame pointer save kernel/sched/core.o: warning: objtool:__schedule()+0x3fd: sibling call from callable instruction with changed frame pointer kernel/sched/core.o: warning: objtool:__schedule()+0x40a: call without frame pointer save/setup kernel/sched/core.o: warning: objtool:__schedule()+0x7fd: frame pointer state mismatch kernel/sched/core.o: warning: objtool:__schedule()+0x421: frame 
pointer state mismatch Basically it's confused by two unusual attributes of the switch_to() macro: 1. It saves prev's frame pointer to the old stack and restores next's frame pointer from the new stack. 2. For new tasks it jumps directly to ret_from_fork. Eventually it would probably be a good idea to clean up the ret_from_fork hack so that new tasks are created with a valid initial stack, as suggested by Andy: https://lkml.kernel.org/r/CALCETrWsqCw4L1qKO9j9L5F+4ED4viuLQTFc=n1pKBZfFPQUFg@mail.gmail.com Then __schedule() could return normally into the new code and objtool hopefully wouldn't have a problem anymore. In the meantime, mark its stack frame as non-standard so we can have a baseline with no objtool warnings. The marker also serves as a reminder that this code could be improved a bit. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/91190e324ebd7fcd01748d508d0dfd4693e84d91.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d590e5ef..641043dfc773 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include @@ -3288,6 +3289,7 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } +STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ static inline void sched_submit_work(struct task_struct *tsk) { -- cgit v1.2.3 From 049369487e2068294b61cee19233be0ffac7d243 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 28 Feb 2016 22:22:39 -0600 Subject: sched: Always inline context_switch() When CONFIG_GCOV is enabled, gcc decides to put context_switch() out-of-line, which is inconsistent with its normal behavior. It also causes an objtool warning because __schedule() no longer inlines context_switch(), so the "STACK_FRAME_NON_STANDARD(__schedule)" statement loses its effect. Signed-off-by: Josh Poimboeuf Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Chris J Arges Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/d62aee926b6e303394e34a06999a964dc2773cf6.1456719558.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 641043dfc773..bb0daabe0ffe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2763,7 +2763,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. 
*/ -static inline struct rq * +static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { -- cgit v1.2.3 From ede5147d515694e012cd958ec874b9daf8a65fec Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 Feb 2016 14:37:53 +0000 Subject: Handle ISO 8601 leap seconds and encodings of midnight in mktime64() Handle the following ISO 8601 features in mktime64(): (1) Leap seconds. Leap seconds are indicated by the seconds parameter being the value 60. Handle this by treating it the same as 00 of the following minute. It has been pointed out that a minute may contain two leap seconds. However, pending discussion of what that looks like and how to handle it, I'm not going to concern myself with it. (2) Alternate encodings of midnight. Two different encodings of midnight are permitted - 00:00:00 and 24:00:00 - the first is midnight today and the second is midnight tomorrow and is exactly equivalent to the first with tomorrow's date. As it happens, we don't actually need to change mktime64() to handle either of these - just comment them as valid parameters. These facility will be used by the X.509 parser. Doing it in mktime64() makes the policy common to the whole kernel and easier to find. Signed-off-by: David Howells Acked-by: Arnd Bergmann cc: John Stultz cc: Rudolf Polzer cc: One Thousand Gnomes --- kernel/time/time.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 86751c68e08d..be115b020d27 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -322,6 +322,13 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). + * + * A leap second can be indicated by calling this function with sec as + * 60 (allowable under ISO 8601). The leap second is treated the same + * as the following second since they don't exist in UNIX time. + * + * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight + * tomorrow - (allowable under ISO 8601) is supported. */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, @@ -338,7 +345,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 - )*24 + hour /* now have hours */ + )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } -- cgit v1.2.3 From fa5ff8a1c43fc7b78353059899edf3cbedf54e9f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 28 Feb 2016 08:59:33 -0500 Subject: cgroup: fix and restructure error handling in copy_cgroup_ns() copy_cgroup_ns()'s error handling was broken and the attempt to fix it d22025570e2e ("cgroup: fix alloc_cgroup_ns() error handling in copy_cgroup_ns()") was broken too in that it ended up trying an ERR_PTR() value. There's only one place where copy_cgroup_ns() needs to perform cleanup after failure. Simplify and fix the error handling by removing the goto's. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter Acked-by: Serge E. 
Hallyn --- kernel/cgroup.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d92d91a4bb3e..2c88149da848 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6058,9 +6058,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { - struct cgroup_namespace *new_ns = NULL; - struct css_set *cset = NULL; - int err; + struct cgroup_namespace *new_ns; + struct css_set *cset; BUG_ON(!old_ns); @@ -6070,9 +6069,8 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, } /* Allow only sysadmin to create cgroup namespace. */ - err = -EPERM; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) - goto err_out; + return ERR_PTR(-EPERM); mutex_lock(&cgroup_mutex); spin_lock_bh(&css_set_lock); @@ -6085,20 +6083,14 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { - err = PTR_ERR(new_ns); - goto err_out; + put_css_set(cset); + return new_ns; } new_ns->user_ns = get_user_ns(user_ns); new_ns->root_cset = cset; return new_ns; - -err_out: - if (cset) - put_css_set(cset); - kfree(new_ns); - return ERR_PTR(err); } static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -- cgit v1.2.3 From 89053aa9c711bd455a68844b4ed37c8b72ef1daa Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 1 Mar 2016 10:36:07 +0000 Subject: MODSIGN: linux/string.h should be #included to get memcpy() linux/string.h should be #included in module_signing.c to get memcpy(), lest the following occur: kernel/module_signing.c: In function 'mod_verify_sig': kernel/module_signing.c:57:2: error: implicit declaration of function 'memcpy' [-Werror=implicit-function-declaration] memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); ^ Reported-by: kbuild test robot Signed-off-by: David Howells --- kernel/module_signing.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6528a79d998d..9cfa46d8d14f 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include "module-internal.h" -- cgit v1.2.3 From fa06235b8eb0ae87a962e023243dba1eb4e7160d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 1 Mar 2016 19:56:30 +0300 Subject: cgroup: reset css on destruction An associated css can be around for quite a while after a cgroup directory has been removed. In general, it makes sense to reset it to defaults so as not to worry about any remnants. For instance, memory cgroup needs to reset memory.low, otherwise pages charged to a dead cgroup might never get reclaimed. There's ->css_reset callback, which would fit perfectly for the purpose. Currently, it's only called when a subsystem is disabled in the unified hierarchy and there are other subsystems dependant on it. Let's call it on css destruction as well. 
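As a rough sketch of what a ->css_reset() implementation looks like for a controller with a single tunable (the controller, structure and field names below are hypothetical and not taken from this patch; the memory controller's real handling lives in mm/memcontrol.c):

  struct foo_cgroup {
          struct cgroup_subsys_state css;
          unsigned long low;                      /* user-configurable protection */
  };

  static inline struct foo_cgroup *foo_css(struct cgroup_subsys_state *css)
  {
          return container_of(css, struct foo_cgroup, css);
  }

  static void foo_css_reset(struct cgroup_subsys_state *css)
  {
          /* restore the vanilla state; with this patch the callback also
           * runs from offline_css(), so a dying css drops its settings */
          foo_css(css)->low = 0;
  }

  struct cgroup_subsys foo_cgrp_subsys = {
          /* css_alloc/css_free and the other callbacks omitted for brevity */
          .css_reset      = foo_css_reset,
  };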
Suggested-by: Johannes Weiner Signed-off-by: Vladimir Davydov Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fcfad82149b1..46529502e9d5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4787,6 +4787,9 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; + if (ss->css_reset) + ss->css_reset(css); + if (ss->css_offline) ss->css_offline(css); -- cgit v1.2.3 From 9b7f6597f013d449d6700d11820faf91ee0ec985 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 2 Mar 2016 12:53:31 +0100 Subject: sched/core: Get rid of 'cpu' argument in wq_worker_sleeping() Given that wq_worker_sleeping() could only be called for a CPU it is running on, we do not need passing a CPU ID as an argument. Suggested-by: Oleg Nesterov Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Alexander Gordeev Signed-off-by: Tejun Heo --- kernel/sched/core.c | 2 +- kernel/workqueue.c | 5 ++--- kernel/workqueue_internal.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 63d3a24e081a..81ff7f2ad37a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3257,7 +3257,7 @@ static void __sched notrace __schedule(bool preempt) if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; - to_wakeup = wq_worker_sleeping(prev, cpu); + to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) try_to_wake_up_local(to_wakeup); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3a1c99b0c1b3..16f4986205e9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -858,7 +858,6 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number * * This function is called during schedule() when a busy worker is * going to sleep. Worker on the same cpu can be woken up by @@ -870,7 +869,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) * Return: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -886,7 +885,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) pool = worker->pool; /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) + if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) return NULL; /* diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 45215870ac6c..8635417c587b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -69,6 +69,6 @@ static inline struct worker *current_wq_worker(void) * sched/core.c and workqueue.c. 
*/ void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit v1.2.3 From 2378d8b8ba3d2adffb4f98c0c60ee2f448b3be69 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:57 -0500 Subject: cgroup: re-hash init_css_set after subsystems are initialized css_sets are hashed by their subsys[] contents and in cgroup_init() init_css_set is hashed early, before subsystem inits, when all entries in its subsys[] are NULL, so that cgroup_dfl_root initialization can find and link to it. As subsystems are initialized, init_css_set.subsys[] is filled up but the hashing is never updated making init_css_set hashed in the wrong place. While incorrect, this doesn't cause a critical failure as css_set management code would create an identical css_set dynamically. Fix it by rehashing init_css_set after subsystems are initialized. While at it, drop unnecessary @key local variable. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 46529502e9d5..e97772b42dfb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5266,7 +5266,6 @@ static u16 cgroup_disable_mask __initdata; int __init cgroup_init(void) { struct cgroup_subsys *ss; - unsigned long key; int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); @@ -5276,9 +5275,12 @@ int __init cgroup_init(void) mutex_lock(&cgroup_mutex); - /* Add init_css_set to the hash table */ - key = css_set_hash(init_css_set.subsys); - hash_add(css_set_table, &init_css_set.hlist, key); + /* + * Add init_css_set to the hash table so that dfl_root can link to + * it during init. + */ + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); @@ -5331,6 +5333,11 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } + /* init_css_set.subsys[] has been updated, re-hash */ + hash_del(&init_css_set.hlist); + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); -- cgit v1.2.3 From 20b454a61fba59be13de52b4493898583ea26d20 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:57 -0500 Subject: cgroup: suppress spurious de-populated events During task migration, tasks may transfer between two css_sets which are associated with the same cgroup. If those tasks are the only tasks in the cgroup, this currently triggers a spurious de-populated event on the cgroup. Fix it by bumping up populated count before bumping it down during migration to ensure that it doesn't reach zero spuriously. 
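The ordering trick can be seen in isolation with a toy model (a standalone sketch, not kernel code; all names are made up): when the source and destination buckets feed the same populated counter, bumping the destination before dropping the source keeps the counter from ever touching zero.

  #include <stdbool.h>
  #include <stdio.h>

  struct owner  { int populated_cnt; };              /* think: the cgroup */
  struct bucket { struct owner *owner; int users; }; /* think: a css_set  */

  static void update_populated(struct owner *o, bool populated)
  {
          o->populated_cnt += populated ? 1 : -1;
          if (!o->populated_cnt)
                  printf("de-populated event\n");    /* must not fire spuriously */
  }

  static void move_user(struct bucket *from, struct bucket *to)
  {
          /* bump the destination first ... */
          if (to && !to->users++)
                  update_populated(to->owner, true);

          /* ... then drop the source, so a shared owner never dips to zero */
          if (from && !--from->users)
                  update_populated(from->owner, false);
  }

  int main(void)
  {
          struct owner cgrp = { .populated_cnt = 1 };
          struct bucket from = { &cgrp, 1 }, to = { &cgrp, 0 };

          move_user(&from, &to);  /* prints nothing: no spurious event */
          return 0;
  }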
Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e97772b42dfb..5d452e7fcb4f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -678,6 +678,9 @@ static void css_set_move_task(struct task_struct *task, { lockdep_assert_held(&css_set_lock); + if (to_cset && !css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + if (from_cset) { struct css_task_iter *it, *pos; @@ -711,8 +714,6 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - if (!css_set_populated(to_cset)) - css_set_update_populated(to_cset, true); rcu_assign_pointer(task->cgroups, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); -- cgit v1.2.3 From 6cd0f5bbaf594f40a97d01dbc565dda41f30d37c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: separate out interface file creation from css creation Currently, interface files are created when a css is created depending on whether @visible is set. This patch separates out the two into separate steps to help code refactoring and eventually allow cgroups which aren't visible through cgroup fs. Move css_populate_dir() out of create_css() and drop @visible. While at it, rename the function to css_create() for consistency. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 72 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5d452e7fcb4f..4178e45becb4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -222,8 +222,8 @@ static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible); +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, @@ -3082,14 +3082,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ do_each_subsys_mask(ss, ssid, enable) { cgroup_for_each_live_child(child, cgrp) { - if (css_enable & (1 << ssid)) - ret = create_css(child, ss, - cgrp->subtree_control & (1 << ssid)); - else + if (css_enable & (1 << ssid)) { + struct cgroup_subsys_state *css; + + css = css_create(child, ss); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto err_undo_css; + } + + if (cgrp->subtree_control & (1 << ssid)) { + ret = css_populate_dir(css, NULL); + if (ret) + goto err_undo_css; + } + } else { ret = css_populate_dir(cgroup_css(child, ss), NULL); - if (ret) - goto err_undo_css; + if (ret) + goto err_undo_css; + } } } while_each_subsys_mask(); @@ -4717,7 +4729,9 @@ static void css_release_work_fn(struct work_struct *work) * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. 
*/ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + if (cgrp->kn) + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, + NULL); } mutex_unlock(&cgroup_mutex); @@ -4801,17 +4815,16 @@ static void offline_css(struct cgroup_subsys_state *css) } /** - * create_css - create a cgroup_subsys_state + * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css - * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created if - * @visible. Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp. This function doesn't create the + * interface files. Returns 0 on success, -errno on failure. */ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible) +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4822,7 +4835,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, css = ss->css_alloc(parent_css); if (IS_ERR(css)) - return PTR_ERR(css); + return css; init_and_link_css(css, ss, cgrp); @@ -4835,12 +4848,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, goto err_free_percpu_ref; css->id = err; - if (visible) { - err = css_populate_dir(css, NULL); - if (err) - goto err_free_id; - } - /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -4858,18 +4865,16 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, ss->warned_broken_hierarchy = true; } - return 0; + return css; err_list_del: list_del_rcu(&css->sibling); - css_clear_dir(css, NULL); -err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); - return err; + return ERR_PTR(err); } static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, @@ -4966,10 +4971,19 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, /* let's create and online css's */ do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) { - ret = create_css(cgrp, ss, - parent->subtree_control & (1 << ssid)); - if (ret) + struct cgroup_subsys_state *css; + + css = css_create(cgrp, ss); + if (IS_ERR(css)) { + ret = PTR_ERR(css); goto out_destroy; + } + + if (parent->subtree_control & (1 << ssid)) { + ret = css_populate_dir(css, NULL); + if (ret) + goto out_destroy; + } } while_each_subsys_mask(); /* -- cgit v1.2.3 From 88cb04b96a1934ecbfd1d324e7cde55890c1a576 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: explicitly track whether a cgroup_subsys_state is visible to userland Currently, whether a css (cgroup_subsys_state) has its interface files created is not tracked and assumed to change together with the owning cgroup's lifecycle. cgroup directory and interface creation is being separated out from internal object creation to help refactoring and eventually allow cgroups which are not visible through cgroupfs. 
This patch adds CSS_VISIBLE to track whether a css has its interface files created and perform management operations only when necessary which helps decoupling interface file handling from internal object lifecycle. After this patch, all css interface file management functions can be called regardless of the current state and will achieve the expected result. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4178e45becb4..a9a53ca942f3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1421,6 +1421,11 @@ static void css_clear_dir(struct cgroup_subsys_state *css, struct cgroup *cgrp = cgrp_override ?: css->cgroup; struct cftype *cfts; + if (!(css->flags & CSS_VISIBLE)) + return; + + css->flags &= ~CSS_VISIBLE; + list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } @@ -1439,6 +1444,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, struct cftype *cfts, *failed_cfts; int ret; + if (css->flags & CSS_VISIBLE) + return 0; + if (!css->ss) { if (cgroup_on_dfl(cgrp)) cfts = cgroup_dfl_base_files; @@ -1455,6 +1463,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, goto err; } } + + css->flags |= CSS_VISIBLE; + return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { @@ -3403,7 +3414,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; - if (cgroup_is_dead(cgrp)) + if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); -- cgit v1.2.3 From 195e9b6c4b09434dad6ec3c163fdf037e16b3c96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: reorder operations in cgroup_mkdir() Currently, operations to initialize internal objects and create interface directory and files are intermixed in cgroup_mkdir(). We're in the process of refactoring cgroup and css management paths to separate them out to eventually allow cgroups which aren't visible through cgroup fs. This patch reorders operations inside cgroup_mkdir() so that interface directory and file handling comes after internal object initialization. This will enable further refactoring. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 61 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a9a53ca942f3..a6d484a667aa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4945,20 +4945,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_free_id; - } - cgrp->kn = kn; - - /* - * This extra ref will be put in cgroup_free_fn() and guarantees - * that @cgrp->kn is always accessible. 
- */ - kernfs_get(kn); - cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ @@ -4972,15 +4958,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - ret = cgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - ret = css_populate_dir(&cgrp->self, NULL); - if (ret) - goto out_destroy; - - /* let's create and online css's */ + /* create the csses */ do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) { struct cgroup_subsys_state *css; @@ -4989,12 +4967,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, ret = PTR_ERR(css); goto out_destroy; } - - if (parent->subtree_control & (1 << ssid)) { - ret = css_populate_dir(css, NULL); - if (ret) - goto out_destroy; - } } while_each_subsys_mask(); /* @@ -5006,13 +4978,40 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgroup_refresh_subtree_ss_mask(cgrp); } + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_destroy; + } + cgrp->kn = kn; + + /* + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. + */ + kernfs_get(kn); + + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = css_populate_dir(&cgrp->self, NULL); + if (ret) + goto out_destroy; + + do_each_subsys_mask(ss, ssid, parent->subtree_control) { + ret = css_populate_dir(cgroup_css(cgrp, ss), NULL); + if (ret) + goto out_destroy; + } while_each_subsys_mask(); + + /* let's create and online css's */ kernfs_activate(kn); ret = 0; goto out_unlock; -out_free_id: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: -- cgit v1.2.3 From a5bca2152036de826595723437c5cbe8f6c13983 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: factor out cgroup_create() out of cgroup_mkdir() We're in the process of refactoring cgroup and css management paths to separate them out to eventually allow cgroups which aren't visible through cgroup fs. This patch factors out cgroup_create() out of cgroup_mkdir(). cgroup_create() contains all internal object creation and initialization. cgroup_mkdir() uses cgroup_create() to create the internal cgroup and adds interface directory and file creation. This patch doesn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 72 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a6d484a667aa..e1b3d0fead05 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4888,33 +4888,19 @@ err_free_css: return ERR_PTR(err); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static struct cgroup *cgroup_create(struct cgroup *parent) { - struct cgroup *parent, *cgrp, *tcgrp; - struct cgroup_root *root; + struct cgroup_root *root = parent->root; struct cgroup_subsys *ss; - struct kernfs_node *kn; - int level, ssid, ret; - - /* Do not accept '\n' to prevent making /proc//cgroup unparsable. 
- */ - if (strchr(name, '\n')) - return -EINVAL; - - parent = cgroup_kn_lock_live(parent_kn); - if (!parent) - return -ENODEV; - root = parent->root; - level = parent->level + 1; + struct cgroup *cgrp, *tcgrp; + int level = parent->level + 1; + int ssid, ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp) + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); - if (!cgrp) { - ret = -ENOMEM; - goto out_unlock; - } + if (!cgrp) + return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -4978,6 +4964,40 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgroup_refresh_subtree_ss_mask(cgrp); } + return cgrp; + +out_cancel_ref: + percpu_ref_exit(&cgrp->self.refcnt); +out_free_cgrp: + kfree(cgrp); + return ERR_PTR(ret); +out_destroy: + cgroup_destroy_locked(cgrp); + return ERR_PTR(ret); +} + +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct cgroup *parent, *cgrp; + struct cgroup_subsys *ss; + struct kernfs_node *kn; + int ssid, ret; + + /* do not accept '\n' to prevent making /proc//cgroup unparsable */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = cgroup_kn_lock_live(parent_kn); + if (!parent) + return -ENODEV; + + cgrp = cgroup_create(parent); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { @@ -5012,17 +5032,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, ret = 0; goto out_unlock; -out_cancel_ref: - percpu_ref_exit(&cgrp->self.refcnt); -out_free_cgrp: - kfree(cgrp); +out_destroy: + cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; - -out_destroy: - cgroup_destroy_locked(cgrp); - goto out_unlock; } /* -- cgit v1.2.3 From 5531dc915ba10b78a80dcffa48d7360d3c0bab61 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:58 -0500 Subject: cgroup: introduce cgroup_control() and cgroup_ss_mask() When a controller is enabled and visible on a non-root cgroup is determined by subtree_control and subtree_ss_mask of the parent cgroup. For a root cgroup, by the type of the hierarchy and which controllers are attached to it. Deciding the above on each usage is fragile and unnecessarily complicates the users. This patch introduces cgroup_control() and cgroup_ss_mask() which calculate and return the [visibly] enabled subsyste mask for the specified cgroup and conver the existing usages. * cgroup_e_css() is restructured for simplicity. * cgroup_calc_subtree_ss_mask() and cgroup_subtree_control_write() no longer need to distinguish root and non-root cases. * With cgroup_control(), cgroup_controllers_show() can now handle both root and non-root cases. cgroup_root_controllers_show() is removed. v2: cgroup_control() updated to yield the correct result on v1 hierarchies too. cgroup_subtree_control_write() converted. 
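A caller-side sketch of what the helpers buy (the wrapper names here are illustrative only; the real call sites are converted directly in the diff below): testing a subsystem bit no longer needs to distinguish root from non-root cgroups.

  /* visibly enabled on @cgrp, i.e. interface files are expected? */
  static bool ssid_visible(struct cgroup *cgrp, int ssid)
  {
          return cgroup_control(cgrp) & (1 << ssid);
  }

  /* enabled on @cgrp, possibly only implicitly through dependency? */
  static bool ssid_enabled(struct cgroup *cgrp, int ssid)
  {
          return cgroup_ss_mask(cgrp) & (1 << ssid);
  }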
Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 72 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e1b3d0fead05..2cb4b5419852 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -346,6 +346,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } +/* subsystems visibly enabled on a cgroup */ +static u16 cgroup_control(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + u16 root_ss_mask = cgrp->root->subsys_mask; + + if (parent) + return parent->subtree_control; + + if (cgroup_on_dfl(cgrp)) + root_ss_mask &= ~cgrp_dfl_inhibit_ss_mask; + + return root_ss_mask; +} + +/* subsystems enabled on a cgroup */ +static u16 cgroup_ss_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + if (parent) + return parent->subtree_ss_mask; + + return cgrp->root->subsys_mask; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -385,16 +411,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!ss) return &cgrp->self; - if (!(cgrp->root->subsys_mask & (1 << ss->id))) - return NULL; - /* * This function is used while updating css associations and thus - * can't test the csses directly. Use ->subtree_ss_mask. + * can't test the csses directly. Test ss_mask. */ - while (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->subtree_ss_mask & (1 << ss->id))) + while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); + if (!cgrp) + return NULL; + } return cgroup_css(cgrp, ss); } @@ -1276,7 +1301,6 @@ static umode_t cgroup_file_mode(const struct cftype *cft) */ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) { - struct cgroup *parent = cgroup_parent(cgrp); u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; @@ -1298,10 +1322,7 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ - if (parent) - new_ss_mask &= parent->subtree_ss_mask; - else - new_ss_mask &= cgrp->root->subsys_mask; + new_ss_mask &= cgroup_ss_mask(cgrp); if (new_ss_mask == cur_ss_mask) break; @@ -2864,22 +2885,12 @@ static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) seq_putc(seq, '\n'); } -/* show controllers which are currently attached to the default hierarchy */ -static int cgroup_root_controllers_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & - ~cgrp_dfl_inhibit_ss_mask); - return 0; -} - /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); + cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } @@ -3005,10 +3016,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, continue; } - /* unavailable or not enabled on the parent? 
*/ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } @@ -4566,12 +4574,6 @@ static struct cftype cgroup_dfl_base_files[] = { }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_root_controllers_show, - }, - { - .name = "cgroup.controllers", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { @@ -4945,7 +4947,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); /* create the csses */ - do_each_subsys_mask(ss, ssid, parent->subtree_ss_mask) { + do_each_subsys_mask(ss, ssid, cgroup_ss_mask(cgrp)) { struct cgroup_subsys_state *css; css = css_create(cgrp, ss); @@ -4960,7 +4962,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) * subtree_control from the parent. Each is configured manually. */ if (!cgroup_on_dfl(cgrp)) { - cgrp->subtree_control = parent->subtree_control; + cgrp->subtree_control = cgroup_control(cgrp); cgroup_refresh_subtree_ss_mask(cgrp); } @@ -5020,7 +5022,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - do_each_subsys_mask(ss, ssid, parent->subtree_control) { + do_each_subsys_mask(ss, ssid, cgroup_control(cgrp)) { ret = css_populate_dir(cgroup_css(cgrp, ss), NULL); if (ret) goto out_destroy; -- cgit v1.2.3 From 1b9b96a12b5433ccc477265111122720ccb4965e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: factor out cgroup_drain_offline() from cgroup_subtree_control_write() Factor out async css offline draining into cgroup_drain_offline(). * Nest subsystem walk inside child walk. The child walk will later be converted to subtree walk which is a bit more expensive. * Relocate the draining above subsystem mask preparation, which doesn't create any behavior differences but helps further refactoring. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 77 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cb4b5419852..d295e6a91cdc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2965,6 +2965,53 @@ out_finish: return ret; } +/** + * cgroup_drain_offline - wait for previously offlined csses to go away + * @cgrp: parent of the target cgroups + * + * Because css offlining is asynchronous, userland may try to re-enable a + * controller while the previous css is still around. This function drains + * the previous css instances of @cgrp's children. + * + * Must be called with cgroup_mutex held. Returns %false if there were no + * dying css instances. Returns %true if there were one or more and this + * function waited. On %true return, cgroup_mutex has been dropped and + * re-acquired inbetween which anything could have happened. The caller + * typically would have to start over. 
+ */ +static bool cgroup_drain_offline(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys *ss; + int ssid; + + lockdep_assert_held(&cgroup_mutex); + + cgroup_for_each_live_child(dsct, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + DEFINE_WAIT(wait); + + if (!css) + continue; + + cgroup_get(dsct); + prepare_to_wait(&dsct->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&dsct->offline_waitq, &wait); + mutex_lock(&cgroup_mutex); + + cgroup_put(dsct); + return true; + } + } + + return false; +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -3050,6 +3097,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } + if (cgroup_drain_offline(cgrp)) { + cgroup_kn_unlock(of->kn); + return restart_syscall(); + } + /* * Update subsys masks and calculate what needs to be done. More * subsystems than specified may need to be enabled or disabled @@ -3065,31 +3117,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, enable |= css_enable; disable |= css_disable; - /* - * Because css offlining is asynchronous, userland might try to - * re-enable the same controller while the previous instance is - * still around. In such cases, wait till it's gone using - * offline_waitq. - */ - do_each_subsys_mask(ss, ssid, css_enable) { - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } - } while_each_subsys_mask(); - cgrp->subtree_control = new_sc; cgrp->subtree_ss_mask = new_ss; -- cgit v1.2.3 From 12b3bb6af862477f96e1adac51b201a143a8f3c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: factor out cgroup_apply_control_disable() from cgroup_subtree_control_write() Factor out css disabling and hiding into cgroup_apply_control_disable(). * Nest subsystem walk inside child walk. The child walk will later be converted to subtree walk which is a bit more expensive. * Instead of operating on the differential masks @css_enable and @css_disable, simply disable or hide csses according to the current cgroup_control() and cgroup_ss_mask(). This leads to the same result and is simpler and more robust. * This allows error handling path to share the same code. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 74 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d295e6a91cdc..97cb1315bcac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3012,6 +3012,43 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) return false; } +/** + * cgroup_apply_control_disable - kill or hide csses according to control + * @cgrp: parent of the target cgroups + * + * Walk @cgrp's children and kill and hide csses so that they match + * cgroup_ss_mask() and cgroup_visible_mask(). + * + * A css is hidden when the userland requests it to be disabled while other + * subsystems are still depending on it. 
The css must not actively control + * resources and be in the vanilla state if it's made visible again later. + * Controllers which may be depended upon should provide ->css_reset() for + * this purpose. + */ +static void cgroup_apply_control_disable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys *ss; + int ssid; + + cgroup_for_each_live_child(dsct, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + if (!css) + continue; + + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) { + kill_css(css); + } else if (!(cgroup_control(dsct) & (1 << ss->id))) { + css_clear_dir(css, NULL); + if (ss->css_reset) + ss->css_reset(css); + } + } + } +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -3160,27 +3197,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, if (ret) goto err_undo_css; - /* - * All tasks are migrated out of disabled csses. Kill or hide - * them. A css is hidden when the userland requests it to be - * disabled while other subsystems are still depending on it. The - * css must not actively control resources and be in the vanilla - * state if it's made visible again later. Controllers which may - * be depended upon should provide ->css_reset() for this purpose. - */ - do_each_subsys_mask(ss, ssid, disable) { - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (css_disable & (1 << ssid)) { - kill_css(css); - } else { - css_clear_dir(css, NULL); - if (ss->css_reset) - ss->css_reset(css); - } - } - } while_each_subsys_mask(); + /* all tasks are migrated out of disabled csses, commit disable */ + cgroup_apply_control_disable(cgrp); kernfs_activate(cgrp->kn); ret = 0; @@ -3189,22 +3207,12 @@ out_unlock: return ret ?: nbytes; err_undo_css: + /* restore masks and shoot down new csses */ cgrp->subtree_control = old_sc; cgrp->subtree_ss_mask = old_ss; - do_each_subsys_mask(ss, ssid, enable) { - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (!css) - continue; + cgroup_apply_control_disable(cgrp); - if (css_enable & (1 << ssid)) - kill_css(css); - else - css_clear_dir(css, NULL); - } - } while_each_subsys_mask(); goto out_unlock; } -- cgit v1.2.3 From bdb53bd797dcef46d1a252b9529f8fd511bf714c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: factor out cgroup_apply_control_enable() from cgroup_subtree_control_write() Factor out css enabling and showing into cgroup_apply_control_enable(). * Nest subsystem walk inside child walk. The child walk will later be converted to subtree walk which is a bit more expensive. * Instead of operating on the differential masks @css_enable, simply enable or show csses according to the current cgroup_control() and cgroup_ss_mask(). This leads to the same result and is simpler and more robust. 
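Both helpers key off the same two masks: cgroup_control() carries the controllers explicitly enabled for a cgroup, while cgroup_ss_mask() additionally carries controllers dragged in implicitly through ->depends_on. A subsystem present only in the latter gets a css but no interface files. The snippet below is a self-contained userspace illustration of that classification, using made-up mask values and subsystem names; it is not kernel code.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const char *name[] = { "cpu", "memory", "io", "pids" };
        uint16_t control = 0x3;  /* hypothetical: cpu and memory explicitly enabled */
        uint16_t ss_mask = 0x7;  /* hypothetical: io pulled in via a ->depends_on dependency */

        for (int id = 0; id < 4; id++) {
                if (!(ss_mask & (1 << id)))
                        printf("%-6s: no css at all\n", name[id]);
                else if (!(control & (1 << id)))
                        printf("%-6s: css exists but its files stay hidden\n", name[id]);
                else
                        printf("%-6s: css exists and its files are visible\n", name[id]);
        }
        return 0;
}

With the masks above, io comes out as present-but-hidden, which is the state cgroup_apply_control_disable() preserves via css_clear_dir() rather than kill_css().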
Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 77 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 97cb1315bcac..1193038d0729 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3012,6 +3012,49 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) return false; } +/** + * cgroup_apply_control_enable - enable or show csses according to control + * @cgrp: parent of the target cgroups + * + * Walk @cgrp's children and create new csses or make the existing ones + * visible. A css is created invisible if it's being implicitly enabled + * through dependency. An invisible css is made visible when the userland + * explicitly enables it. + * + * Returns 0 on success, -errno on failure. On failure, csses which have + * been processed already aren't cleaned up. The caller is responsible for + * cleaning up with cgroup_apply_control_disble(). + */ +static int cgroup_apply_control_enable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys *ss; + int ssid, ret; + + cgroup_for_each_live_child(dsct, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) + continue; + + if (!css) { + css = css_create(dsct, ss); + if (IS_ERR(css)) + return PTR_ERR(css); + } + + if (cgroup_control(dsct) & (1 << ss->id)) { + ret = css_populate_dir(css, NULL); + if (ret) + return ret; + } + } + } + + return 0; +} + /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: parent of the target cgroups @@ -3157,36 +3200,10 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgrp->subtree_control = new_sc; cgrp->subtree_ss_mask = new_ss; - /* - * Create new csses or make the existing ones visible. A css is - * created invisible if it's being implicitly enabled through - * dependency. An invisible css is made visible when the userland - * explicitly enables it. - */ - do_each_subsys_mask(ss, ssid, enable) { - cgroup_for_each_live_child(child, cgrp) { - if (css_enable & (1 << ssid)) { - struct cgroup_subsys_state *css; - - css = css_create(child, ss); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto err_undo_css; - } - - if (cgrp->subtree_control & (1 << ssid)) { - ret = css_populate_dir(css, NULL); - if (ret) - goto err_undo_css; - } - } else { - ret = css_populate_dir(cgroup_css(child, ss), - NULL); - if (ret) - goto err_undo_css; - } - } - } while_each_subsys_mask(); + /* prepare csses */ + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto err_undo_css; /* * At this point, cgroup_e_css() results reflect the new csses -- cgit v1.2.3 From ce3f1d9d19371045981a64815227bab822554878 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: make cgroup_drain_offline() and cgroup_apply_control_{disable|enable}() recursive The three factored out css management operations - cgroup_drain_offline() and cgroup_apply_control_{disable|enable}() - only depend on the current state of the target cgroups and idempotent and thus can be easily made to operate on the subtree instead of the immediate children. This patch introduces the iterators which walk live subtree and converts the three functions to operate on the subtree including self instead of the children. 
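A minimal usage sketch of the two walkers (bodies elided, not literal kernel code): the pre-order variant visits a cgroup before its descendants, the post-order variant visits descendants first, and both skip dead cgroups while including @cgrp itself in the walk.

/* usage sketch only: bodies elided */
struct cgroup *dsct;
struct cgroup_subsys_state *d_css;

cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
        /* top-down walk over the live subtree, @cgrp included:
         * used by cgroup_apply_control_enable() to create and show csses */
}

cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
        /* bottom-up walk over the same live subtree: used by
         * cgroup_drain_offline() and cgroup_apply_control_disable() */
}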
While this leads to spurious walking and be slightly more expensive, it will allow them to be used for wider scope of operations. Note that cgroup_drain_offline() now tests for whether a css is dying before trying to drain it. This is to avoid trying to drain live csses as there can be mix of live and dying csses in a subtree unlike children of the same parent. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1193038d0729..0398f2a6673b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -573,6 +573,24 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else +/* walk live descendants in preorder */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + static void cgroup_release_agent(struct work_struct *work); static void check_for_release(struct cgroup *cgrp); @@ -2967,11 +2985,11 @@ out_finish: /** * cgroup_drain_offline - wait for previously offlined csses to go away - * @cgrp: parent of the target cgroups + * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function drains - * the previous css instances of @cgrp's children. + * the previous css instances of @cgrp's subtree. * * Must be called with cgroup_mutex held. Returns %false if there were no * dying css instances. Returns %true if there were one or more and this @@ -2982,17 +3000,18 @@ out_finish: static bool cgroup_drain_offline(struct cgroup *cgrp) { struct cgroup *dsct; + struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - cgroup_for_each_live_child(dsct, cgrp) { + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); - if (!css) + if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get(dsct); @@ -3014,9 +3033,9 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) /** * cgroup_apply_control_enable - enable or show csses according to control - * @cgrp: parent of the target cgroups + * @cgrp: root of the target subtree * - * Walk @cgrp's children and create new csses or make the existing ones + * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. 
@@ -3028,10 +3047,11 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; + struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; - cgroup_for_each_live_child(dsct, cgrp) { + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); @@ -3057,9 +3077,9 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) /** * cgroup_apply_control_disable - kill or hide csses according to control - * @cgrp: parent of the target cgroups + * @cgrp: root of the target subtree * - * Walk @cgrp's children and kill and hide csses so that they match + * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other @@ -3071,10 +3091,11 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; + struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; - cgroup_for_each_live_child(dsct, cgrp) { + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); -- cgit v1.2.3 From 15a27c362d54378f17ec078579b2f6af88495a3f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:57:59 -0500 Subject: cgroup: introduce cgroup_{save|propagate|restore}_control() While controllers are being enabled and disabled in cgroup_subtree_control_write(), the original subsystem masks are stashed in local variables so that they can be restored if the operation fails in the middle. This patch adds dedicated fields to struct cgroup to be used instead of the local variables and implements functions to stash the current values, propagate the changes and restore them recursively. Combined with the previous changes, this makes subsystem management operations fully recursive and modularlized. This will be used to expand cgroup core functionalities. While at it, remove now unused @css_enable and @css_disable from cgroup_subtree_control_write(). Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 82 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0398f2a6673b..452a90e455fa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3031,6 +3031,62 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) return false; } +/** + * cgroup_save_control - save control masks of a subtree + * @cgrp: root of the target subtree + * + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_save_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->old_subtree_control = dsct->subtree_control; + dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + } +} + +/** + * cgroup_propagate_control - refresh control masks of a subtree + * @cgrp: root of the target subtree + * + * For @cgrp and its subtree, ensure ->subtree_ss_mask matches + * ->subtree_control and propagate controller availability through the + * subtree so that descendants don't have unavailable controllers enabled. 
+ */ +static void cgroup_propagate_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->subtree_control &= cgroup_control(dsct); + dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct, + dsct->subtree_control); + } +} + +/** + * cgroup_restore_control - restore control masks of a subtree + * @cgrp: root of the target subtree + * + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_restore_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + dsct->subtree_control = dsct->old_subtree_control; + dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + } +} + /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree @@ -3119,7 +3175,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { u16 enable = 0, disable = 0; - u16 css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -3203,25 +3258,14 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return restart_syscall(); } - /* - * Update subsys masks and calculate what needs to be done. More - * subsystems than specified may need to be enabled or disabled - * depending on subsystem dependencies. - */ - old_sc = cgrp->subtree_control; - old_ss = cgrp->subtree_ss_mask; - new_sc = (old_sc | enable) & ~disable; - new_ss = cgroup_calc_subtree_ss_mask(cgrp, new_sc); + /* save and update control masks and prepare csses */ + cgroup_save_control(cgrp); - css_enable = ~old_ss & new_ss; - css_disable = old_ss & ~new_ss; - enable |= css_enable; - disable |= css_disable; + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; - cgrp->subtree_control = new_sc; - cgrp->subtree_ss_mask = new_ss; + cgroup_propagate_control(cgrp); - /* prepare csses */ ret = cgroup_apply_control_enable(cgrp); if (ret) goto err_undo_css; @@ -3246,9 +3290,7 @@ out_unlock: err_undo_css: /* restore masks and shoot down new csses */ - cgrp->subtree_control = old_sc; - cgrp->subtree_ss_mask = old_ss; - + cgroup_restore_control(cgrp); cgroup_apply_control_disable(cgrp); goto out_unlock; -- cgit v1.2.3 From f7b2814bb9b6cb1d69333e1592c702260fcb4184 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:00 -0500 Subject: cgroup: factor out cgroup_{apply|finalize}_control() from cgroup_subtree_control_write() Factor out cgroup_{apply|finalize}_control() so that control mask update can be done in several simple steps. This patch doesn't introduce behavior changes. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 81 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 452a90e455fa..2adf0433a3cf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3169,6 +3169,62 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) } } +/** + * cgroup_apply_control - apply control mask updates to the subtree + * @cgrp: root of the target subtree + * + * subsystems can be enabled and disabled in a subtree using the following + * steps. + * + * 1. Call cgroup_save_control() to stash the current state. + * 2. 
Update ->subtree_control masks in the subtree as desired. + * 3. Call cgroup_apply_control() to apply the changes. + * 4. Optionally perform other related operations. + * 5. Call cgroup_finalize_control() to finish up. + * + * This function implements step 3 and propagates the mask changes + * throughout @cgrp's subtree, updates csses accordingly and perform + * process migrations. + */ +static int cgroup_apply_control(struct cgroup *cgrp) +{ + int ret; + + cgroup_propagate_control(cgrp); + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + return ret; + + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + return ret; + + return 0; +} + +/** + * cgroup_finalize_control - finalize control mask update + * @cgrp: root of the target subtree + * @ret: the result of the update + * + * Finalize control mask update. See cgroup_apply_control() for more info. + */ +static void cgroup_finalize_control(struct cgroup *cgrp, int ret) +{ + if (ret) { + cgroup_restore_control(cgrp); + cgroup_propagate_control(cgrp); + } + + cgroup_apply_control_disable(cgrp); +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -3264,36 +3320,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; - cgroup_propagate_control(cgrp); + ret = cgroup_apply_control(cgrp); - ret = cgroup_apply_control_enable(cgrp); - if (ret) - goto err_undo_css; - - /* - * At this point, cgroup_e_css() results reflect the new csses - * making the following cgroup_update_dfl_csses() properly update - * css associations of all tasks in the subtree. - */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - goto err_undo_css; - - /* all tasks are migrated out of disabled csses, commit disable */ - cgroup_apply_control_disable(cgrp); + cgroup_finalize_control(cgrp, ret); kernfs_activate(cgrp->kn); ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; - -err_undo_css: - /* restore masks and shoot down new csses */ - cgroup_restore_control(cgrp); - cgroup_apply_control_disable(cgrp); - - goto out_unlock; } static int cgroup_events_show(struct seq_file *seq, void *v) -- cgit v1.2.3 From 945ba1996888809cf510a8da000a9c20a9fab5ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:00 -0500 Subject: cgroup: combine cgroup_mutex locking and offline css draining cgroup_drain_offline() is used to wait for csses being offlined to uninstall itself from cgroup->subsys[] array so that new csses can be installed. The function's only user, cgroup_subtree_control_write(), calls it after performing some checks and restarts the whole process via restart_syscall() if draining has to release cgroup_mutex to wait. This can be simplified by draining before other synchronized operations so that there's nothing to restart. This patch converts cgroup_drain_offline() to cgroup_lock_and_drain_offline() which performs both locking and draining and updates cgroup_kn_lock_live() use it instead of cgroup_mutex() if requested. This combined locking and draining operations are easier to use and less error-prone. While at it, add WARNs in control_apply functions which triggers if the subtree isn't properly drained. 
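Taken together with the save/apply/finalize helpers above, a kernfs write handler can now follow a single linear pattern with no restart_syscall() dance. The sketch below is a simplified composite of what cgroup_subtree_control_write() looks like after this series, assuming argument parsing is already done and the @enable/@disable masks are precomputed; it is not the literal kernel code.

static ssize_t example_subtree_control_write(struct kernfs_open_file *of,
                                             u16 enable, u16 disable, size_t nbytes)
{
        struct cgroup *cgrp;
        int ret;

        /* grabs cgroup_mutex and waits out any csses still offlining */
        cgrp = cgroup_kn_lock_live(of->kn, true);
        if (!cgrp)
                return -ENODEV;

        cgroup_save_control(cgrp);              /* 1. stash the current masks */

        cgrp->subtree_control |= enable;        /* 2. edit the masks */
        cgrp->subtree_control &= ~disable;

        ret = cgroup_apply_control(cgrp);       /* 3. propagate, create csses, migrate */
                                                /* 4. other related work could go here */
        cgroup_finalize_control(cgrp, ret);     /* 5. commit, or roll back on failure */

        cgroup_kn_unlock(of->kn);
        return ret ?: nbytes;
}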
Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 55 +++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2adf0433a3cf..bbeb35f14eda 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -220,6 +220,7 @@ static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, @@ -1391,19 +1392,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced + * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a - * matching cgroup_kn_unlock() invocation. + * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the + * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) { struct cgroup *cgrp; @@ -1422,7 +1426,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) return NULL; kernfs_break_active_protection(kn); - mutex_lock(&cgroup_mutex); + if (drain_offline) + cgroup_lock_and_drain_offline(cgrp); + else + mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) return cgrp; @@ -2761,7 +2768,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -2859,7 +2866,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); @@ -2984,27 +2991,23 @@ out_finish: } /** - * cgroup_drain_offline - wait for previously offlined csses to go away + * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a - * controller while the previous css is still around. This function drains - * the previous css instances of @cgrp's subtree. - * - * Must be called with cgroup_mutex held. Returns %false if there were no - * dying css instances. Returns %true if there were one or more and this - * function waited. On %true return, cgroup_mutex has been dropped and - * re-acquired inbetween which anything could have happened. The caller - * typically would have to start over. 
+ * controller while the previous css is still around. This function grabs + * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ -static bool cgroup_drain_offline(struct cgroup *cgrp) +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) + __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; - lockdep_assert_held(&cgroup_mutex); +restart: + mutex_lock(&cgroup_mutex); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { @@ -3021,14 +3024,11 @@ static bool cgroup_drain_offline(struct cgroup *cgrp) mutex_unlock(&cgroup_mutex); schedule(); finish_wait(&dsct->offline_waitq, &wait); - mutex_lock(&cgroup_mutex); cgroup_put(dsct); - return true; + goto restart; } } - - return false; } /** @@ -3111,6 +3111,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; @@ -3155,6 +3157,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + if (!css) continue; @@ -3264,7 +3268,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; @@ -3309,11 +3313,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - if (cgroup_drain_offline(cgrp)) { - cgroup_kn_unlock(of->kn); - return restart_syscall(); - } - /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); @@ -5140,7 +5139,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (strchr(name, '\n')) return -EINVAL; - parent = cgroup_kn_lock_live(parent_kn); + parent = cgroup_kn_lock_live(parent_kn, false); if (!parent) return -ENODEV; @@ -5339,7 +5338,7 @@ static int cgroup_rmdir(struct kernfs_node *kn) struct cgroup *cgrp; int ret = 0; - cgrp = cgroup_kn_lock_live(kn); + cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; -- cgit v1.2.3 From 03970d3c11faf870dc5126bb2e84fd1d692af1b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:00 -0500 Subject: cgroup: use cgroup_apply_enable_control() in cgroup creation path cgroup_create() manually updates control masks and creates child csses which cgroup_mkdir() then manually populates. Both can be simplified by using cgroup_apply_enable_control() and friends. The only catch is that it calls css_populate_dir() with NULL cgroup->kn during cgroup_create(). This is worked around by making the function noop on NULL kn. 
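In other words, css creation and interface-file population become two invocations of the same helper, split around the point where the new cgroup gains its kernfs node. A simplified outline of the flow the diff below implements (error handling and most steps elided, not literal code):

/* simplified outline, error handling omitted */
static struct cgroup *cgroup_create(struct cgroup *parent)
{
        /* ... allocate @cgrp; cgrp->kn does not exist yet ... */
        if (!cgroup_on_dfl(cgrp))
                cgrp->subtree_control = cgroup_control(cgrp);
        cgroup_propagate_control(cgrp);
        cgroup_apply_control_enable(cgrp);      /* kn == NULL: only csses get created */
        return cgrp;
}

static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
        /* ... cgroup_create(), kernfs_create_dir(), css_populate_dir(&cgrp->self) ... */
        return cgroup_apply_control_enable(cgrp);       /* kn now set: css dirs get populated */
}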
Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bbeb35f14eda..ee6951b1e35d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1490,7 +1490,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css, struct cftype *cfts, *failed_cfts; int ret; - if (css->flags & CSS_VISIBLE) + if ((css->flags & CSS_VISIBLE) || !cgrp->kn) return 0; if (!css->ss) { @@ -5042,10 +5042,9 @@ err_free_css: static struct cgroup *cgroup_create(struct cgroup *parent) { struct cgroup_root *root = parent->root; - struct cgroup_subsys *ss; struct cgroup *cgrp, *tcgrp; int level = parent->level + 1; - int ssid, ret; + int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp) + @@ -5095,25 +5094,19 @@ static struct cgroup *cgroup_create(struct cgroup *parent) */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - /* create the csses */ - do_each_subsys_mask(ss, ssid, cgroup_ss_mask(cgrp)) { - struct cgroup_subsys_state *css; - - css = css_create(cgrp, ss); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out_destroy; - } - } while_each_subsys_mask(); - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ - if (!cgroup_on_dfl(cgrp)) { + if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - cgroup_refresh_subtree_ss_mask(cgrp); - } + + cgroup_propagate_control(cgrp); + + /* @cgrp doesn't have dir yet so the following will only create csses */ + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; return cgrp; @@ -5131,9 +5124,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int ret; /* do not accept '\n' to prevent making /proc//cgroup unparsable */ if (strchr(name, '\n')) @@ -5171,11 +5163,9 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - do_each_subsys_mask(ss, ssid, cgroup_control(cgrp)) { - ret = css_populate_dir(cgroup_css(cgrp, ss), NULL); - if (ret) - goto out_destroy; - } while_each_subsys_mask(); + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; /* let's create and online css's */ kernfs_activate(kn); -- cgit v1.2.3 From 334c3679ec4b2b113c35ebe37d2018b112dd5013 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:01 -0500 Subject: cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends rebind_subsystem() open codes quite a bit of css and interface file manipulations. It tries to be fail-safe but doesn't quite achieve it. It can be greatly simplified by using the new css management helpers. This patch reimplements rebind_subsytsems() using cgroup_apply_control() and friends. * The half-baked rollback on file creation failure is dropped. It is an extremely cold path, failure isn't critical, and, aside from kernel bugs, the only reason it can fail is memory allocation failure which pretty much doesn't happen for small allocations. * As cgroup_apply_control_disable() is now used to clean up root cgroup on rebind, make sure that it doesn't end up killing root csses. 
* All callers of rebind_subsystems() are updated to use cgroup_lock_and_drain_offline() as the apply_control functions require drained subtree. * This leaves cgroup_refresh_subtree_ss_mask() without any user. Removed. * css_populate_dir() and css_clear_dir() no longer needs @cgrp_override parameter. Dropped. * While at it, add WARN_ON() to rebind_subsystem() calls which are expected to always succeed just in case. While the rules visible to userland aren't changed, this reimplementation not only simplifies rebind_subsystems() but also allows it to disable and enable csses recursively. This can be used to implement more flexible rebinding. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 107 +++++++++++++++----------------------------------------- 1 file changed, 28 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ee6951b1e35d..98e644b0a532 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -221,6 +221,8 @@ static struct cftype cgroup_legacy_base_files[]; static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +static int cgroup_apply_control(struct cgroup *cgrp); +static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, @@ -1160,13 +1162,13 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's @@ -1351,19 +1353,6 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) return cur_ss_mask; } -/** - * cgroup_refresh_subtree_ss_mask - update subtree_ss_mask - * @cgrp: the target cgroup - * - * Update @cgrp->subtree_ss_mask according to the current - * @cgrp->subtree_control using cgroup_calc_subtree_ss_mask(). - */ -static void cgroup_refresh_subtree_ss_mask(struct cgroup *cgrp) -{ - cgrp->subtree_ss_mask = - cgroup_calc_subtree_ss_mask(cgrp, cgrp->subtree_control); -} - /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced @@ -1459,12 +1448,10 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) /** * css_clear_dir - remove subsys files in a cgroup directory * @css: taget css - * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void css_clear_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static void css_clear_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) @@ -1479,14 +1466,12 @@ static void css_clear_dir(struct cgroup_subsys_state *css, /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css - * @cgrp_overried: specify if target cgroup is different from css->cgroup * * On failure, no file is added. 
*/ -static int css_populate_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static int css_populate_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; @@ -1526,7 +1511,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - u16 tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); @@ -1541,46 +1525,6 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) return -EBUSY; } while_each_subsys_mask(); - /* skip creating root files on dfl_root for inhibited subsystems */ - tmp_ss_mask = ss_mask; - if (dst_root == &cgrp_dfl_root) - tmp_ss_mask &= ~cgrp_dfl_inhibit_ss_mask; - - do_each_subsys_mask(ss, ssid, tmp_ss_mask) { - struct cgroup *scgrp = &ss->root->cgrp; - int tssid; - - ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); - if (!ret) - continue; - - /* - * Rebinding back to the default root is not allowed to - * fail. Using both default and non-default roots should - * be rare. Moving subsystems back and forth even more so. - * Just warn about it and continue. - */ - if (dst_root == &cgrp_dfl_root) { - if (cgrp_dfl_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); - } - continue; - } - - do_each_subsys_mask(ss, tssid, tmp_ss_mask) { - if (tssid == ssid) - break; - css_clear_dir(cgroup_css(scgrp, ss), dcgrp); - } while_each_subsys_mask(); - return ret; - } while_each_subsys_mask(); - - /* - * Nothing can fail from this point on. Remove files for the - * removed subsystems and rebind each subsystem. 
- */ do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1589,8 +1533,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); - css_clear_dir(css, NULL); + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; @@ -1602,20 +1550,20 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) &dcgrp->e_csets[ss->id]); spin_unlock_bh(&css_set_lock); - src_root->subsys_mask &= ~(1 << ssid); - scgrp->subtree_control &= ~(1 << ssid); - cgroup_refresh_subtree_ss_mask(scgrp); - /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; - cgroup_refresh_subtree_ss_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } + ret = cgroup_apply_control(dcgrp); + if (ret) + pr_warn("partial failure to rebind %s controller (err=%d)\n", + ss->name, ret); + if (ss->bind) ss->bind(css); } while_each_subsys_mask(); @@ -1807,7 +1755,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1840,7 +1788,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgrp_dfl_root, removed_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (opts.release_agent) { spin_lock(&release_agent_path_lock); @@ -1991,7 +1939,7 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = css_populate_dir(&root_cgrp->self, NULL); + ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; @@ -2070,7 +2018,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_mount; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); @@ -3123,7 +3071,7 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) } if (cgroup_control(dsct) & (1 << ss->id)) { - ret = css_populate_dir(css, NULL); + ret = css_populate_dir(css); if (ret) return ret; } @@ -3162,10 +3110,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (!css) continue; - if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) { + if (css->parent && + !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!(cgroup_control(dsct) & (1 << ss->id))) { - css_clear_dir(css, NULL); + css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } @@ -5159,7 +5108,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = css_populate_dir(&cgrp->self, NULL); + ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; @@ -5231,7 +5180,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. 
*/ - css_clear_dir(css, NULL); + css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive -- cgit v1.2.3 From 5ced2518bd3e3a4f01e2122122211f217cd99f4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:01 -0500 Subject: cgroup: make cgroup_calc_subtree_ss_mask() take @this_ss_mask cgroup_calc_subtree_ss_mask() currently takes @cgrp and @subtree_control. @cgrp is used for two purposes - to decide whether it's for default hierarchy and the mask of available subsystems. The former doesn't matter as the results are the same regardless. The latter can be specified directly through a subsystem mask. This patch makes cgroup_calc_subtree_ss_mask() perform the same calculations for both default and legacy hierarchies and take @this_ss_mask for available subsystems. @cgrp is no longer used and dropped. This is to allow using the function in contexts where available controllers can't be decided from the cgroup. v2: cgroup_refres_subtree_ss_mask() is removed by a previous patch. Updated accordingly. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 98e644b0a532..58e02e9aa970 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1309,18 +1309,17 @@ static umode_t cgroup_file_mode(const struct cftype *cft) /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask - * @cgrp: the target cgroup * @subtree_control: the new subtree_control mask to consider + * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if - * @subtree_control is to be applied to @cgrp. The returned mask is always - * a superset of @subtree_control and follows the usual hierarchy rules. + * @subtree_control is to be applied while restricted to @this_ss_mask. */ -static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) +static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; @@ -1328,9 +1327,6 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) - return cur_ss_mask; - while (true) { u16 new_ss_mask = cur_ss_mask; @@ -1343,7 +1339,7 @@ static u16 cgroup_calc_subtree_ss_mask(struct cgroup *cgrp, u16 subtree_control) * happen only if some depended-upon subsystems were bound * to non-default hierarchies. 
*/ - new_ss_mask &= cgroup_ss_mask(cgrp); + new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; @@ -3012,8 +3008,9 @@ static void cgroup_propagate_control(struct cgroup *cgrp) cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); - dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct, - dsct->subtree_control); + dsct->subtree_ss_mask = + cgroup_calc_subtree_ss_mask(dsct->subtree_control, + cgroup_ss_mask(dsct)); } } -- cgit v1.2.3 From 04313591ae487da8b5781a0d8d444073a3fdee0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:01 -0500 Subject: cgroup: allocate 2x cgrp_cset_links when setting up a new root During prep, cgroup_setup_root() allocates cgrp_cset_links matching the number of existing css_sets to later link the new root. This is fine for now as the only operation which can happen inbetween is rebind_subsystems() and rebinding of empty subsystems doesn't create new css_sets. However, while not yet allowed, with the recent reimplementation, rebind_subsystems() can rebind subsystems with descendant csses and thus can create new css_sets. This patch makes cgroup_setup_root() allocate 2x of the existing css_sets so that later use of live subsystem rebinding doesn't blow up. Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 58e02e9aa970..40ed329482dd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1915,10 +1915,11 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding - * cgroup_lock, and that's us. The worst that can happen is that we - * have some link structures left over + * cgroup_lock, and that's us. Later rebinding may disable + * controllers on the default hierarchy and thus create new csets, + * which can't be more than the existing ones. Allocate 2x. */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; -- cgit v1.2.3 From 549626047df99f1129d4e742cce741055bdc2dcb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Mar 2016 09:58:01 -0500 Subject: cgroup: update css iteration in cgroup_update_dfl_csses() The existing sequences of operations ensure that the offlining csses are drained before cgroup_update_dfl_csses(), so even though cgroup_update_dfl_csses() uses css_for_each_descendant_pre() to walk the target cgroups, it doesn't end up operating on dead cgroups. Also, the function explicitly excludes the subtree root from operation. This is fragile and inconsistent with the rest of css update operations. This patch updates cgroup_update_dfl_csses() to use cgroup_for_each_live_descendant_pre() instead and include the subtree root. 
Signed-off-by: Tejun Heo Acked-by: Zefan Li --- kernel/cgroup.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 40ed329482dd..c63fce0c5b24 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2877,16 +2877,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * - * @cgrp's subtree_ss_mask has changed and its subtree's (self excluded) - * css associations need to be updated accordingly. This function looks up - * all css_sets which are attached to the subtree, creates the matching - * updated css_sets and migrates the tasks to the new ones. + * @cgrp's control masks have changed and its subtree's css associations + * need to be updated accordingly. This function looks up all css_sets + * which are attached to the subtree, creates the matching updated css_sets + * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *d_css; + struct cgroup *dsct; struct css_set *src_cset; int ret; @@ -2896,14 +2897,10 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) /* look up all csses currently attached to @cgrp's subtree */ spin_lock_bh(&css_set_lock); - css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; - /* self is not affected by subtree_ss_mask change */ - if (css->cgroup == cgrp) - continue; - - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, cgrp, &preloaded_csets); } -- cgit v1.2.3 From 4e8ae72a75aae285ec5b93518b9680da198afd0d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Mar 2016 21:49:27 +0000 Subject: X.509: Make algo identifiers text instead of enum Make the identifier public key and digest algorithm fields text instead of enum. Signed-off-by: David Howells Acked-by: Herbert Xu --- kernel/module_signing.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 9cfa46d8d14f..64b9dead4a07 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -16,6 +16,12 @@ #include #include "module-internal.h" +enum pkey_id_type { + PKEY_ID_PGP, /* OpenPGP generated key ID */ + PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ + PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ +}; + /* * Module signature information block. * -- cgit v1.2.3 From e9532e69b8d1d1284e8ecf8d2586de34aec61244 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Mar 2016 15:59:42 +0100 Subject: sched/cputime: Fix steal time accounting vs. CPU hotplug On CPU hotplug the steal time accounting can keep a stale rq->prev_steal_time value over CPU down and up. So after the CPU comes up again the delta calculation in steal_account_process_tick() wreckages itself due to the unsigned math: u64 steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; So if steal is smaller than rq->prev_steal_time we end up with an insane large value which then gets added to rq->prev_steal_time, resulting in a permanent wreckage of the accounting. 
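The failure mode is plain unsigned arithmetic and can be reproduced outside the kernel. The numbers below are made up; the point is only that subtracting a larger stale value from a smaller fresh reading wraps around 2^64:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* hypothetical values after a CPU was unplugged and plugged back in */
        uint64_t prev_steal_time = 5000000000ULL;       /* stale per-rq value */
        uint64_t steal = 1000000000ULL;                 /* fresh paravirt_steal_clock() reading */

        steal -= prev_steal_time;                       /* no negative values in u64: wraps */
        printf("accounted steal delta: %llu\n", (unsigned long long)steal);
        /* prints 18446744069709551616, roughly 1.8e19, which then gets added back
         * into prev_steal_time and the accounting never recovers */
        return 0;
}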
As a consequence the per CPU stats in /proc/stat become stale. Nice trick to tell the world how idle the system is (100%) while the CPU is 100% busy running tasks. Though we prefer realistic numbers. None of the accounting values which use a previous value to account for fractions is reset at CPU hotplug time. update_rq_clock_task() has a sanity check for prev_irq_time and prev_steal_time_rq, but that sanity check solely deals with clock warps and limits the /proc/stat visible wreckage. The prev_time values are still wrong. Solution is simple: Reset rq->prev_*_time when the CPU is plugged in again. Signed-off-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Cc: Frederic Weisbecker Cc: Glauber Costa Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: commit 095c0aa83e52 "sched: adjust scheduler cpu power for stolen time" Fixes: commit aa483808516c "sched: Remove irq time from available CPU power" Fixes: commit e6e6685accfa "KVM guest: Steal time accounting" Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603041539490.3686@nanos Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/sched.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ab814bf100e1..406182af99ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5627,6 +5627,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: rq->calc_load_update = calc_load_update; + account_reset_rq(rq); break; case CPU_ONLINE: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 30ea2d871ba7..4f6598ae4c31 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1738,3 +1738,16 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +} -- cgit v1.2.3 From d39cdd2036a63eef17a14efbd969405ca5612886 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Tue, 8 Mar 2016 21:37:01 +0800 Subject: tracing: Make tracer_flags use the right set_flag callback When I was updating the ftrace_stress test of ltp. I encountered a strange phenomemon, excute following steps: echo nop > /sys/kernel/debug/tracing/current_tracer echo 0 > /sys/kernel/debug/tracing/options/funcgraph-cpu bash: echo: write error: Invalid argument check dmesg: [ 1024.903855] nop_test_refuse flag set to 0: we refuse.Now cat trace_options to see the result The reason is that the trace option test will randomly setup trace option under tracing/options no matter what the current_tracer is. but the set_tracer_option is always using the set_flag callback from the current_tracer. This patch adds a pointer to tracer_flags and make it point to the tracer it belongs to. When the option is setup, the set_flag of the right tracer will be used no matter what the the current_tracer is. And the old dummy_tracer_flags is used for all the tracers which doesn't have a tracer_flags, having issue to use it to save the pointer of a tracer. So remove it and use dynamic dummy tracer_flags for tracers needing a dummy tracer_flags, as a result, there are no tracers sharing tracer_flags, so remove the check code. 
And save the current tracer to trace_option_dentry seems not good as it may waste mem space when mount the debug/trace fs more than one time. Link: http://lkml.kernel.org/r/1457444222-8654-1-git-send-email-chuhu@redhat.com Signed-off-by: Chunyu Hu [ Fixed up function tracer options to work with the change ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 28 ++++++++++++++-------------- kernel/trace/trace.h | 1 + kernel/trace/trace_functions.c | 6 ++++++ 3 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9293402ee68..b401a1892dc6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -74,11 +74,6 @@ static struct tracer_opt dummy_tracer_opt[] = { { } }; -static struct tracer_flags dummy_tracer_flags = { - .val = 0, - .opts = dummy_tracer_opt -}; - static int dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -1258,12 +1253,20 @@ int __init register_tracer(struct tracer *type) if (!type->set_flag) type->set_flag = &dummy_set_flag; - if (!type->flags) - type->flags = &dummy_tracer_flags; - else + if (!type->flags) { + /*allocate a dummy tracer_flags*/ + type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); + if (!type->flags) + return -ENOMEM; + type->flags->val = 0; + type->flags->opts = dummy_tracer_opt; + } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; + /* store the tracer for __set_tracer_option */ + type->flags->trace = type; + ret = run_tracer_selftest(type); if (ret < 0) goto out; @@ -3505,7 +3508,7 @@ static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { - struct tracer *trace = tr->current_trace; + struct tracer *trace = tracer_flags->trace; int ret; ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); @@ -6391,11 +6394,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) return; for (i = 0; i < tr->nr_topts; i++) { - /* - * Check if these flags have already been added. - * Some tracers share flags. - */ - if (tr->topts[i].tracer->flags == tracer->flags) + /* Make sure there's no duplicate flags. */ + if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) return; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8414fa40bf27..b4cae47f283e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -345,6 +345,7 @@ struct tracer_opt { struct tracer_flags { u32 val; struct tracer_opt *opts; + struct tracer *trace; }; /* Makes more easy to define a tracer opt */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index fcd41a166405..5a095c2e4b69 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -219,6 +219,8 @@ static void tracing_stop_function_trace(struct trace_array *tr) unregister_ftrace_function(tr->ops); } +static struct tracer function_trace; + static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -228,6 +230,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + /* We can change this flag when not running. 
*/ + if (tr->current_trace != &function_trace) + break; + unregister_ftrace_function(tr->ops); if (set) { -- cgit v1.2.3 From 4ef56902fba4d9949918c6266e67ba7d05fba7a4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:43 -0600 Subject: tracing: Make ftrace_event_field checking functions available Make is_string_field() and is_function_field() accessible outside of trace_event_filters.c for other users of ftrace_event_fields. Link: http://lkml.kernel.org/r/2d3f00d3311702e556e82eed7754bae6f017939f.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 12 ++++++++++++ kernel/trace/trace_events_filter.c | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b4cae47f283e..81a8359e9c29 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1112,6 +1112,18 @@ struct filter_pred { unsigned short right; }; +static inline bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; +} + +static inline bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct trace_event_file *file, diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 6816302542b2..b3f5051cd4e9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -961,18 +961,6 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_function_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_TRACE_FN; -} - -static bool is_string_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; -} - static bool is_legal_op(struct ftrace_event_field *field, int op) { if (is_string_field(field) && -- cgit v1.2.3 From ab4bf008928e8fc73fe1cbaa9249792d36845345 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:44 -0600 Subject: tracing: Make event trigger functions available Make various event trigger utility functions available outside of trace_events_trigger.c so that new triggers can be defined outside of that file. 
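With the prototypes exported through trace.h, a trigger implemented in another file under kernel/trace can reuse these helpers instead of duplicating the logic. As a purely hypothetical sketch (my_new_cmd is a stand-in for a fully initialized struct event_command, not a real symbol, and its fields are elided), registration would reduce to:

/* hypothetical: only the register_event_command() call is taken from the patch */
static struct event_command my_new_cmd;         /* fields elided in this sketch */

static __init int register_trigger_my_new_cmd(void)
{
        int ret = register_event_command(&my_new_cmd);

        if (ret)
                pr_warn("could not register my_new_cmd trigger\n");
        return ret;
}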
Link: http://lkml.kernel.org/r/4a40c1695dd43cac6cd475d72e13ffe30ba84bff.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 14 ++++++++++++++ kernel/trace/trace_events_trigger.c | 28 +++++++++++++--------------- 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 81a8359e9c29..b2bc956e2b0d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1175,6 +1175,20 @@ struct event_trigger_data { struct list_head list; }; +extern void trigger_data_free(struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable); +extern void update_cond_flag(struct trace_event_file *file); +extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); +extern int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file); +extern int register_event_command(struct event_command *cmd); + /** * struct event_trigger_ops - callbacks for trace event triggers * diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index b38f617b6181..f40424f35dcb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -28,8 +28,7 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); -static void -trigger_data_free(struct event_trigger_data *data) +void trigger_data_free(struct event_trigger_data *data) { if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); @@ -306,7 +305,7 @@ const struct file_operations event_trigger_fops = { * Currently we only register event commands from __init, so mark this * __init too. */ -static __init int register_event_command(struct event_command *cmd) +__init int register_event_command(struct event_command *cmd) { struct event_command *p; int ret = 0; @@ -395,9 +394,8 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -static int -event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) { data->ref++; return 0; @@ -425,8 +423,8 @@ event_trigger_free(struct event_trigger_ops *ops, trigger_data_free(data); } -static int trace_event_trigger_enable_disable(struct trace_event_file *file, - int trigger_enable) +int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable) { int ret = 0; @@ -483,7 +481,7 @@ clear_event_triggers(struct trace_array *tr) * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be * cleared. */ -static void update_cond_flag(struct trace_event_file *file) +void update_cond_flag(struct trace_event_file *file) { struct event_trigger_data *data; bool set_cond = false; @@ -560,9 +558,9 @@ out: * Usually used directly as the @unreg method in event command * implementations. 
*/ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -696,9 +694,9 @@ event_trigger_callback(struct event_command *cmd_ops, * * Return: 0 on success, errno otherwise */ -static int set_trigger_filter(char *filter_str, - struct event_trigger_data *trigger_data, - struct trace_event_file *file) +int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file) { struct event_trigger_data *data = trigger_data; struct event_filter *filter = NULL, *tmp; -- cgit v1.2.3 From c4a5923055c9e0c87dfc0387f7cda5ee2bbac3c1 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:45 -0600 Subject: tracing: Add event record param to trigger_ops.func() Some triggers may need access to the trace event, so pass it in. Also fix up the existing trigger funcs and their callers. Link: http://lkml.kernel.org/r/543e31e9fc445ef61077421ab219033401c39846.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 6 ++++-- kernel/trace/trace_events_trigger.c | 36 +++++++++++++++++++----------------- 2 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b2bc956e2b0d..c10456e72106 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1201,7 +1201,8 @@ extern int register_event_command(struct event_command *cmd); * @func: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that - * registered the trigger (see struct event_command). + * registered the trigger (see struct event_command) along with + * the trace record, rec. * * @init: An optional initialization function called for the trigger * when the trigger is registered (via the event_command reg() @@ -1226,7 +1227,8 @@ extern int register_event_command(struct event_command *cmd); * (see trace_event_triggers.c). 
*/ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data); + void (*func)(struct event_trigger_data *data, + void *rec); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f40424f35dcb..0a62887c63c0 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -73,7 +73,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) list_for_each_entry_rcu(data, &file->triggers, list) { if (!rec) { - data->ops->func(data); + data->ops->func(data, rec); continue; } filter = rcu_dereference_sched(data->filter); @@ -83,7 +83,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data); + data->ops->func(data, rec); } return tt; } @@ -93,6 +93,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * @rec: The trace entry for the event * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command, if the @@ -103,13 +104,14 @@ EXPORT_SYMBOL_GPL(event_triggers_call); */ void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt) + enum event_trigger_type tt, + void *rec) { struct event_trigger_data *data; list_for_each_entry_rcu(data, &file->triggers, list) { if (data->cmd_ops->trigger_type & tt) - data->ops->func(data); + data->ops->func(data, rec); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -745,7 +747,7 @@ int set_trigger_filter(char *filter_str, } static void -traceon_trigger(struct event_trigger_data *data) +traceon_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -754,7 +756,7 @@ traceon_trigger(struct event_trigger_data *data) } static void -traceon_count_trigger(struct event_trigger_data *data) +traceon_count_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -769,7 +771,7 @@ traceon_count_trigger(struct event_trigger_data *data) } static void -traceoff_trigger(struct event_trigger_data *data) +traceoff_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -778,7 +780,7 @@ traceoff_trigger(struct event_trigger_data *data) } static void -traceoff_count_trigger(struct event_trigger_data *data) +traceoff_count_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -874,13 +876,13 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data) +snapshot_trigger(struct event_trigger_data *data, void *rec) { tracing_snapshot(); } static void -snapshot_count_trigger(struct event_trigger_data *data) +snapshot_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -888,7 +890,7 @@ snapshot_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - snapshot_trigger(data); + snapshot_trigger(data, rec); } static int @@ -967,13 +969,13 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #define STACK_SKIP 3 static void -stacktrace_trigger(struct event_trigger_data *data) +stacktrace_trigger(struct event_trigger_data 
*data, void *rec) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data) +stacktrace_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -981,7 +983,7 @@ stacktrace_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - stacktrace_trigger(data); + stacktrace_trigger(data, rec); } static int @@ -1052,7 +1054,7 @@ struct enable_trigger_data { }; static void -event_enable_trigger(struct event_trigger_data *data) +event_enable_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1063,7 +1065,7 @@ event_enable_trigger(struct event_trigger_data *data) } static void -event_enable_count_trigger(struct event_trigger_data *data) +event_enable_count_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1077,7 +1079,7 @@ event_enable_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - event_enable_trigger(data); + event_enable_trigger(data, rec); } static int -- cgit v1.2.3 From dbfeaa7abae4f105afdf8ed4f85b5879cff136ea Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:46 -0600 Subject: tracing: Add get_syscall_name() Add a utility function to grab the syscall name from the syscall metadata, given a syscall id. Link: http://lkml.kernel.org/r/be26a8dfe3f15e16a837799f1c1e2b4d62742843.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 +++++ kernel/trace/trace_syscalls.c | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c10456e72106..0044b91d5469 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1394,8 +1394,13 @@ int perf_ftrace_event_register(struct trace_event_call *call, #ifdef CONFIG_FTRACE_SYSCALLS void init_ftrace_syscalls(void); +const char *get_syscall_name(int syscall); #else static inline void init_ftrace_syscalls(void) { } +static inline const char *get_syscall_name(int syscall) +{ + return NULL; +} #endif #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afbea83f..50be5602217c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -106,6 +106,17 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } +const char *get_syscall_name(int syscall) +{ + struct syscall_metadata *entry; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + return NULL; + + return entry->name; +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) -- cgit v1.2.3 From 104f281044a9c2ac86b851bbebbf74500172b625 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:47 -0600 Subject: tracing: Add a per-event-trigger 'paused' field Add a simple per-trigger 'paused' flag, allowing individual triggers to pause. We could leave it to individual triggers that need this functionality to do it themselves, but we also want to allow other events to control pausing, so add it to the trigger data. 
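[ Editor's illustration, not part of this patch: other code could then pause or resume every trigger attached to an event via the new field. This is a minimal sketch and assumes the caller already serializes against trigger registration, for example by holding event_mutex. ]

	static void set_event_triggers_paused(struct trace_event_file *file, bool pause)
	{
		struct event_trigger_data *data;

		/* walk the event's trigger list and flip the new 'paused' flag */
		list_for_each_entry(data, &file->triggers, list)
			data->paused = pause;
	}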
Link: http://lkml.kernel.org/r/fed37e4879684d7dcc57fe00ce0cbf170032b06d.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_events_trigger.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0044b91d5469..f1868677f856 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1172,6 +1172,7 @@ struct event_trigger_data { struct event_filter __rcu *filter; char *filter_str; void *private_data; + bool paused; struct list_head list; }; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 0a62887c63c0..e4d8b3763175 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -72,6 +72,8 @@ event_triggers_call(struct trace_event_file *file, void *rec) return tt; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (!rec) { data->ops->func(data, rec); continue; @@ -110,6 +112,8 @@ event_triggers_post_call(struct trace_event_file *file, struct event_trigger_data *data; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (data->cmd_ops->trigger_type & tt) data->ops->func(data, rec); } -- cgit v1.2.3 From a5863dae84e2da83a1e5de485a7f150d0c28f08e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:48 -0600 Subject: tracing: Add needs_rec flag to event triggers Add a new needs_rec flag for triggers that require unconditional access to trace records in order to function. Normally a trigger requires access to the contents of a trace record only if it has a filter associated with it (since filters need the contents of a record in order to make a filtering decision). Some types of triggers, such as 'hist' triggers, require access to trace record contents independent of the presence of filters, so add a new flag for those triggers. Link: http://lkml.kernel.org/r/7be8fa38f9b90fdb6c47ca0f98d20a07b9fd512b.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 7 +++++++ kernel/trace/trace_events_trigger.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f1868677f856..8c6aefbb24d2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1292,6 +1292,12 @@ struct event_trigger_ops { * itself logs to the trace buffer, this flag should be set, * otherwise it can be left unspecified. * + * @needs_rec: A flag that says whether or not this command needs + * access to the trace record in order to perform its function, + * regardless of whether or not it has a filter associated with + * it (filters make a trigger require access to the trace record + * but are not always present). + * * All the methods below, except for @set_filter(), must be * implemented. 
* @@ -1332,6 +1338,7 @@ struct event_command { char *name; enum event_trigger_type trigger_type; bool post_trigger; + bool needs_rec; int (*func)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *params); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e4d8b3763175..a11bb4780f82 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -493,7 +493,8 @@ void update_cond_flag(struct trace_event_file *file) bool set_cond = false; list_for_each_entry_rcu(data, &file->triggers, list) { - if (data->filter || data->cmd_ops->post_trigger) { + if (data->filter || data->cmd_ops->post_trigger || + data->cmd_ops->needs_rec) { set_cond = true; break; } -- cgit v1.2.3 From a88e1cfb1d3081ffb34864d9cf8a5c289630f48e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:49 -0600 Subject: tracing: Add an unreg_all() callback to trigger commands Add a new unreg_all() callback that can be used to remove all command-specific triggers from an event and arrange to have it called whenever a trigger file is opened with O_TRUNC set. Commands that don't want truncate semantics, or existing commands that don't implement this function simply do nothing and their triggers remain intact. Link: http://lkml.kernel.org/r/2b7d62854d01f28c19185e1bbb8f826f385edfba.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +++++++-- kernel/trace/trace_events_trigger.c | 13 +++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8c6aefbb24d2..f4dd0adf71df 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1298,8 +1298,8 @@ struct event_trigger_ops { * it (filters make a trigger require access to the trace record * but are not always present). * - * All the methods below, except for @set_filter(), must be - * implemented. + * All the methods below, except for @set_filter() and @unreg_all(), + * must be implemented. * * @func: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the @@ -1324,6 +1324,10 @@ struct event_trigger_ops { * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * + * @unreg_all: An optional function called to remove all the triggers + * from the list of triggers associated with the event. Called + * when a trigger file is opened in truncate mode. + * * @set_filter: An optional function called to parse and set a filter * for the trigger. 
If no @set_filter() method is set for the * event command, filters set by the user for the command will be @@ -1350,6 +1354,7 @@ struct event_command { struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); + void (*unreg_all)(struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a11bb4780f82..cbb7ee531983 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -193,6 +193,19 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) return -ENODEV; } + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + struct trace_event_file *event_file; + struct event_command *p; + + event_file = event_file_data(file); + + list_for_each_entry(p, &trigger_commands, list) { + if (p->unreg_all) + p->unreg_all(event_file); + } + } + if (file->f_mode & FMODE_READ) { ret = seq_open(file, &event_triggers_seq_ops); if (!ret) { -- cgit v1.2.3 From 353206f5ca05eb65704b2b3ec9a331b4fdfd3257 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 22 Feb 2016 15:55:09 -0500 Subject: tracing: Use flags instead of bool in trigger structure gcc isn't known for handling bool in structures. Instead of using bool, use an integer mask and use bit flags instead. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 71 +++++++++++++++++++++++-------------- kernel/trace/trace_events_trigger.c | 8 ++--- 2 files changed, 49 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f4dd0adf71df..39588c23dd8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1273,30 +1273,7 @@ struct event_trigger_ops { * values are defined by adding new values to the trigger_type * enum in include/linux/trace_events.h. * - * @post_trigger: A flag that says whether or not this command needs - * to have its action delayed until after the current event has - * been closed. Some triggers need to avoid being invoked while - * an event is currently in the process of being logged, since - * the trigger may itself log data into the trace buffer. Thus - * we make sure the current event is committed before invoking - * those triggers. To do that, the trigger invocation is split - * in two - the first part checks the filter using the current - * trace record; if a command has the @post_trigger flag set, it - * sets a bit for itself in the return value, otherwise it - * directly invokes the trigger. Once all commands have been - * either invoked or set their return flag, the current record is - * either committed or discarded. At that point, if any commands - * have deferred their triggers, those commands are finally - * invoked following the close of the current event. In other - * words, if the event_trigger_ops @func() probe implementation - * itself logs to the trace buffer, this flag should be set, - * otherwise it can be left unspecified. - * - * @needs_rec: A flag that says whether or not this command needs - * access to the trace record in order to perform its function, - * regardless of whether or not it has a filter associated with - * it (filters make a trigger require access to the trace record - * but are not always present). + * @flags: See the enum event_command_flags below. * * All the methods below, except for @set_filter() and @unreg_all(), * must be implemented. 
@@ -1341,8 +1318,7 @@ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; - bool post_trigger; - bool needs_rec; + int flags; int (*func)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *params); @@ -1361,6 +1337,49 @@ struct event_command { struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; +/** + * enum event_command_flags - flags for struct event_command + * + * @POST_TRIGGER: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * @NEEDS_REC: A flag that says whether or not this command needs + * access to the trace record in order to perform its function, + * regardless of whether or not it has a filter associated with + * it (filters make a trigger require access to the trace record + * but are not always present). 
+ */ +enum event_command_flags { + EVENT_CMD_FL_POST_TRIGGER = 1, + EVENT_CMD_FL_NEEDS_REC = 2, +}; + +static inline bool event_command_post_trigger(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER; +} + +static inline bool event_command_needs_rec(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC; +} + extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cbb7ee531983..d67992f3bb0e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -81,7 +81,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) filter = rcu_dereference_sched(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; - if (data->cmd_ops->post_trigger) { + if (event_command_post_trigger(data->cmd_ops)) { tt |= data->cmd_ops->trigger_type; continue; } @@ -506,8 +506,8 @@ void update_cond_flag(struct trace_event_file *file) bool set_cond = false; list_for_each_entry_rcu(data, &file->triggers, list) { - if (data->filter || data->cmd_ops->post_trigger || - data->cmd_ops->needs_rec) { + if (data->filter || event_command_post_trigger(data->cmd_ops) || + event_command_needs_rec(data->cmd_ops)) { set_cond = true; break; } @@ -1035,7 +1035,7 @@ stacktrace_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, - .post_trigger = true, + .flags = EVENT_CMD_FL_POST_TRIGGER, .func = event_trigger_callback, .reg = register_trigger, .unreg = unregister_trigger, -- cgit v1.2.3 From 1cf8067b541884366b7db3a328342073fed2f38f Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Tue, 8 Mar 2016 21:37:02 +0800 Subject: tracing: Fix typoes in code comment and printk in trace_nop.c echo nop > /sys/kernel/debug/tracing/options/current_tracer echo 1 > /sys/kernel/debug/tracing/options/test_nop_accept echo 0 > /sys/kernel/debug/tracing/options/test_nop_accept echo 1 > /sys/kernel/debug/tracing/options/test_nop_refuse Before the fix, the dmesg is a bit ugly since a align issue. [ 191.973081] nop_test_accept flag set to 1: we accept. Now cat trace_options to see the result [ 195.156942] nop_test_refuse flag set to 1: we refuse.Now cat trace_options to see the result After the fix, the dmesg will show aligned log for nop_test_refuse and nop_test_accept. [ 2718.032413] nop_test_refuse flag set to 1: we refuse. Now cat trace_options to see the result [ 2734.253360] nop_test_accept flag set to 1: we accept. Now cat trace_options to see the result Link: http://lkml.kernel.org/r/1457444222-8654-2-git-send-email-chuhu@redhat.com Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt --- kernel/trace/trace_nop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 8bb2071474dd..49f61fe96a6b 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -56,7 +56,7 @@ static void nop_trace_reset(struct trace_array *tr) } /* It only serves as a signal handler and a callback to - * accept or refuse tthe setting of a flag. + * accept or refuse the setting of a flag. * If you don't implement it, then the flag setting will be * automatically accepted. 
*/ @@ -75,7 +75,7 @@ static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_NOP_OPT_REFUSE) { printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." - "Now cat trace_options to see the result\n", + " Now cat trace_options to see the result\n", set); return -EINVAL; } -- cgit v1.2.3 From 58cdb1ceb15aab7b34719ad225ff023775d774e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:25 -0500 Subject: cgroup: fix incorrect destination cgroup in cgroup_update_dfl_csses() cgroup_update_dfl_csses() should move each task in the subtree to self; however, it was incorrectly calling cgroup_migrate_add_src() with the root of the subtree as @dst_cgrp. Fortunately, cgroup_migrate_add_src() currently uses @dst_cgrp only to determine the hierarchy and the bug doesn't cause any actual breakages. Fix it. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c63fce0c5b24..50879aadcbd0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2901,7 +2901,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, cgrp, + cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } spin_unlock_bh(&css_set_lock); -- cgit v1.2.3 From 6c694c88255b2052d9922d62df6df7c9e152eeeb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:25 -0500 Subject: cgroup: move migration destination verification out of cgroup_migrate_prepare_dst() cgroup_migrate_prepare_dst() verifies whether the destination cgroup is allowable; however, the test doesn't really belong there. It's too deep and common in the stack and as a result the test itself is gated by another test. Separate the test out into cgroup_may_migrate_to() and update cgroup_attach_task() and cgroup_transfer_tasks() to perform the test directly. This doesn't cause any behavior differences. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 50879aadcbd0..8a02076d4317 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2443,6 +2443,20 @@ out_release_tset: return ret; } +/** + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * @dst_cgrp: destination cgroup to test + * + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. + */ +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +{ + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; +} + /** * cgroup_migrate_finish - cleanup after attach * @preloaded_csets: list of preloaded css_sets @@ -2529,14 +2543,6 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); - /* - * Except for the root, subtree_control must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. 
- */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->subtree_control) - return -EBUSY; - /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; @@ -2634,6 +2640,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; + /* look up all src csets */ spin_lock_bh(&css_set_lock); rcu_read_lock(); @@ -4136,6 +4145,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ -- cgit v1.2.3 From 37ff9f8f474216d0cfca7565a4e0caa521ee6e7e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:26 -0500 Subject: cgroup: make cgroup[_taskset]_migrate() take cgroup_root instead of cgroup On the default hierarchy, a migration can be multi-source and/or multi-destination. cgroup_taskest_migrate() used to incorrectly assume single destination cgroup but the bug has been fixed by 1f7dd3e5a6e4 ("cgroup: fix handling of multi-destination migration from subtree_control enabling"). Since the commit, @dst_cgrp to cgroup[_taskset]_migrate() is only used to determine which subsystems are affected or which cgroup_root the migration is taking place in. As such, @dst_cgrp is misleading. This patch replaces @dst_cgrp with @root. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 70 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8a02076d4317..5dd761355033 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2355,38 +2355,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, } /** - * cgroup_taskset_migrate - migrate a taskset to a cgroup + * cgroup_taskset_migrate - migrate a taskset * @tset: taget taskset - * @dst_cgrp: destination cgroup + * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the - * ->can_attach callbacks fails and guarantees that either all or none of - * the tasks in @tset are migrated. @tset is consumed regardless of - * success. + * Migrate tasks in @tset as setup by migration preparation functions. + * This function fails iff one of the ->can_attach callbacks fails and + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success. 
*/ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup *dst_cgrp) + struct cgroup_root *root) { - struct cgroup_subsys_state *css, *failed_css = NULL; + struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; - int i, ret; + int ssid, failed_ssid, ret; /* methods shouldn't be called if no task is actually migrating */ if (list_empty(&tset->src_csets)) return 0; /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->can_attach) { - tset->ssid = i; - ret = css->ss->can_attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); if (ret) { - failed_css = css; + failed_ssid = ssid; goto out_cancel_attach; } } - } + } while_each_subsys_mask(); /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2413,25 +2413,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->attach) { - tset->ssid = i; - css->ss->attach(tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); } - } + } while_each_subsys_mask(); ret = 0; goto out_release_tset; out_cancel_attach: - for_each_e_css(css, i, dst_cgrp) { - if (css == failed_css) + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) break; - if (css->ss->cancel_attach) { - tset->ssid = i; - css->ss->cancel_attach(tset); + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); } - } + } while_each_subsys_mask(); out_release_tset: spin_lock_bh(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); @@ -2586,11 +2586,11 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @cgrp: the destination cgroup + * @root: cgroup root migration is taking place on * - * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The - * caller is also responsible for invoking cgroup_migrate_add_src() and + * Migrate a process or task denoted by @leader. If migrating a process, + * the caller must be holding cgroup_threadgroup_rwsem. The caller is also + * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * @@ -2601,7 +2601,7 @@ err: * actually starting migrating. 
*/ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup *cgrp) + struct cgroup_root *root) { struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; @@ -2622,7 +2622,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_unlock(); spin_unlock_bh(&css_set_lock); - return cgroup_taskset_migrate(&tset, cgrp); + return cgroup_taskset_migrate(&tset, root); } /** @@ -2659,7 +2659,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); return ret; @@ -2934,7 +2934,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_bh(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp); + ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); @@ -4172,7 +4172,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to); + ret = cgroup_migrate(task, false, to->root); put_task_struct(task); } } while (task && !ret); -- cgit v1.2.3 From e4857982f49d21c05a84351b56724bf353022355 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:26 -0500 Subject: cgroup: use css_set->mg_dst_cgrp for the migration target cgroup Migration can be multi-target on the default hierarchy when a controller is enabled - processes belonging to each child cgroup have to be moved to the child cgroup itself to refresh css association. This isn't a problem for cgroup_migrate_add_src() as each source css_set still maps to single source and target cgroups; however, cgroup_migrate_prepare_dst() is called once after all source css_sets are added and thus might not have a single destination cgroup. This is currently worked around by specifying NULL for @dst_cgrp and using the source's default cgroup as destination as the only multi-target migration in use is self-targetting. While this works, it's subtle and clunky. As all taget cgroups are already specified while preparing the source css_sets, this clunkiness can easily be removed by recording the target cgroup in each source css_set. This patch adds css_set->mg_dst_cgrp which is recorded on cgroup_migrate_src() and used by cgroup_migrate_prepare_dst(). This also makes migration code ready for arbitrary multi-target migration. 
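[ Editor's illustration: a hedged sketch of the overall migration sequence after this change, condensed from cgroup_attach_task(). The thread-group walk and most error handling are omitted, and it assumes cgroup_mutex and cgroup_threadgroup_rwsem are already held by the caller. ]

	static int migrate_leader_sketch(struct task_struct *leader, bool threadgroup,
					 struct cgroup *dst_cgrp)
	{
		LIST_HEAD(preloaded_csets);
		int ret;

		if (!cgroup_may_migrate_to(dst_cgrp))
			return -EBUSY;

		/* record each source css_set together with its destination cgroup */
		spin_lock_bh(&css_set_lock);
		rcu_read_lock();
		cgroup_migrate_add_src(task_css_set(leader), dst_cgrp, &preloaded_csets);
		rcu_read_unlock();
		spin_unlock_bh(&css_set_lock);

		/* look up and pin the destination css_sets, then commit */
		ret = cgroup_migrate_prepare_dst(&preloaded_csets);
		if (!ret)
			ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);

		cgroup_migrate_finish(&preloaded_csets);
		return ret;
	}

With mg_dst_cgrp recorded per source css_set, prepare_dst no longer needs a destination argument, which is what makes arbitrary multi-target migration possible.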
Signed-off-by: Tejun Heo --- kernel/cgroup.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5dd761355033..fbd3e99a4e98 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2473,6 +2473,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) spin_lock_bh(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); @@ -2511,32 +2512,31 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, return; WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; + src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add(&src_cset->mg_preload_node, preloaded_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * - * Tasks are about to be moved to @dst_cgrp and all the source css_sets - * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and append them - * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each - * source css_set is assumed to be its cgroup on the default hierarchy. + * Tasks are about to be moved and all the source css_sets have been + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @preloaded_csets. 
*/ -static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; @@ -2547,8 +2547,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, - dst_cgrp ?: src_cset->dfl_cgrp); + dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) goto err; @@ -2561,6 +2560,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; + src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); put_css_set(src_cset); put_css_set(dst_cset); @@ -2657,7 +2657,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, spin_unlock_bh(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (!ret) ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); @@ -2916,7 +2916,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) spin_unlock_bh(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; @@ -4156,7 +4156,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); spin_unlock_bh(&css_set_lock); - ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_err; -- cgit v1.2.3 From f6d635ad341d5cc0b9c7ab46adfbf3bf5886cee4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Mar 2016 11:51:26 -0500 Subject: cgroup: implement cgroup_subsys->implicit_on_dfl Some controllers, perf_event for now and possibly freezer in the future, don't really make sense to control explicitly through "cgroup.subtree_control". For example, the primary role of perf_event is identifying the cgroups of tasks; however, because the controller also keeps a small amount of state per cgroup, it can't be replaced with simple cgroup membership tests. This patch implements cgroup_subsys->implicit_on_dfl flag. When set, the controller is implicitly enabled on all cgroups on the v2 hierarchy so that utility type controllers such as perf_event can be enabled and function transparently. An implicit controller doesn't show up in "cgroup.controllers" or "cgroup.subtree_control", is exempt from no internal process rule and can be stolen from the default hierarchy even if there are non-root csses. v2: Reimplemented on top of the recent updates to css handling and subsystem rebinding. Rebinding implicit subsystems is now a simple matter of exempting it from the busy subsystem check. 
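[ Editor's illustration: once the flag exists, a utility controller could opt in with something like the following. The foo_* callback names are placeholders; perf_event is the in-tree user this change targets. ]

	struct cgroup_subsys foo_cgrp_subsys = {
		.css_alloc	 = foo_css_alloc,	/* placeholder */
		.css_free	 = foo_css_free,	/* placeholder */
		.attach		 = foo_attach,		/* placeholder */
		/* implicitly enabled on every v2 cgroup; hidden from
		 * cgroup.controllers and cgroup.subtree_control, and exempt
		 * from the no-internal-process rule */
		.implicit_on_dfl = true,
	};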
Signed-off-by: Tejun Heo --- kernel/cgroup.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fbd3e99a4e98..e22df5d81e59 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -186,6 +186,9 @@ static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; +/* some controllers are implicitly enabled on the default hierarchy */ +static unsigned long cgrp_dfl_implicit_ss_mask; + /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -359,8 +362,8 @@ static u16 cgroup_control(struct cgroup *cgrp) return parent->subtree_control; if (cgroup_on_dfl(cgrp)) - root_ss_mask &= ~cgrp_dfl_inhibit_ss_mask; - + root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | + cgrp_dfl_implicit_ss_mask); return root_ss_mask; } @@ -1327,6 +1330,8 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) lockdep_assert_held(&cgroup_mutex); + cur_ss_mask |= cgrp_dfl_implicit_ss_mask; + while (true) { u16 new_ss_mask = cur_ss_mask; @@ -1512,8 +1517,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { - /* if @ss has non-root csses attached to it, can't move */ - if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) + /* + * If @ss has non-root csses attached to it, can't move. + * If @ss is an implicit controller, it is exempt from this + * rule and can be stolen. + */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && + !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ @@ -3039,6 +3049,18 @@ static void cgroup_restore_control(struct cgroup *cgrp) } } +static bool css_visible(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + if (cgroup_control(cgrp) & (1 << ss->id)) + return true; + if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) + return false; + return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; +} + /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree @@ -3074,7 +3096,7 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) return PTR_ERR(css); } - if (cgroup_control(dsct) & (1 << ss->id)) { + if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; @@ -3117,7 +3139,7 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); - } else if (!(cgroup_control(dsct) & (1 << ss->id))) { + } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); @@ -5455,7 +5477,9 @@ int __init cgroup_init(void) cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (!ss->dfl_cftypes) + if (ss->implicit_on_dfl) + cgrp_dfl_implicit_ss_mask |= 1 << ss->id; + else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { -- cgit v1.2.3 From b121d1e74d1f24654bdc3165d3db1ca149501356 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:13 -0800 Subject: bpf: prevent kprobe+bpf deadlocks if kprobe is placed within update or delete hash map helpers that hold bucket spin lock and triggered bpf program is trying to grab the spinlock for the same bucket on the same cpu, it will deadlock. Fix it by extending existing recursion prevention mechanism. 
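[ Editor's illustration: a hedged sketch of the existing per-cpu recursion guard being extended here. The function name is a placeholder; the real check sits on the kprobe/BPF invocation path in kernel/trace/bpf_trace.c and may differ in detail. ]

	static unsigned int run_bpf_prog_guarded(struct bpf_prog *prog, void *ctx)
	{
		unsigned int ret = 0;

		preempt_disable();
		if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
			/* re-entered on this cpu, e.g. a kprobe fired inside a map helper */
			goto out;
		}
		rcu_read_lock();
		ret = BPF_PROG_RUN(prog, ctx);
		rcu_read_unlock();
	out:
		__this_cpu_dec(bpf_prog_active);
		preempt_enable();
		return ret;
	}

The syscall-side map_update_elem()/map_delete_elem() changes below bump the same counter, so a program already running on this cpu cannot re-enter the map helpers and deadlock on the bucket lock.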
Note, map_lookup and other tracing helpers don't have this problem, since they don't hold any locks and don't modify global data. bpf_trace_printk has its own recursive check and ok as well. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 2 -- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c95a753c2007..dc99f6a000f5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include #include +DEFINE_PER_CPU(int, bpf_prog_active); + int sysctl_unprivileged_bpf_disabled __read_mostly; static LIST_HEAD(bpf_map_types); @@ -347,6 +349,11 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { @@ -356,6 +363,8 @@ static int map_update_elem(union bpf_attr *attr) err = map->ops->map_update_elem(map, key, value, attr->flags); rcu_read_unlock(); } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_value: kfree(value); @@ -394,9 +403,13 @@ static int map_delete_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + preempt_disable(); + __this_cpu_inc(bpf_prog_active); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_key: kfree(key); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4b8caa392b86..3e4ffb3ace5f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,8 +13,6 @@ #include #include "trace.h" -static DEFINE_PER_CPU(int, bpf_prog_active); - /** * trace_call_bpf - invoke BPF program * @prog: BPF program -- cgit v1.2.3 From e19494edab82f55a633911f25094581891bdc351 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:14 -0800 Subject: bpf: introduce percpu_freelist Introduce simple percpu_freelist to keep single list of elements spread across per-cpu singly linked lists. /* push element into the list */ void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); /* pop element from the list */ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); The object is pushed to the current cpu list. Pop first trying to get the object from the current cpu list, if it's empty goes to the neigbour cpu list. For bpf program usage pattern the collision rate is very low, since programs push and pop the objects typically on the same cpu. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/Makefile | 2 +- kernel/bpf/percpu_freelist.c | 100 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/percpu_freelist.h | 31 ++++++++++++++ 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/percpu_freelist.c create mode 100644 kernel/bpf/percpu_freelist.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8a932d079c24..eed911d091da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c new file mode 100644 index 000000000000..5c51d1985b51 --- /dev/null +++ b/kernel/bpf/percpu_freelist.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include "percpu_freelist.h" + +int pcpu_freelist_init(struct pcpu_freelist *s) +{ + int cpu; + + s->freelist = alloc_percpu(struct pcpu_freelist_head); + if (!s->freelist) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); + + raw_spin_lock_init(&head->lock); + head->first = NULL; + } + return 0; +} + +void pcpu_freelist_destroy(struct pcpu_freelist *s) +{ + free_percpu(s->freelist); +} + +static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + raw_spin_lock(&head->lock); + node->next = head->first; + head->first = node; + raw_spin_unlock(&head->lock); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); + + __pcpu_freelist_push(head, node); +} + +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems) +{ + struct pcpu_freelist_head *head; + unsigned long flags; + int i, cpu, pcpu_entries; + + pcpu_entries = nr_elems / num_possible_cpus() + 1; + i = 0; + + /* disable irq to workaround lockdep false positive + * in bpf usage pcpu_freelist_populate() will never race + * with pcpu_freelist_push() + */ + local_irq_save(flags); + for_each_possible_cpu(cpu) { +again: + head = per_cpu_ptr(s->freelist, cpu); + __pcpu_freelist_push(head, buf); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } + local_irq_restore(flags); +} + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + raw_spin_lock(&head->lock); + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + return NULL; + } +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h new file mode 100644 index 000000000000..3049aae8ea1e --- /dev/null +++ b/kernel/bpf/percpu_freelist.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 Facebook + 
* + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __PERCPU_FREELIST_H__ +#define __PERCPU_FREELIST_H__ +#include +#include + +struct pcpu_freelist_head { + struct pcpu_freelist_node *first; + raw_spinlock_t lock; +}; + +struct pcpu_freelist { + struct pcpu_freelist_head __percpu *freelist; +}; + +struct pcpu_freelist_node { + struct pcpu_freelist_node *next; +}; + +void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems); +int pcpu_freelist_init(struct pcpu_freelist *); +void pcpu_freelist_destroy(struct pcpu_freelist *s); +#endif -- cgit v1.2.3 From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:15 -0800 Subject: bpf: pre-allocate hash map elements If kprobe is placed on spin_unlock then calling kmalloc/kfree from bpf programs is not safe, since the following dead lock is possible: kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe-> bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock) and deadlocks. The following solutions were considered and some implemented, but eventually discarded - kmem_cache_create for every map - add recursion check to slow-path of slub - use reserved memory in bpf_map_update for in_irq or in preempt_disabled - kmalloc via irq_work At the end pre-allocation of all map elements turned out to be the simplest solution and since the user is charged upfront for all the memory, such pre-allocation doesn't affect the user space visible behavior. Since it's impossible to tell whether kprobe is triggered in a safe location from kmalloc point of view, use pre-allocation by default and introduce new BPF_F_NO_PREALLOC flag. While testing of per-cpu hash maps it was discovered that alloc_percpu(GFP_ATOMIC) has odd corner cases and often fails to allocate memory even when 90% of it is free. The pre-allocation of per-cpu hash elements solves this problem as well. Turned out that bpf_map_update() quickly followed by bpf_map_lookup()+bpf_map_delete() is very common pattern used in many of iovisor/bcc/tools, so there is additional benefit of pre-allocation, since such use cases are must faster. Since all hash map elements are now pre-allocated we can remove atomic increment of htab->count and save few more cycles. Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid large malloc/free done by users who don't have sufficient limits. Pre-allocation is done with vmalloc and alloc/free is done via percpu_freelist. Here are performance numbers for different pre-allocation algorithms that were implemented, but discarded in favor of percpu_freelist: 1 cpu: pcpu_ida 2.1M pcpu_ida nolock 2.3M bt 2.4M kmalloc 1.8M hlist+spinlock 2.3M pcpu_freelist 2.6M 4 cpu: pcpu_ida 1.5M pcpu_ida nolock 1.8M bt w/smp_align 1.7M bt no/smp_align 1.1M kmalloc 0.7M hlist+spinlock 0.2M pcpu_freelist 2.0M 8 cpu: pcpu_ida 0.7M bt w/smp_align 0.8M kmalloc 0.4M pcpu_freelist 1.5M 32 cpu: kmalloc 0.13M pcpu_freelist 0.49M pcpu_ida nolock is a modified percpu_ida algorithm without percpu_ida_cpu locks and without cross-cpu tag stealing. It's faster than existing percpu_ida, but not as fast as pcpu_freelist. 
bt is a variant of block/blk-mq-tag.c simplified and customized for bpf use case. bt w/smp_align is using cache line for every 'long' (similar to blk-mq-tag). bt no/smp_align allocates 'long' bitmasks continuously to save memory. It's comparable to percpu_ida and in some cases faster, but slower than percpu_freelist. hlist+spinlock is the simplest free list with a single spinlock. As expected it has very bad scaling in SMP. kmalloc is the existing implementation which is still available via BPF_F_NO_PREALLOC flag. It's significantly slower in single cpu and in 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist, but saves memory, so in cases where map->max_entries can be large and number of map update/delete per second is low, it may make sense to use it. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 240 +++++++++++++++++++++++++++++++++++---------------- kernel/bpf/syscall.c | 15 +++- 2 files changed, 181 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index a68e95133fcd..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,6 +14,7 @@ #include #include #include +#include "percpu_freelist.h" struct bucket { struct hlist_head head; @@ -22,6 +24,8 @@ struct bucket { struct bpf_htab { struct bpf_map map; struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -29,15 +33,86 @@ struct bpf_htab { /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; - struct rcu_head rcu; union { - u32 hash; - u32 key_size; + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; }; + struct rcu_head rcu; + u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, 
pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { @@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (percpu) htab->elem_size += sizeof(void *); else - htab->elem_size += htab->map.value_size; + htab->elem_size += round_up(htab->map.value_size, 8); /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || @@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); @@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - atomic_set(&htab->count, 0); + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -249,42 +340,42 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } - -static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, - void __percpu *pptr) -{ - *(void __percpu **)(l->key + key_size) = pptr; -} - -static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) -{ - return *(void __percpu **)(l->key + key_size); -} - -static void htab_percpu_elem_free(struct htab_elem *l) +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { - free_percpu(htab_elem_get_ptr(l, l->key_size)); + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); kfree(l); + } -static void htab_percpu_elem_free_rcu(struct rcu_head *head) +static void htab_elem_free_rcu(struct rcu_head *head) { struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; - htab_percpu_elem_free(l); + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); } -static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) +static void free_htab_elem(struct 
bpf_htab *htab, struct htab_elem *l) { - if (percpu) { - l->key_size = key_size; - call_rcu(&l->rcu, htab_percpu_elem_free_rcu); + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); } else { - kfree_rcu(l, rcu); + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); } } @@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, bool percpu, bool onallcpus) { u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); struct htab_elem *l_new; void __percpu *pptr; - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); - if (!l_new) - return NULL; + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } memcpy(l_new->key, key, key_size); if (percpu) { /* round up value_size to 8 bytes */ size = round_up(size, 8); - /* alloc_percpu zero-fills */ - pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN); - if (!pptr) { - kfree(l_new); - return NULL; + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } } if (!onallcpus) { @@ -324,7 +431,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, off += size; } } - htab_elem_set_ptr(l_new, key_size, pptr); + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); } else { memcpy(l_new->key + round_up(key_size, 8), value, size); } @@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries)) - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - return -E2BIG; - if (l_old && map_flags == BPF_NOEXIST) /* elem already exists */ return -EEXIST; @@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hash = htab_map_hash(key, key_size); - /* allocate new element outside of the lock, since - * we're most likley going to insert it - */ - l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); - if (!l_new) - return -ENOMEM; - b = __select_bucket(htab, hash); head = &b->head; @@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); + goto err; + } + /* add new element to the head of the list, so that * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - atomic_inc(&htab->count); + free_htab_elem(htab, l_old); } - raw_spin_unlock_irqrestore(&b->lock, flags); - return 0; + ret = 0; err: raw_spin_unlock_irqrestore(&b->lock, flags); - kfree(l_new); return ret; } @@ -466,12 +564,11 @@ static int 
__htab_percpu_map_update_elem(struct bpf_map *map, void *key, } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus); - if (!l_new) { - ret = -ENOMEM; + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); goto err; } hlist_add_head_rcu(&l_new->hash_node, head); - atomic_inc(&htab->count); } ret = 0; err: @@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct hlist_head *head; struct bucket *b; struct htab_elem *l; @@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - free_htab_elem(l, percpu, key_size); + free_htab_elem(htab, l); ret = 0; } @@ -531,17 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - atomic_dec(&htab->count); - if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) { - l->key_size = htab->map.key_size; - htab_percpu_elem_free(l); - } else { - kfree(l); - } + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc99f6a000f5..cbd94b2144ff 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -48,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + static int bpf_map_charge_memlock(struct bpf_map *map) { struct user_struct *user = get_current_user(); @@ -153,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { -- cgit v1.2.3 From 823707b68d6e6c4b1be619b039c7045fef1740e6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:16 -0800 Subject: bpf: check for reserved flag bits in array and stack maps Suggested-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/stackmap.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd3bdf2486a7..76d5a794e426 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,7 +53,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0) + attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8a60ee14a977..f0a02c344358 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -35,6 +35,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); + if (attr->map_flags) + return ERR_PTR(-EINVAL); + /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8 || -- cgit v1.2.3 From 557c0c6e7df8e14a46bd7560d193fa5bbc00a858 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:17 -0800 Subject: bpf: convert stackmap to pre-allocation It was observed that calling bpf_get_stackid() from a kprobe inside slub or from spin_unlock causes a similar deadlock as with hashmap, therefore convert stackmap to use pre-allocated memory. The call_rcu mechanism is no longer feasible, since delayed freeing causes bpf_get_stackid() to fail unpredictably when the number of actual stacks is significantly less than user requested max_entries. Since elements are no longer freed into slub, we can push elements into freelist immediately and let them be recycled. However the very unlikely race between user space map_lookup() and program-side recycling is possible: cpu0 cpu1 ---- ---- user does lookup(stackidX) starts copying ips into buffer delete(stackidX) calls bpf_get_stackid() which recycles the element and overwrites with new stack trace To avoid user space seeing a partial stack trace consisting of two merged stack traces, do bucket = xchg(, NULL); copy; xchg(,bucket); to preserve consistent stack trace delivery to user space. Now we can move memset(,0) of left-over element value from critical path of bpf_get_stackid() into slow-path of user space lookup. Also disallow lookup() from bpf program, since it's useless and program shouldn't be messing with collected stack trace. Note that similar race between user space lookup and kernel side updates is also present in hashmap, but it's not a new race. bpf programs were always allowed to modify hash and array map elements while user space is copying them. Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/stackmap.c | 86 ++++++++++++++++++++++++++++++++++++++++----------- kernel/bpf/syscall.c | 2 ++ 2 files changed, 70 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f0a02c344358..499d9e933f8e 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -10,9 +10,10 @@ #include #include #include +#include "percpu_freelist.h" struct stack_map_bucket { - struct rcu_head rcu; + struct pcpu_freelist_node fnode; u32 hash; u32 nr; u64 ip[]; @@ -20,10 +21,34 @@ struct stack_map_bucket { struct bpf_stack_map { struct bpf_map map; + void *elems; + struct pcpu_freelist freelist; u32 n_buckets; - struct stack_map_bucket __rcu *buckets[]; + struct stack_map_bucket *buckets[]; }; +static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +{ + u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + int err; + + smap->elems = vzalloc(elem_size * smap->map.max_entries); + if (!smap->elems) + return -ENOMEM; + + err = pcpu_freelist_init(&smap->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, + smap->map.max_entries); + return 0; + +free_elems: + vfree(smap->elems); + return err; +} + /* Called from syscall */ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { @@ -70,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(smap->map.pages); + if (err) + goto free_smap; + err = get_callchain_buffers(); if (err) goto free_smap; + err = prealloc_elems_and_freelist(smap); + if (err) + goto put_buffers; + return &smap->map; +put_buffers: + put_callchain_buffers(); free_smap: kvfree(smap); return ERR_PTR(err); @@ -121,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) ips = trace->ip + skip + init_nr; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); - bucket = rcu_dereference(smap->buckets[id]); + bucket = READ_ONCE(smap->buckets[id]); if (bucket && bucket->hash == hash) { if (flags & BPF_F_FAST_STACK_CMP) @@ -135,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) if (bucket && !(flags & BPF_F_REUSE_STACKID)) return -EEXIST; - new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size, - GFP_ATOMIC | __GFP_NOWARN); + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; memcpy(new_bucket->ip, ips, trace_len); - memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len); new_bucket->hash = hash; new_bucket->nr = trace_nr; old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) - kfree_rcu(old_bucket, rcu); + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return id; } @@ -160,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; -/* Called from syscall or from eBPF program */ +/* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall */ +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct stack_map_bucket *bucket; - u32 id = *(u32 *)key; + struct stack_map_bucket *bucket, *old_bucket; + u32 id = *(u32 *)key, trace_len; if (unlikely(id >= 
smap->n_buckets)) - return NULL; - bucket = rcu_dereference(smap->buckets[id]); - return bucket ? bucket->ip : NULL; + return -ENOENT; + + bucket = xchg(&smap->buckets[id], NULL); + if (!bucket) + return -ENOENT; + + trace_len = bucket->nr * sizeof(u64); + memcpy(value, bucket->ip, trace_len); + memset(value + trace_len, 0, map->value_size - trace_len); + + old_bucket = xchg(&smap->buckets[id], bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; } static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -196,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key) old_bucket = xchg(&smap->buckets[id], NULL); if (old_bucket) { - kfree_rcu(old_bucket, rcu); + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } else { return -ENOENT; @@ -207,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key) static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - int i; + /* wait for bpf programs to complete before freeing stack map */ synchronize_rcu(); - for (i = 0; i < smap->n_buckets; i++) - if (smap->buckets[i]) - kfree_rcu(smap->buckets[i], rcu); + vfree(smap->elems); + pcpu_freelist_destroy(&smap->freelist); kvfree(smap); put_callchain_buffers(); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cbd94b2144ff..2978d0d08869 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -290,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); -- cgit v1.2.3 From e237a5518425155faa508a087f28269f58074b92 Mon Sep 17 00:00:00 2001 From: Chen Fan Date: Mon, 15 Feb 2016 12:52:01 +0800 Subject: x86/ACPI/PCI: Recognize that Interrupt Line 255 means "not connected" Per the x86-specific footnote to PCI spec r3.0, sec 6.2.4, the value 255 in the Interrupt Line register means "unknown" or "no connection." Previously, when we couldn't derive an IRQ from the _PRT, we fell back to using the value from Interrupt Line as an IRQ. It's questionable whether we should do that at all, but the spec clearly suggests we shouldn't do it for the value 255 on x86. Calling request_irq() with IRQ 255 may succeed, but the driver won't receive any interrupts. Or, if IRQ 255 is shared with another device, it may succeed, and the driver's ISR will be called at random times when the *other* device interrupts. Or it may fail if another device is using IRQ 255 with incompatible flags. What we *want* is for request_irq() to fail predictably so the driver can fall back to polling. On x86, assume 255 in the Interrupt Line means the INTx line is not connected. In that case, set dev->irq to IRQ_NOTCONNECTED so request_irq() will fail gracefully with -ENOTCONN. We found this problem on a system where Secure Boot firmware assigned Interrupt Line 255 to an i801_smbus device and another device was already using MSI-X IRQ 255. This was in v3.10, where i801_probe() fails if request_irq() fails: i801_smbus 0000:00:1f.3: enabling device (0140 -> 0143) i801_smbus 0000:00:1f.3: can't derive routing for PCI INT C i801_smbus 0000:00:1f.3: PCI INT C: no GSI genirq: Flags mismatch irq 255. 00000080 (i801_smbus) vs. 
00000000 (megasa) CPU: 0 PID: 2487 Comm: kworker/0:1 Not tainted 3.10.0-229.el7.x86_64 #1 Hardware name: FUJITSU PRIMEQUEST 2800E2/D3736, BIOS PRIMEQUEST 2000 Serie5 Call Trace: dump_stack+0x19/0x1b __setup_irq+0x54a/0x570 request_threaded_irq+0xcc/0x170 i801_probe+0x32f/0x508 [i2c_i801] local_pci_probe+0x45/0xa0 i801_smbus 0000:00:1f.3: Failed to allocate irq 255: -16 i801_smbus: probe of 0000:00:1f.3 failed with error -16 After aeb8a3d16ae0 ("i2c: i801: Check if interrupts are disabled"), i801_probe() will fall back to polling if request_irq() fails. But we still need this patch because request_irq() may succeed or fail depending on other devices in the system. If request_irq() fails, i801_smbus will work by falling back to polling, but if it succeeds, i801_smbus won't work because it expects interrupts that it may not receive. Signed-off-by: Chen Fan Acked-by: Thomas Gleixner Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- kernel/irq/manage.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 841187239adc..e79e60f50bce 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1609,6 +1609,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out @@ -1699,9 +1702,13 @@ EXPORT_SYMBOL(request_threaded_irq); int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; int ret; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + desc = irq_to_desc(irq); if (!desc) return -EINVAL; -- cgit v1.2.3 From 34e2c555f3e13c90e9284e23d00f03be8a6e06c5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 15 Feb 2016 20:20:42 +0100 Subject: cpufreq: Add mechanism for registering utilization update callbacks Introduce a mechanism by which parts of the cpufreq subsystem ("setpolicy" drivers or the core) can register callbacks to be executed from cpufreq_update_util() which is invoked by the scheduler's update_load_avg() on CPU utilization changes. This allows the "setpolicy" drivers to dispense with their timers and do all of the computations they need and frequency/voltage adjustments in the update_load_avg() code path, among other things. The update_load_avg() changes were suggested by Peter Zijlstra. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar --- kernel/sched/deadline.c | 4 ++++ kernel/sched/fair.c | 26 +++++++++++++++++++++++++- kernel/sched/rt.c | 4 ++++ kernel/sched/sched.h | 1 + 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c979d0e1..21a0aa6f810d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -726,6 +726,10 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). 
*/ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b83947..e2987a7e489d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2824,7 +2824,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int cpu = cpu_of(rq_of(cfs_rq)); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Track task load average for carrying it to new CPU after migrated, and @@ -2836,6 +2837,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..27f5b03cbdbe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -945,6 +945,10 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f16374df7f..f042190c8002 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "cpupri.h" #include "cpudeadline.h" -- cgit v1.2.3 From 4e0d8f7eff3fbfa3e3ac5782669c078f590dc9e2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 9 Mar 2016 12:47:03 -0700 Subject: resource: Change __request_region to inherit from immediate parent __request_region() sets 'flags' of a new resource from @parent as it inherits the parent's attribute. When a target resource has a conflict, this function inserts the new resource entry under the conflicted entry by updating @parent. In this case, the new resource entry needs to inherit attribute from the updated parent. This conflict is a typical case since __request_region() is used to allocate a new resource from a specific resource range. For instance, request_mem_region() calls __request_region() with @parent set to &iomem_resource, which is the root entry of the whole iomem range. When this request results in inserting a new entry "DEV-A" under "BUS-1", "DEV-A" needs to inherit from the immediate parent "BUS-1" as it holds specific attribute for the range. 
root (&iomem_resource) : + "BUS-1" + "DEV-A" Change __request_region() to set 'flags' and 'desc' of a new entry from the immediate parent. Signed-off-by: Toshi Kani Cc: Ingo Molnar Cc: Borislav Petkov Cc: Andrew Morton Cc: Dan Williams Signed-off-by: Dan Williams --- kernel/resource.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4d466052426b..5a56e8f24058 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1085,15 +1085,16 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = resource_type(parent) | resource_ext_type(parent); - res->flags |= IORESOURCE_BUSY | flags; - res->desc = IORES_DESC_NONE; write_lock(&resource_lock); for (;;) { struct resource *conflict; + res->flags = resource_type(parent) | resource_ext_type(parent); + res->flags |= IORESOURCE_BUSY | flags; + res->desc = parent->desc; + conflict = __request_resource(parent, res); if (!conflict) break; -- cgit v1.2.3 From ff3cc952d3f009e6c376cc40651b87187ce364a6 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 9 Mar 2016 12:47:04 -0700 Subject: resource: Add remove_resource interface insert_resource() and insert_resource_conflict() are called by resource producers to insert a new resource. When there is any conflict, they move conflicting resources down to the children of the new resource. There is no destructor of these interfaces, however. Add remove_resource(), which removes a resource previously inserted by insert_resource() or insert_resource_conflict(), and moves the children up to where they were before. __release_resource() is changed to have @release_child, so that this function can be used for remove_resource() as well. Also add comments to clarify that these functions are intended for producers of resources to avoid any confusion with request/release_resource() for consumers. 
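To make the producer-side pairing concrete, the following is a minimal sketch of how a hypothetical firmware or bus driver might use the two interfaces; it is not part of this patch, and the resource name and address range are invented for illustration:

#include <linux/ioport.h>

/* hypothetical producer-owned window; the range below is made up */
static struct resource example_fw_window = {
	.name  = "example-fw-window",
	.start = 0xfed40000,
	.end   = 0xfed44fff,
	.flags = IORESOURCE_MEM,
};

static int example_fw_publish(void)
{
	/* may move conflicting entries down as children of the new resource */
	return insert_resource(&iomem_resource, &example_fw_window);
}

static int example_fw_unpublish(void)
{
	/* undoes the insertion and moves any children back up */
	return remove_resource(&example_fw_window);
}

A consumer that only claims a range for a single device would keep using request_resource()/release_resource() instead.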
Signed-off-by: Toshi Kani Cc: Ingo Molnar Cc: Borislav Petkov Cc: Andrew Morton Cc: Dan Williams Signed-off-by: Dan Williams --- kernel/resource.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 5a56e8f24058..effb6ee2c3e8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -233,9 +233,9 @@ static struct resource * __request_resource(struct resource *root, struct resour } } -static int __release_resource(struct resource *old) +static int __release_resource(struct resource *old, bool release_child) { - struct resource *tmp, **p; + struct resource *tmp, **p, *chd; p = &old->parent->child; for (;;) { @@ -243,7 +243,17 @@ static int __release_resource(struct resource *old) if (!tmp) break; if (tmp == old) { - *p = tmp->sibling; + if (release_child || !(tmp->child)) { + *p = tmp->sibling; + } else { + for (chd = tmp->child;; chd = chd->sibling) { + chd->parent = tmp->parent; + if (!(chd->sibling)) + break; + } + *p = tmp->child; + chd->sibling = tmp->sibling; + } old->parent = NULL; return 0; } @@ -325,7 +335,7 @@ int release_resource(struct resource *old) int retval; write_lock(&resource_lock); - retval = __release_resource(old); + retval = __release_resource(old, true); write_unlock(&resource_lock); return retval; } @@ -679,7 +689,7 @@ static int reallocate_resource(struct resource *root, struct resource *old, old->start = new.start; old->end = new.end; } else { - __release_resource(old); + __release_resource(old, true); *old = new; conflict = __request_resource(root, old); BUG_ON(conflict); @@ -825,6 +835,9 @@ static struct resource * __insert_resource(struct resource *parent, struct resou * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { @@ -842,6 +855,9 @@ struct resource *insert_resource_conflict(struct resource *parent, struct resour * @new: new resource to insert * * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ int insert_resource(struct resource *parent, struct resource *new) { @@ -885,6 +901,31 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } +/** + * remove_resource - Remove a resource in the resource tree + * @old: resource to remove + * + * Returns 0 on success, -EINVAL if the resource is not valid. + * + * This function removes a resource previously inserted by insert_resource() + * or insert_resource_conflict(), and moves the children (if any) up to + * where they were before. insert_resource() and insert_resource_conflict() + * insert a new resource, and move any conflicting resources down to the + * children of the new resource. + * + * insert_resource(), insert_resource_conflict() and remove_resource() are + * intended for producers of resources, such as FW modules and bus drivers. 
+ */ +int remove_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old, false); + write_unlock(&resource_lock); + return retval; +} + static int __adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { -- cgit v1.2.3 From 8095d0f225fe31eaac4a013177b77ed5283278f8 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 9 Mar 2016 12:47:05 -0700 Subject: resource: Export insert_resource and remove_resource insert_resource() and remove_resource() are called by producers of resources, such as FW modules and bus drivers. These modules may be implemented as loadable modules. Export insert_resource() and remove_resource() so that they can be called from such modules. link: https://lkml.org/lkml/2016/3/8/872 Signed-off-by: Toshi Kani Cc: Linus Torvalds Cc: Ingo Molnar Cc: Borislav Petkov Cc: Andrew Morton Cc: Dan Williams Signed-off-by: Dan Williams --- kernel/resource.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index effb6ee2c3e8..2e78ead30934 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -866,6 +866,7 @@ int insert_resource(struct resource *parent, struct resource *new) conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } +EXPORT_SYMBOL_GPL(insert_resource); /** * insert_resource_expand_to_fit - Insert a resource into the resource tree @@ -925,6 +926,7 @@ int remove_resource(struct resource *old) write_unlock(&resource_lock); return retval; } +EXPORT_SYMBOL_GPL(remove_resource); static int __adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) -- cgit v1.2.3 From f995b5f720a72dfe7a1b33a43f2841b4e72d53b7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 9 Mar 2016 15:20:59 +0100 Subject: livepatch: Fix the error message about unresolvable ambiguity klp_find_callback() stops the search when sympos is not defined and a second symbol of the same name is found. It means that the current error message about the unresolvable ambiguity always prints "(2 matches)". Let's remove this information. The total number of occurrences is not very helpful. The author of the patch still must put a non-trivial effort into searching the right position in the object file. [jkosina@suse.cz: fixed grammar as suggested by Josh] Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Chris J Arges Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc2c85c064c1..780f00cdb7e5 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -190,8 +190,8 @@ static int klp_find_object_symbol(const char *objname, const char *name, if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); else if (args.count > 1 && sympos == 0) { - pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", - args.count, name, objname); + pr_err("unresolvable ambiguity for symbol '%s' in object '%s'\n", + name, objname); } else if (sympos != args.count && sympos > 0) { pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", sympos, name, objname ? 
objname : "vmlinux"); -- cgit v1.2.3 From b8cdc05173f05d212627b7aba7ec47fa334a79f2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 9 Mar 2016 18:56:49 -0800 Subject: bpf: bpf_stackmap_copy depends on CONFIG_PERF_EVENTS 0-day bot reported build error: kernel/built-in.o: In function `map_lookup_elem': >> kernel/bpf/.tmp_syscall.o:(.text+0x329b3c): undefined reference to `bpf_stackmap_copy' when CONFIG_BPF_SYSCALL is set and CONFIG_PERF_EVENTS is not. Add a weak definition to resolve it. This code path in map_lookup_elem() is never taken when CONFIG_PERF_EVENTS is not set. Fixes: 557c0c6e7df8 ("bpf: convert stackmap to pre-allocation") Reported-by: Fengguang Wu Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2978d0d08869..2a2efe1bc76c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -244,6 +244,11 @@ static void __user *u64_to_ptr(__u64 val) return (void __user *) (unsigned long) val; } +int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + return -ENOTSUPP; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value -- cgit v1.2.3 From cdc4e47da8f4c32eeb6b2061a8a834f4362a12b7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 9 Mar 2016 20:02:33 -0800 Subject: bpf: avoid copying junk bytes in bpf_get_current_comm() Lots of places in the kernel use memcpy(buf, comm, TASK_COMM_LEN); but the result is typically passed to print("%s", buf) and extra bytes after zero don't cause any harm. In bpf the result of bpf_get_current_comm() is used as part of the map key and was causing spurious hash map mismatches. Use strlcpy() to guarantee a zero-terminated string. bpf verifier checks that output buffer is zero-initialized, so even for short task names the output buffer doesn't have junk bytes. Note it's not a security concern, since kprobe+bpf is root only. Fixes: ffeedafbf023 ("bpf: introduce current->pid, tgid, uid, gid, comm accessors") Reported-by: Tobias Waldekranz Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4504ca66118d..50da680c479f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -166,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) if (!task) return -EINVAL; - memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); return 0; } -- cgit v1.2.3 From adaf9fcd136970e480d7ca834c0cf25ce922ea74 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Mar 2016 20:44:47 +0100 Subject: cpufreq: Move scheduler-related code to the sched directory Create cpufreq.c under kernel/sched/ and move the cpufreq code related to the scheduler to that file and to sched.h. Redefine cpufreq_update_util() as a static inline function to avoid function calls at its call sites in the scheduler code (as suggested by Peter Zijlstra). Also move the definition of struct update_util_data and declaration of cpufreq_set_update_util_data() from include/linux/cpufreq.h to include/linux/sched.h. Signed-off-by: Rafael J. 
Wysocki Acked-by: Peter Zijlstra (Intel) --- kernel/sched/Makefile | 1 + kernel/sched/cpufreq.c | 37 +++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/cpufreq.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..9507522164ac 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..928c4ba32f68 --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + if (WARN_ON(data && !data->func)) + return; + + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f042190c8002..faf7e2758dd0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -9,7 +9,6 @@ #include #include #include -#include #include "cpupri.h" #include "cpudeadline.h" @@ -1739,3 +1738,51 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. 
+ * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ -- cgit v1.2.3 From 22aceb317678057dced5f1d6e3ac15acdb863e7b Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Thu, 10 Mar 2016 12:07:38 +0100 Subject: workqueue: Fix comment for work_on_cpu() Function is processed in thread context, not in user context. Cc: Tejun Heo Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16f4986205e9..c3692d9eda55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4695,7 +4695,7 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in user context on a particular cpu + * work_on_cpu - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg -- cgit v1.2.3 From 25528213fe9f75f4e286f08d35a73ca2bb634a50 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Mar 2016 14:52:49 -0700 Subject: tags: Fix DEFINE_PER_CPU expansions $ make tags GEN tags ctags: Warning: drivers/acpi/processor_idle.c:64: null expansion of name pattern "\1" ctags: Warning: drivers/xen/events/events_2l.c:41: null expansion of name pattern "\1" ctags: Warning: kernel/locking/lockdep.c:151: null expansion of name pattern "\1" ctags: Warning: kernel/rcu/rcutorture.c:133: null expansion of name pattern "\1" ctags: Warning: kernel/rcu/rcutorture.c:135: null expansion of name pattern "\1" ctags: Warning: kernel/workqueue.c:323: null expansion of name pattern "\1" ctags: Warning: net/ipv4/syncookies.c:53: null expansion of name pattern "\1" ctags: Warning: net/ipv6/syncookies.c:44: null expansion of name pattern "\1" ctags: Warning: net/rds/page.c:45: null expansion of name pattern "\1" Which are all the result of the DEFINE_PER_CPU pattern: scripts/tags.sh:200: '/\ Acked-by: David S. Miller Acked-by: Rafael J. Wysocki Cc: Tejun Heo Cc: "Paul E. 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/lockdep.c | 3 +-- kernel/rcu/rcutorture.c | 6 ++---- kernel/workqueue.c | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f894a2cd9b2a..53ab2f85d77e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -148,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], - cpu_lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats); static inline u64 lockstat_clock(void) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2988d047d66..4d5cc6aa7e1e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ff5dc7d2ac5..16e13d8628a3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -320,8 +320,7 @@ static bool wq_debug_force_rr_cpu = false; module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ -- cgit v1.2.3 From 07061aab2f750bbf61337b922aa8a245b5da85e1 Mon Sep 17 00:00:00 2001 From: Andreas Ziegler Date: Tue, 15 Mar 2016 14:55:33 -0700 Subject: mm: fix two typos in comments for to_vmem_altmap() Commit 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment vmemmap_populate()"), introduced the to_vmem_altmap() function. The comments in this function contain two typos (one misspelling of the Kconfig option CONFIG_SPARSEMEM_VMEMMAP, and one missing letter 'n'), let's fix them up. Signed-off-by: Andreas Ziegler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index fb9b88787ebc..584febd13e2e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -391,7 +391,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) /* * 'memmap_start' is the virtual address for the first "struct * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple * pointer arithmetic, so we can perform this to_vmem_altmap() * conversion without concern for the initialization state of * the struct page fields. 
@@ -400,7 +400,7 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) struct dev_pagemap *pgmap; /* - * Uncoditionally retrieve a dev_pagemap associated with the + * Unconditionally retrieve a dev_pagemap associated with the * given physical address, this is only for use in the * arch_{add|remove}_memory() for setting up and tearing down * the memmap. -- cgit v1.2.3 From 1414c7f4f7d72d138fff35f00151d15749b5beda Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 15 Mar 2016 14:56:30 -0700 Subject: mm/page_poisoning.c: allow for zero poisoning By default, page poisoning uses a poison value (0xaa) on free. If this is changed to 0, the page is not only sanitized but zeroing on alloc with __GFP_ZERO can be skipped as well. The tradeoff is that detecting corruption from the poisoning is harder to detect. This feature also cannot be used with hibernation since pages are not guaranteed to be zeroed after hibernation. Credit to Grsecurity/PaX team for inspiring this work Signed-off-by: Laura Abbott Acked-by: Rafael J. Wysocki Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Michal Hocko Cc: Kees Cook Cc: Mathias Krause Cc: Dave Hansen Cc: Jianyu Zhan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/hibernate.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..aa0f26b58426 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1158,6 +1158,22 @@ static int __init kaslr_nohibernate_setup(char *str) return nohibernate_setup(str); } +static int __init page_poison_nohibernate_setup(char *str) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + /* + * The zeroing option for page poison skips the checks on alloc. + * since hibernation doesn't save free pages there's no way to + * guarantee the pages will still be zeroed. + */ + if (!strcmp(str, "on")) { + pr_info("Disabling hibernation due to page poisoning\n"); + return nohibernate_setup(str); + } +#endif + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1166,3 +1182,4 @@ __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); __setup("kaslr", kaslr_nohibernate_setup); +__setup("page_poison=", page_poison_nohibernate_setup); -- cgit v1.2.3 From 2213e9a66bb87d8344a1256b4ef568220d9587fb Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 15 Mar 2016 14:58:19 -0700 Subject: kallsyms: add support for relative offsets in kallsyms address table Similar to how relative extables are implemented, it is possible to emit the kallsyms table in such a way that it contains offsets relative to some anchor point in the kernel image rather than absolute addresses. On 64-bit architectures, it cuts the size of the kallsyms address table in half, since offsets between kernel symbols can typically be expressed in 32 bits. This saves several hundreds of kilobytes of permanent .rodata on average. In addition, the kallsyms address table is no longer subject to dynamic relocation when CONFIG_RELOCATABLE is in effect, so the relocation work done after decompression now doesn't have to do relocation updates for all these values. This saves up to 24 bytes (i.e., the size of a ELF64 RELA relocation table entry) per value, which easily adds up to a couple of megabytes of uncompressed __init data on ppc64 or arm64. 
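For a rough sense of scale, using an assumed symbol count rather than a figure from this changelog: a kernel with on the order of 100,000 kallsyms entries would otherwise carry about 100,000 x 24 bytes = ~2.4 MB of RELA records for the address table alone, which lines up with the __init data savings quoted below.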
Even if these relocation entries typically compress well, the combined size reduction of 2.8 MB uncompressed for a ppc64_defconfig build (of which 2.4 MB is __init data) results in a ~500 KB space saving in the compressed image. Since it is useful for some architectures (like x86) to retain the ability to emit absolute values as well, this patch also adds support for capturing both absolute and relative values when KALLSYMS_ABSOLUTE_PERCPU is in effect, by emitting absolute per-cpu addresses as positive 32-bit values, and addresses relative to the lowest encountered relative symbol as negative values, which are subtracted from the runtime address of this base symbol to produce the actual address. Support for the above is enabled by default for all architectures except IA-64 and Tile-GX, whose symbols are too far apart to capture in this manner. Signed-off-by: Ard Biesheuvel Tested-by: Guenter Roeck Reviewed-by: Kees Cook Tested-by: Kees Cook Cc: Heiko Carstens Cc: Michael Ellerman Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Benjamin Herrenschmidt Cc: Michal Marek Cc: Rusty Russell Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5c5987f10819..fafd1a3ef0da 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -38,6 +38,7 @@ * during the second link stage. */ extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; extern const u8 kallsyms_names[] __weak; /* @@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak; extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); +extern const unsigned long kallsyms_relative_base +__attribute__((weak, section(".rodata"))); + extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; @@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos) return name - kallsyms_names; } +static unsigned long kallsyms_sym_address(int idx) +{ + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + return kallsyms_addresses[idx]; + + /* values are unsigned offsets if --absolute-percpu is not in effect */ + if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + + /* ...otherwise, positive offsets are absolute values */ + if (kallsyms_offsets[idx] >= 0) + return kallsyms_offsets[idx]; + + /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ + return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; +} + /* Lookup the address for this symbol. Returns 0 if not found. 
*/ unsigned long kallsyms_lookup_name(const char *name) { @@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name) off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) - return kallsyms_addresses[i]; + return kallsyms_sym_address(i); } return module_kallsyms_lookup_name(name); } @@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; } @@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long i, low, high, mid; /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + BUG_ON(!kallsyms_addresses); + else + BUG_ON(!kallsyms_offsets); /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; @@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr, while (high - low > 1) { mid = low + (high - low) / 2; - if (kallsyms_addresses[mid] <= addr) + if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; @@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr, * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ - while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; - symbol_start = kallsyms_addresses[low]; + symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > symbol_start) { - symbol_end = kallsyms_addresses[i]; + if (kallsyms_sym_address(i) > symbol_start) { + symbol_end = kallsyms_sym_address(i); break; } } @@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) unsigned off = iter->nameoff; iter->module_name[0] = '\0'; - iter->value = kallsyms_addresses[iter->pos]; + iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); -- cgit v1.2.3 From 2b021cbf3cb6208f0d40fd2f1869f237934340ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 Mar 2016 20:43:04 -0400 Subject: cgroup: ignore css_sets associated with dead cgroups during migration Before 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups"), all dead tasks were associated with init_css_set. If a zombie task is requested for migration, while migration prep operations would still be performed on init_css_set, the actual migration would ignore zombie tasks. As init_css_set is always valid, this worked fine. However, after 2e91fa7f6d45, zombie tasks stay with the css_set it was associated with at the time of death. Let's say a task T associated with cgroup A on hierarchy H-1 and cgroup B on hiearchy H-2. After T becomes a zombie, it would still remain associated with A and B. If A only contains zombie tasks, it can be removed. On removal, A gets marked offline but stays pinned until all zombies are drained. At this point, if migration is initiated on T to a cgroup C on hierarchy H-2, migration path would try to prepare T's css_set for migration and trigger the following. 
WARNING: CPU: 0 PID: 1576 at kernel/cgroup.c:474 cgroup_get+0x121/0x160() CPU: 0 PID: 1576 Comm: bash Not tainted 4.4.0-work+ #289 ... Call Trace: [] dump_stack+0x4e/0x82 [] warn_slowpath_common+0x78/0xb0 [] warn_slowpath_null+0x15/0x20 [] cgroup_get+0x121/0x160 [] link_css_set+0x7b/0x90 [] find_css_set+0x3bc/0x5e0 [] cgroup_migrate_prepare_dst+0x89/0x1f0 [] cgroup_attach_task+0x157/0x230 [] __cgroup_procs_write+0x2b7/0x470 [] cgroup_tasks_write+0xc/0x10 [] cgroup_file_write+0x30/0x1b0 [] kernfs_fop_write+0x13c/0x180 [] __vfs_write+0x23/0xe0 [] vfs_write+0xa4/0x1a0 [] SyS_write+0x44/0xa0 [] entry_SYSCALL_64_fastpath+0x12/0x6f It doesn't make sense to prepare migration for css_sets pointing to dead cgroups as they are guaranteed to contain only zombies which are ignored later during migration. This patch makes cgroup destruction path mark all affected css_sets as dead and updates the migration path to ignore them during preparation. Signed-off-by: Tejun Heo Fixes: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") Cc: stable@vger.kernel.org # v4.4+ --- kernel/cgroup.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e22df5d81e59..d57318950076 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2516,6 +2516,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); + /* + * If ->dead, @src_set is associated with one or more dead cgroups + * and doesn't contain any migratable tasks. Ignore it early so + * that the rest of migration path doesn't get confused by it. + */ + if (src_cset->dead) + return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); if (!list_empty(&src_cset->mg_preload_node)) @@ -5258,6 +5266,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; + struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); @@ -5278,11 +5287,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). + * Mark @cgrp and the associated csets dead. The former prevents + * further task migration and child creation by disabling + * cgroup_lock_live_group(). The latter makes the csets ignored by + * the migration path. 
*/ cgrp->self.flags &= ~CSS_ONLINE; + spin_lock_bh(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + link->cset->dead = true; + spin_unlock_bh(&css_set_lock); + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); -- cgit v1.2.3 From cfe02a8a973e7e5f66926b8ae38dfce404b19e29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 15 Mar 2016 00:21:06 +0100 Subject: cgroup: avoid false positive gcc-6 warning When all subsystems are disabled, gcc notices that cgroup_subsys_enabled_key is a zero-length array and that any access to it must be out of bounds: In file included from ../include/linux/cgroup.h:19:0, from ../kernel/cgroup.c:31: ../kernel/cgroup.c: In function 'cgroup_add_cftypes': ../kernel/cgroup.c:261:53: error: array subscript is above array bounds [-Werror=array-bounds] return static_key_enabled(cgroup_subsys_enabled_key[ssid]); ~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ ../include/linux/jump_label.h:271:40: note: in definition of macro 'static_key_enabled' static_key_count((struct static_key *)x) > 0; \ ^ We should never call the function in this particular case, so this is not a bug. In order to silence the warning, this adds an explicit check for the CGROUP_SUBSYS_COUNT==0 case. Signed-off-by: Arnd Bergmann Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d57318950076..3fe02c152799 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -246,6 +246,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ static bool cgroup_ssid_enabled(int ssid) { + if (CGROUP_SUBSYS_COUNT == 0) + return false; + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } -- cgit v1.2.3 From 4c973d1620ae08f5cbe27644c5f5b974c8f594ec Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 16 Mar 2016 20:55:38 -0400 Subject: modules: split part of complete_formation() into prepare_coming_module() Put all actions in complete_formation() that are performed after module->state is set to MODULE_STATE_COMING into a separate function prepare_coming_module(). This split prepares for the removal of the livepatch module notifiers in favor of hard-coding function calls to klp_module_{coming,going} in the module loader. The complete_formation -> prepare_coming_module split will also make error handling easier since we can jump to the appropriate error label to do any module GOING cleanup after all the COMING-actions have completed. 
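For orientation, a hand-written sketch (not part of the patch; example_module_cb and example_module_nb are made-up names) of the usual shape of a module_notify_list subscriber. Such a callback sees a MODULE_STATE_GOING notification when a load fails after COMING has gone out, which is the pairing the new coming_cleanup label in the diff below preserves:

#include <linux/module.h>
#include <linux/notifier.h>

static int example_module_cb(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	struct module *mod = data;

	switch (action) {
	case MODULE_STATE_COMING:
		/* set up any per-module state for 'mod' */
		break;
	case MODULE_STATE_GOING:
		/* undo it again (also runs when a load fails after COMING) */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_module_nb = {
	.notifier_call = example_module_cb,
};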
Signed-off-by: Jessica Yu Reviewed-by: Josh Poimboeuf Reviewed-by: Petr Mladek Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/module.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 794ebe8e878d..6dbfad415d51 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3392,9 +3392,6 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); - ftrace_module_enable(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); return 0; out: @@ -3402,6 +3399,14 @@ out: return err; } +static int prepare_coming_module(struct module *mod) +{ + ftrace_module_enable(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; +} + static int unknown_module_param_cb(char *param, char *val, const char *modname, void *arg) { @@ -3516,13 +3521,17 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto ddebug_cleanup; + err = prepare_coming_module(mod); + if (err) + goto bug_cleanup; + /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); - goto bug_cleanup; + goto coming_cleanup; } else if (after_dashes) { pr_warn("%s: parameters '%s' after `--' ignored\n", mod->name, after_dashes); @@ -3531,7 +3540,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto bug_cleanup; + goto coming_cleanup; /* Get rid of temporary copy. */ free_copy(info); @@ -3541,15 +3550,16 @@ static int load_module(struct load_info *info, const char __user *uargs, return do_init_module(mod); + coming_cleanup: + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + bug_cleanup: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - /* we can't deallocate the module until we clear memory protection */ module_disable_ro(mod); module_disable_nx(mod); -- cgit v1.2.3 From 7e545d6eca20ce8ef7f66a63146cbff82b2ba760 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 16 Mar 2016 20:55:39 -0400 Subject: livepatch/module: remove livepatch module notifier Remove the livepatch module notifier in favor of directly enabling and disabling patches to modules in the module loader. Hard-coding the function calls ensures that ftrace_module_enable() is run before klp_module_coming() during module load, and that klp_module_going() is run before ftrace_release_mod() during module unload. This way, ftrace and livepatch code is run in the correct order during the module load/unload sequence without dependence on the module notifier call chain. 
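Condensed from the diff that follows (a summary sketch, not a literal hunk), the resulting call order is:

/*
 * load_module()
 *   prepare_coming_module()
 *     ftrace_module_enable(mod);
 *     klp_module_coming(mod);        <- runs after ftrace is enabled
 *     blocking_notifier_call_chain(..., MODULE_STATE_COMING, mod);
 *
 * module unload (and failed init):
 *   blocking_notifier_call_chain(..., MODULE_STATE_GOING, mod);
 *   klp_module_going(mod);           <- runs before ftrace_release_mod()
 *   ftrace_release_mod(mod);
 */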
Signed-off-by: Jessica Yu Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 147 +++++++++++++++++++++++------------------------- kernel/module.c | 10 ++++ 2 files changed, 81 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 780f00cdb7e5..d68fbf63b083 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -99,12 +99,12 @@ static void klp_find_object_module(struct klp_object *obj) /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. The patches are removed by - * a going module handler instead. + * klp_module_going() instead. */ mod = find_module(obj->name); /* - * Do not mess work of the module coming and going notifiers. - * Note that the patch might still be needed before the going handler + * Do not mess work of klp_module_coming() and klp_module_going(). + * Note that the patch might still be needed before klp_module_going() * is called. Module functions can be called even in the GOING state * until mod->exit() finishes. This is especially important for * patches that modify semantic of the functions. @@ -866,103 +866,108 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); -static int klp_module_notify_coming(struct klp_patch *patch, - struct klp_object *obj) +int klp_module_coming(struct module *mod) { - struct module *pmod = patch->mod; - struct module *mod = obj->mod; int ret; + struct klp_patch *patch; + struct klp_object *obj; - ret = klp_init_object_loaded(patch, obj); - if (ret) { - pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; - } + if (WARN_ON(mod->state != MODULE_STATE_COMING)) + return -EINVAL; - if (patch->state == KLP_DISABLED) - return 0; + mutex_lock(&klp_mutex); + /* + * Each module has to know that klp_module_coming() + * has been called. We never know what module will + * get patched by a new patch. + */ + mod->klp_alive = true; - pr_notice("applying patch '%s' to loading module '%s'\n", - pmod->name, mod->name); + list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_object(patch, obj) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; - ret = klp_enable_object(obj); - if (ret) - pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; -} + obj->mod = mod; -static void klp_module_notify_going(struct klp_patch *patch, - struct klp_object *obj) -{ - struct module *pmod = patch->mod; - struct module *mod = obj->mod; + ret = klp_init_object_loaded(patch, obj); + if (ret) { + pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } - if (patch->state == KLP_DISABLED) - goto disabled; + if (patch->state == KLP_DISABLED) + break; + + pr_notice("applying patch '%s' to loading module '%s'\n", + patch->mod->name, obj->mod->name); + + ret = klp_enable_object(obj); + if (ret) { + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } + + break; + } + } - pr_notice("reverting patch '%s' on unloading module '%s'\n", - pmod->name, mod->name); + mutex_unlock(&klp_mutex); - klp_disable_object(obj); + return 0; -disabled: +err: + /* + * If a patch is unsuccessfully applied, return + * error to the module loader. 
+ */ + pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", + patch->mod->name, obj->mod->name, obj->mod->name); + mod->klp_alive = false; klp_free_object_loaded(obj); + mutex_unlock(&klp_mutex); + + return ret; } -static int klp_module_notify(struct notifier_block *nb, unsigned long action, - void *data) +void klp_module_going(struct module *mod) { - int ret; - struct module *mod = data; struct klp_patch *patch; struct klp_object *obj; - if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) - return 0; + if (WARN_ON(mod->state != MODULE_STATE_GOING && + mod->state != MODULE_STATE_COMING)) + return; mutex_lock(&klp_mutex); - /* - * Each module has to know that the notifier has been called. - * We never know what module will get patched by a new patch. + * Each module has to know that klp_module_going() + * has been called. We never know what module will + * get patched by a new patch. */ - if (action == MODULE_STATE_COMING) - mod->klp_alive = true; - else /* MODULE_STATE_GOING */ - mod->klp_alive = false; + mod->klp_alive = false; list_for_each_entry(patch, &klp_patches, list) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (action == MODULE_STATE_COMING) { - obj->mod = mod; - ret = klp_module_notify_coming(patch, obj); - if (ret) { - obj->mod = NULL; - pr_warn("patch '%s' is in an inconsistent state!\n", - patch->mod->name); - } - } else /* MODULE_STATE_GOING */ - klp_module_notify_going(patch, obj); + if (patch->state != KLP_DISABLED) { + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_disable_object(obj); + } + klp_free_object_loaded(obj); break; } } mutex_unlock(&klp_mutex); - - return 0; } -static struct notifier_block klp_module_nb = { - .notifier_call = klp_module_notify, - .priority = INT_MIN+1, /* called late but before ftrace notifier */ -}; - static int __init klp_init(void) { int ret; @@ -973,21 +978,11 @@ static int __init klp_init(void) return -EINVAL; } - ret = register_module_notifier(&klp_module_nb); - if (ret) - return ret; - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); - if (!klp_root_kobj) { - ret = -ENOMEM; - goto unregister; - } + if (!klp_root_kobj) + return -ENOMEM; return 0; - -unregister: - unregister_module_notifier(&klp_module_nb); - return ret; } module_init(klp_init); diff --git a/kernel/module.c b/kernel/module.c index 6dbfad415d51..4b65fbb10bdc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -984,6 +985,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); ftrace_release_mod(mod); async_synchronize_full(); @@ -3315,6 +3317,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); @@ -3401,7 +3404,13 @@ out: static int prepare_coming_module(struct module *mod) { + int err; + ftrace_module_enable(mod); + err = klp_module_coming(mod); + if (err) + return err; + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; @@ -3553,6 +3562,7 @@ static int load_module(struct load_info *info, const char __user *uargs, coming_cleanup: blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + 
klp_module_going(mod); bug_cleanup: /* module_bug_cleanup needs module_mutex protection */ -- cgit v1.2.3 From a1ee1932aa6bea0bb074f5e3ced112664e4637ed Mon Sep 17 00:00:00 2001 From: Joshua Hunt Date: Thu, 17 Mar 2016 14:17:23 -0700 Subject: watchdog: don't run proc_watchdog_update if new value is same as old While working on a script to restore all sysctl params before a series of tests I found that writing any value into the /proc/sys/kernel/{nmi_watchdog,soft_watchdog,watchdog,watchdog_thresh} causes them to call proc_watchdog_update(). NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. There doesn't appear to be a reason for doing this work every time a write occurs, so only do it when the values change. Signed-off-by: Josh Hunt Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Cc: [4.1.x+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b3ace6ebbba3..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -923,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * both lockup detectors are disabled if proc_watchdog_update() * returns an error. */ + if (old == new) + goto out; + err = proc_watchdog_update(); } out: @@ -967,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old; + int err, old, new; get_online_cpus(); mutex_lock(&watchdog_proc_mutex); @@ -987,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, /* * Update the sample period. Restore on failure. */ + new = ACCESS_ONCE(watchdog_thresh); + if (old == new) + goto out; + set_sample_period(); err = proc_watchdog_update(); if (err) { -- cgit v1.2.3 From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..accb7221d547 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,12 +164,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (page) + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + 1 << THREAD_SIZE_ORDER); + return page ? 
page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(ti); + + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + -(1 << THREAD_SIZE_ORDER)); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; -- cgit v1.2.3 From 795ae7a0de6b834a0cc202aa55c190ef81496665 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:19:14 -0700 Subject: mm: scale kswapd watermarks in proportion to memory In machines with 140G of memory and enterprise flash storage, we have seen read and write bursts routinely exceed the kswapd watermarks and cause thundering herds in direct reclaim. Unfortunately, the only way to tune kswapd aggressiveness is through adjusting min_free_kbytes - the system's emergency reserves - which is entirely unrelated to the system's latency requirements. In order to get kswapd to maintain a 250M buffer of free memory, the emergency reserves need to be set to 1G. That is a lot of memory wasted for no good reason. On the other hand, it's reasonable to assume that allocation bursts and overall allocation concurrency scale with memory capacity, so it makes sense to make kswapd aggressiveness a function of that as well. Change the kswapd watermark scale factor from the currently fixed 25% of the tunable emergency reserve to a tunable 0.1% of memory. Beyond 1G of memory, this will produce bigger watermark steps than the current formula in default settings. Ensure that the new formula never chooses steps smaller than that, i.e. 25% of the emergency reserve. On a 140G machine, this raises the default watermark steps - the distance between min and low, and low and high - from 16M to 143M. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f5102fabef7f..725587f10667 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,6 +126,7 @@ static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -1403,6 +1404,15 @@ static struct ctl_table vm_table[] = { .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = &one, + .extra2 = &one_thousand, + }, { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, -- cgit v1.2.3 From da8b44d5a9f8bf26da637b7336508ca534d6b319 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:51 -0700 Subject: timer: convert timer_slack_ns from unsigned long to u64 This patchset introduces a /proc//timerslack_ns interface which would allow controlling processes to be able to set the timerslack value on other processes in order to save power by avoiding wakeups (Something Android currently does via out-of-tree patches). The first patch tries to fix the internal timer_slack_ns usage which was defined as a long, which limits the slack range to ~4 seconds on 32bit systems. 
It converts it to a u64, which provides the same basically unlimited slack (500 years) on both 32bit and 64bit machines. The second patch introduces the /proc//timerslack_ns interface which allows the full 64bit slack range for a task to be read or set on both 32bit and 64bit machines. With these two patches, on a 32bit machine, after setting the slack on bash to 10 seconds: $ time sleep 1 real 0m10.747s user 0m0.001s sys 0m0.005s The first patch is a little ugly, since I had to chase the slack delta arguments through a number of functions converting them to u64s. Let me know if it makes sense to break that up more or not. Other than that things are fairly straightforward. This patch (of 2): The timer_slack_ns value in the task struct is currently a unsigned long. This means that on 32bit applications, the maximum slack is just over 4 seconds. However, on 64bit machines, its much much larger (~500 years). This disparity could make application development a little (as well as the default_slack) to a u64. This means both 32bit and 64bit systems have the same effective internal slack range. Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify the interface as a unsigned long, so we preserve that limitation on 32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is actually larger then what can be stored by an unsigned long. This patch also modifies hrtimer functions which specified the slack delta as a unsigned long. Signed-off-by: John Stultz Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Kees Cook Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 5 ++++- kernel/time/hrtimer.c | 8 ++++---- kernel/time/timer.c | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 78947de6f969..cf8ba545c7d3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa909f9fd559..58a321c34cfb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -979,7 +979,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -1548,7 +1548,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1724,7 +1724,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1792,7 +1792,7 @@ 
schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..d1798fa0c743 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1698,10 +1698,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } -- cgit v1.2.3 From a8199371afc27946d72f0d53e938e78d2ea0bae3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Mar 2016 14:21:20 -0700 Subject: printk: move can_use_console() out of console_trylock_for_printk() console_unlock() allows to cond_resched() if its caller has set `console_may_schedule' to 1 (this functionality is present since 8d91f8b15361 ("printk: do cond_resched() between lines while outputting to consoles"). The rules are: -- console_lock() always sets `console_may_schedule' to 1 -- console_trylock() always sets `console_may_schedule' to 0 printk() calls console_unlock() with preemption desabled, which basically can lead to RCU stalls, watchdog soft lockups, etc. if something is simultaneously calling printk() frequent enough (IOW, console_sem owner always has new data to send to console divers and can't leave console_unlock() for a long time). printk()->console_trylock() callers do not necessarily execute in atomic contexts, and some of them can cond_resched() in console_unlock(). console_trylock() can set `console_may_schedule' to 1 (allow cond_resched() later in consoe_unlock()) when it's safe. This patch (of 3): vprintk_emit() disables preemption around console_trylock_for_printk() and console_unlock() calls for a strong reason -- can_use_console() check. The thing is that vprintl_emit() can be called on a CPU that is not fully brought up yet (!cpu_online()), which potentially can cause problems if console driver wants to access per-cpu data. A console driver can explicitly state that it's safe to call it from !online cpu by setting CON_ANYTIME bit in console ->flags. That's why for !cpu_online() can_use_console() iterates all the console to find out if there is a CON_ANYTIME console, otherwise console_unlock() must be avoided. can_use_console() ensures that console_unlock() call is safe in vprintk_emit() only; console_lock() and console_trylock() are not covered by this check. Even though call_console_drivers(), invoked from console_cont_flush() and console_unlock(), tests `!cpu_online() && CON_ANYTIME' for_each_console(), it may be too late, which can result in messages loss. Assume that we have 2 cpus -- CPU0 is online, CPU1 is !online, and no CON_ANYTIME consoles available. CPU0 online CPU1 !online console_trylock() ... 
console_unlock() console_cont_flush spin_lock logbuf_lock if (!cont.len) { spin_unlock logbuf_lock return } for (;;) { vprintk_emit spin_lock logbuf_lock log_store spin_unlock logbuf_lock spin_lock logbuf_lock !console_trylock_for_printk msg_print_text return console_idx = log_next() console_seq++ console_prev = msg->flags spin_unlock logbuf_lock call_console_drivers() for_each_console(con) { if (!cpu_online() && !(con->flags & CON_ANYTIME)) continue; } /* * no message printed, we lost it */ vprintk_emit spin_lock logbuf_lock log_store spin_unlock logbuf_lock !console_trylock_for_printk return /* * go to the beginning of the loop, * find out there are new messages, * lose it */ } console_trylock()/console_lock() call on CPU1 may come from cpu notifiers registered on that CPU. Since notifiers are not getting unregistered when CPU is going DOWN, all of the notifiers receive notifications during CPU UP. For example, on my x86_64, I see around 50 notification sent from offline CPU to itself [swapper/2] from cpu:2 to:2 action:CPU_STARTING hotplug_hrtick [swapper/2] from cpu:2 to:2 action:CPU_STARTING blk_mq_main_cpu_notify [swapper/2] from cpu:2 to:2 action:CPU_STARTING blk_mq_queue_reinit_notify [swapper/2] from cpu:2 to:2 action:CPU_STARTING console_cpu_notify while doing echo 0 > /sys/devices/system/cpu/cpu2/online echo 1 > /sys/devices/system/cpu/cpu2/online So grabbing the console_sem lock while CPU is !online is possible, in theory. This patch moves can_use_console() check out of console_trylock_for_printk(). Instead it calls it in console_unlock(), so now console_lock()/console_unlock() are also 'protected' by can_use_console(). This also means that console_trylock_for_printk() is not really needed anymore and can be removed. Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Cc: Jan Kara Cc: Tejun Heo Cc: Kyle McMartin Cc: Dave Jones Cc: Calvin Owens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 97 ++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c963ba534a78..2523332bd998 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1483,58 +1483,6 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if (con->flags & CON_ANYTIME) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. - */ -static inline int can_use_console(unsigned int cpu) -{ - return cpu_online(cpu) || have_callable_console(); -} - -/* - * Try to get console ownership to actually show the kernel - * messages from a 'printk'. Return true (and with the - * console_lock held, and 'console_locked' set) if it - * is successful, false otherwise. - */ -static int console_trylock_for_printk(void) -{ - unsigned int cpu = smp_processor_id(); - - if (!console_trylock()) - return 0; - /* - * If we can't use the console, we need to release the console - * semaphore by hand to avoid flushing the buffer. 
We need to hold the - * console semaphore in order to do this test safely. - */ - if (!can_use_console(cpu)) { - console_locked = 0; - up_console_sem(); - return 0; - } - return 1; -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1681,7 +1629,6 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - /* This stops the holder of console_sem just where we want him */ local_irq_save(flags); this_cpu = smp_processor_id(); @@ -1705,6 +1652,7 @@ asmlinkage int vprintk_emit(int facility, int level, } lockdep_off(); + /* This stops the holder of console_sem just where we want him */ raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; @@ -1821,7 +1769,7 @@ asmlinkage int vprintk_emit(int facility, int level, * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk()) + if (console_trylock()) console_unlock(); preempt_enable(); lockdep_on(); @@ -2184,6 +2132,33 @@ int is_console_locked(void) return console_locked; } +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */ +static int have_callable_console(void) +{ + struct console *con; + + for_each_console(con) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. + */ +static inline int can_use_console(void) +{ + return cpu_online(raw_smp_processor_id()) || have_callable_console(); +} + static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2254,9 +2229,21 @@ void console_unlock(void) do_cond_resched = console_may_schedule; console_may_schedule = 0; +again: + /* + * We released the console_sem lock, so we need to recheck if + * cpu is online and (if not) is there at least one CON_ANYTIME + * console. + */ + if (!can_use_console()) { + console_locked = 0; + up_console_sem(); + return; + } + /* flush buffered message fragment immediately to console */ console_cont_flush(text, sizeof(text)); -again: + for (;;) { struct printk_log *msg; size_t ext_len = 0; -- cgit v1.2.3 From 6b97a20d3a7909daa06625d4440c2c52d7bf08d7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Mar 2016 14:21:23 -0700 Subject: printk: set may_schedule for some of console_trylock() callers console_unlock() allows to cond_resched() if its caller has set `console_may_schedule' to 1, since 8d91f8b15361 ("printk: do cond_resched() between lines while outputting to consoles"). The rules are: -- console_lock() always sets `console_may_schedule' to 1 -- console_trylock() always sets `console_may_schedule' to 0 However, console_trylock() callers (among them is printk()) do not always call printk() from atomic contexts, and some of them can cond_resched() in console_unlock(), so console_trylock() can set `console_may_schedule' to 1 for such processes. For !CONFIG_PREEMPT_COUNT kernels, however, console_trylock() always sets `console_may_schedule' to 0. It's possible to drop explicit preempt_disable()/preempt_enable() in vprintk_emit(), because console_unlock() and console_trylock() are now smart enough: a) console_unlock() does not cond_resched() when it's unsafe (console_trylock() takes care of that) b) console_unlock() does can_use_console() check. 
Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Cc: Jan Kara Cc: Tejun Heo Cc: Kyle McMartin Cc: Dave Jones Cc: Calvin Owens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2523332bd998..a6d023c3b852 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1757,13 +1757,6 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). */ if (!in_sched) { lockdep_off(); - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console - */ - preempt_disable(); - /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up @@ -1771,7 +1764,6 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock()) console_unlock(); - preempt_enable(); lockdep_on(); } @@ -2122,7 +2114,20 @@ int console_trylock(void) return 0; } console_locked = 1; - console_may_schedule = 0; + /* + * When PREEMPT_COUNT disabled we can't reliably detect if it's + * safe to schedule (e.g. calling printk while holding a spin_lock), + * because preempt_disable()/preempt_enable() are just barriers there + * and preempt_count() is always 0. + * + * RCU read sections have a separate preemption counter when + * PREEMPT_RCU enabled thus we must take extra care and check + * rcu_preempt_depth(), otherwise RCU read sections modify + * preempt_count(). + */ + console_may_schedule = !oops_in_progress && + preemptible() && + !rcu_preempt_depth(); return 1; } EXPORT_SYMBOL(console_trylock); -- cgit v1.2.3 From adaf6590ee7db23c3a124fb9f213c90c15cecf96 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Mar 2016 14:21:27 -0700 Subject: printk: check CON_ENABLED in have_callable_console() have_callable_console() must also test CON_ENABLED bit, not just CON_ANYTIME. We may have disabled CON_ANYTIME console so printk can wrongly assume that it's safe to call_console_drivers(). Signed-off-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Cc: Jan Kara Cc: Tejun Heo Cc: Kyle McMartin Cc: Dave Jones Cc: Calvin Owens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a6d023c3b852..d5fd844e5b08 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2146,7 +2146,8 @@ static int have_callable_console(void) struct console *con; for_each_console(con) - if (con->flags & CON_ANYTIME) + if ((con->flags & CON_ENABLED) && + (con->flags & CON_ANYTIME)) return 1; return 0; -- cgit v1.2.3 From f468908bb55a0b01d9424c74f8ec8eb906835150 Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 17 Mar 2016 14:21:30 -0700 Subject: printk: add clear_idx symbol to vmcoreinfo This allows us to extract from the vmcore only the messages emitted since the last time the ring buffer was cleared. We just have to make sure its value is always up-to-date, when old messages are discarded to free space in log_make_free_space() for example. 
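The "last time the ring buffer was cleared" point referenced above is the one userspace sets with SYSLOG_ACTION_CLEAR (what dmesg -C typically issues); a minimal, illustrative userspace sketch:

#include <stdio.h>
#include <sys/klog.h>

int main(void)
{
	/* 5 == SYSLOG_ACTION_CLEAR: advances clear_seq/clear_idx in the kernel */
	if (klogctl(5, NULL, 0) < 0) {
		perror("klogctl");
		return 1;
	}
	return 0;
}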
Signed-off-by: Zeyu Zhao Signed-off-by: Ivan Delalande Cc: Kay Sievers Cc: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d5fd844e5b08..bfbf284e4218 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -367,16 +367,20 @@ static int logbuf_has_space(u32 msg_size, bool empty) static int log_make_free_space(u32 msg_size) { - while (log_first_seq < log_next_seq) { - if (logbuf_has_space(msg_size, false)) - return 0; + while (log_first_seq < log_next_seq && + !logbuf_has_space(msg_size, false)) { /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } + if (clear_seq < log_first_seq) { + clear_seq = log_first_seq; + clear_idx = log_first_idx; + } + /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, true)) + if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) return 0; return -ENOMEM; @@ -854,6 +858,7 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); + VMCOREINFO_SYMBOL(clear_idx); VMCOREINFO_SYMBOL(log_next_idx); /* * Export struct printk_log size and field offsets. User space tools can @@ -1216,12 +1221,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u32 idx; enum log_flags prev; - if (clear_seq < log_first_seq) { - /* messages are gone, move to first available one */ - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. -- cgit v1.2.3 From 4cc7ecb7f2a60e8deb783b8fbf7c1ae467acb920 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Mar 2016 14:23:00 -0700 Subject: param: convert some "on"/"off" users to strtobool This changes several users of manual "on"/"off" parsing to use strtobool. Some side-effects: - these uses will now parse y/n/1/0 meaningfully too - the early_param uses will now bubble up parse errors Signed-off-by: Kees Cook Acked-by: Heiko Carstens Acked-by: Michael Ellerman Cc: Amitkumar Karwar Cc: Andy Shevchenko Cc: Daniel Borkmann Cc: Joe Perches Cc: Kalle Valo Cc: Martin Schwidefsky Cc: Nishant Sarmukadam Cc: Rasmus Villemoes Cc: Steve French Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/hrtimer.c | 10 ++-------- kernel/time/tick-sched.c | 10 ++-------- 2 files changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 58a321c34cfb..fa0b983290cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * High resolution timer enabled ? 
*/ -static int hrtimer_hres_enabled __read_mostly = 1; +static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); @@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution); */ static int __init setup_hrtimer_hres(char *str) { - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 969e6704c3c9..195fe7d2caad 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -486,20 +486,14 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; +bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); -- cgit v1.2.3 From 2553b67a1fbe7bf202e4e8070ab0b00d3d3a06a2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 17 Mar 2016 14:23:04 -0700 Subject: lib/bug.c: use common WARN helper The traceoff_on_warning option doesn't have any effect on s390, powerpc, arm64, parisc, and sh because there are two different types of WARN implementations: 1) The above mentioned architectures treat WARN() as a special case of a BUG() exception. They handle warnings in report_bug() in lib/bug.c. 2) All other architectures just call warn_slowpath_*() directly. Their warnings are handled in warn_slowpath_common() in kernel/panic.c. Support traceoff_on_warning on all architectures and prevent any future divergence by using a single common function to emit the warning. 
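The lib/bug.c side of that is outside the kernel/-only scope of this log; schematically (a simplified sketch, not the literal hunk; 'warning', 'bugaddr', 'taint' and 'regs' stand in for the locals report_bug() derives from the bug entry and trap frame), warnings are funneled into the same helper:

	/* inside report_bug(), once the bug entry is identified as a warning */
	if (warning) {
		__warn(file, line, (void *)bugaddr, taint, regs, NULL);
		return BUG_TRAP_TYPE_WARN;
	}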
Also remove the '()' from '%pS()', because the parentheses look funky: [ 45.607629] WARNING: at /root/warn_mod/warn_mod.c:17 .init_dummy+0x20/0x40 [warn_mod]() Reported-by: Chunyu Hu Signed-off-by: Josh Poimboeuf Acked-by: Heiko Carstens Tested-by: Prarit Bhargava Acked-by: Prarit Bhargava Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index d96469de72dc..fa400852bf6c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,7 @@ #include #include #include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -449,20 +450,25 @@ void oops_exit(void) kmsg_dump(KMSG_DUMP_OOPS); } -#ifdef WANT_WARN_ON_SLOWPATH -struct slowpath_args { +struct warn_args { const char *fmt; va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, - unsigned taint, struct slowpath_args *args) +void __warn(const char *file, int line, void *caller, unsigned taint, + struct pt_regs *regs, struct warn_args *args) { disable_trace_on_warning(); pr_warn("------------[ cut here ]------------\n"); - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", - raw_smp_processor_id(), current->pid, file, line, caller); + + if (file) + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", + raw_smp_processor_id(), current->pid, file, line, + caller); + else + pr_warn("WARNING: CPU: %d PID: %d at %pS\n", + raw_smp_processor_id(), current->pid, caller); if (args) vprintk(args->fmt, args->args); @@ -479,20 +485,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller, } print_modules(); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); + print_oops_end_marker(); + /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); } +#ifdef WANT_WARN_ON_SLOWPATH void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, &args); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, + &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); @@ -500,20 +513,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_fmt_taint(const char *file, int line, unsigned taint, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - taint, &args); + __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, NULL); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif -- cgit v1.2.3 From 84b6d3e6149c5280bc18b42e2f12efdaf354e49c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2016 15:34:32 +0100 Subject: ftrace: Make ftrace_hash_rec_enable return update bool Change __ftrace_hash_rec_update to return true in case we need to update dynamic ftrace call records. It return false in case no update is needed. 
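The point of the new return value shows up in the follow-up patch later in this log, where the caller only requests a call-site update when something actually changed; roughly:

	/* sketch of the caller side (see the next ftrace patch below) */
	if (ftrace_hash_rec_enable(ops, 1))
		command |= FTRACE_UPDATE_CALLS;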
Link: http://lkml.kernel.org/r/1458138873-1553-5-git-send-email-jolsa@kernel.org Acked-by: Namhyung Kim Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57a6eea84694..11ffcfd3804e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1610,7 +1610,7 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } -static void __ftrace_hash_rec_update(struct ftrace_ops *ops, +static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) { @@ -1618,12 +1618,13 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, struct ftrace_hash *other_hash; struct ftrace_page *pg; struct dyn_ftrace *rec; + bool update = false; int count = 0; int all = 0; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return; + return false; /* * In the filter_hash case: @@ -1650,7 +1651,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * then there's nothing to do. */ if (ftrace_hash_empty(hash)) - return; + return false; } do_for_each_ftrace_rec(pg, rec) { @@ -1694,7 +1695,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (inc) { rec->flags++; if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) - return; + return false; /* * If there's only a single callback registered to a @@ -1720,7 +1721,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) - return; + return false; rec->flags--; /* @@ -1753,22 +1754,28 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, */ } count++; + + /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ + update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) - return; + return update; } while_for_each_ftrace_rec(); + + return update; } -static void ftrace_hash_rec_disable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 0); + return __ftrace_hash_rec_update(ops, filter_hash, 0); } -static void ftrace_hash_rec_enable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 1); + return __ftrace_hash_rec_update(ops, filter_hash, 1); } static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, -- cgit v1.2.3 From 7f50d06bb6b825d34f069c6c7a1aab96ad0b94d9 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2016 15:34:33 +0100 Subject: ftrace: Update dynamic ftrace calls only if necessary Currently dynamic ftrace calls are updated any time the ftrace_ops is un/registered. If we do this update only when it's needed, we save lot of time for perf system wide ftrace function sampling/counting. The reason is that for system wide sampling/counting, perf creates event for each cpu in the system. Each event then registers separate copy of ftrace_ops, which ends up in FTRACE_UPDATE_CALLS updates. 
On servers with many cpus that means serious stall (240 cpus server): Counting: # time ./perf stat -e ftrace:function -a sleep 1 Performance counter stats for 'system wide': 370,663 ftrace:function 1.401427505 seconds time elapsed real 3m51.743s user 0m0.023s sys 3m48.569s Sampling: # time ./perf record -e ftrace:function -a sleep 1 [ perf record: Woken up 0 times to write data ] Warning: Processed 141200 events and lost 5 chunks! [ perf record: Captured and wrote 10.703 MB perf.data (135950 samples) ] real 2m31.429s user 0m0.213s sys 2m29.494s There's no reason to do the FTRACE_UPDATE_CALLS update for each event in perf case, because all the ftrace_ops always share the same filter, so the updated calls are always the same. It's required that only first ftrace_ops registration does the FTRACE_UPDATE_CALLS update (also sometimes the second if the first one used the trampoline), but the rest can be only cheaply linked into the ftrace_ops list. Counting: # time ./perf stat -e ftrace:function -a sleep 1 Performance counter stats for 'system wide': 398,571 ftrace:function 1.377503733 seconds time elapsed real 0m2.787s user 0m0.005s sys 0m1.883s Sampling: # time ./perf record -e ftrace:function -a sleep 1 [ perf record: Woken up 0 times to write data ] Warning: Processed 261730 events and lost 9 chunks! [ perf record: Captured and wrote 19.907 MB perf.data (256293 samples) ] real 1m31.948s user 0m0.309s sys 1m32.051s Link: http://lkml.kernel.org/r/1458138873-1553-6-git-send-email-jolsa@kernel.org Acked-by: Namhyung Kim Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 11ffcfd3804e..d3850cbb840f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2651,7 +2651,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; ftrace_start_up++; - command |= FTRACE_UPDATE_CALLS; /* * Note that ftrace probes uses this to start up @@ -2672,7 +2671,8 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; } - ftrace_hash_rec_enable(ops, 1); + if (ftrace_hash_rec_enable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; ftrace_startup_enable(command); @@ -2702,11 +2702,11 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* Disabling ipmodify never fails */ ftrace_hash_ipmodify_disable(ops); - ftrace_hash_rec_disable(ops, 1); - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ftrace_hash_rec_disable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; - command |= FTRACE_UPDATE_CALLS; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; -- cgit v1.2.3 From 6363c6b599ae67b779d01a48642a7c0d7d721814 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 15 Mar 2016 22:12:34 +0800 Subject: ftrace: Use kasprintf() in ftrace_profile_tracefs() Use kasprintf() instead of kmalloc() and snprintf(). 
Link: http://lkml.kernel.org/r/135a7bc36e51fd9eaa57124dd2140285b771f738.1458050835.git.geliangtang@163.com Acked-by: Namhyung Kim Signed-off-by: Geliang Tang Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d3850cbb840f..6a93faafbea4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1030,8 +1030,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) for_each_possible_cpu(cpu) { stat = &per_cpu(ftrace_profile_stats, cpu); - /* allocate enough for function name + cpu number */ - name = kmalloc(32, GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "function%d", cpu); if (!name) { /* * The files created are permanent, if something happens @@ -1043,7 +1042,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) return; } stat->stat = function_stats; - snprintf(name, 32, "function%d", cpu); stat->stat.name = name; ret = register_stat_tracer(&stat->stat); if (ret) { -- cgit v1.2.3 From c8ca003b2fde177b83de87f9f20f6a5933fb50bf Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Mon, 14 Mar 2016 20:35:41 +0800 Subject: tracing: Fix return while holding a lock in register_tracer() commit d39cdd2036a6 ("tracing: Make tracer_flags use the right set_flag callback") introduces a potential mutex deadlock issue, as it forgets to free the mutex when allocaing the tracer_flags gets fail. The issue was found by Dan Carpenter through Smatch static code check tool. Link: http://lkml.kernel.org/r/1457958941-30265-1-git-send-email-chuhu@redhat.com Fixes: d39cdd2036a6 ("tracing: Make tracer_flags use the right set_flag callback") Reported-by: Dan Carpenter Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b401a1892dc6..0ae46048f724 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1256,8 +1256,10 @@ int __init register_tracer(struct tracer *type) if (!type->flags) { /*allocate a dummy tracer_flags*/ type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); - if (!type->flags) - return -ENOMEM; + if (!type->flags) { + ret = -ENOMEM; + goto out; + } type->flags->val = 0; type->flags->opts = dummy_tracer_opt; } else -- cgit v1.2.3 From cb86e05390debcc084cfdb0a71ed4c5dbbec517d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 18 Mar 2016 12:27:43 -0400 Subject: tracing: Have preempt(irqs)off trace preempt disabled functions Joel Fernandes reported that the function tracing of preempt disabled sections was not being reported when running either the preemptirqsoff or preemptoff tracers. This was due to the fact that the function tracer callback for those tracers checked if irqs were disabled before tracing. But this fails when we want to trace preempt off locations as well. Joel explained that he wanted to see funcitons where interrupts are enabled but preemption was disabled. 
The expected output he wanted: <...>-2265 1d.h1 3419us : preempt_count_sub <-irq_exit <...>-2265 1d..1 3419us : __do_softirq <-irq_exit <...>-2265 1d..1 3419us : msecs_to_jiffies <-__do_softirq <...>-2265 1d..1 3420us : irqtime_account_irq <-__do_softirq <...>-2265 1d..1 3420us : __local_bh_disable_ip <-__do_softirq <...>-2265 1..s1 3421us : run_timer_softirq <-__do_softirq <...>-2265 1..s1 3421us : hrtimer_run_pending <-run_timer_softirq <...>-2265 1..s1 3421us : _raw_spin_lock_irq <-run_timer_softirq <...>-2265 1d.s1 3422us : preempt_count_add <-_raw_spin_lock_irq <...>-2265 1d.s2 3422us : _raw_spin_unlock_irq <-run_timer_softirq <...>-2265 1..s2 3422us : preempt_count_sub <-_raw_spin_unlock_irq <...>-2265 1..s1 3423us : rcu_bh_qs <-__do_softirq <...>-2265 1d.s1 3423us : irqtime_account_irq <-__do_softirq <...>-2265 1d.s1 3423us : __local_bh_enable <-__do_softirq There's a comment saying that the irq disabled check is because there's a possible race that tracing_cpu may be set when the function is executed. But I don't remember that race. For now, I added a check for preemption being enabled too to not record the function, as there would be no race if that was the case. I need to re-investigate this, as I'm now thinking that the tracing_cpu will always be correct. But no harm in keeping the check for now, except for the slight performance hit. Link: http://lkml.kernel.org/r/1457770386-88717-1-git-send-email-agnel.joel@gmail.com Fixes: 5e6d2b9cfa3a "tracing: Use one prologue for the preempt irqs off tracer function tracers" Cc: stable@vget.kernel.org # 2.6.37+ Reported-by: Joel Fernandes Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index e4e56589ec1d..be3222b7d72e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -109,8 +109,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; local_save_flags(*flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(*flags)) + /* + * Slight chance to get a false positive on tracing_cpu, + * although I'm starting to think there isn't a chance. + * Leave this for now just to be paranoid. + */ + if (!irqs_disabled_flags(*flags) && !preempt_count()) return 0; *data = per_cpu_ptr(tr->trace_buffer.data, cpu); -- cgit v1.2.3 From a29054d9478d0435ab01b7544da4f674ab13f533 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 18 Mar 2016 15:46:48 -0400 Subject: tracing: Fix crash from reading trace_pipe with sendfile If tracing contains data and the trace_pipe file is read with sendfile(), then it can trigger a NULL pointer dereference and various BUG_ON within the VM code. There's a patch to fix this in the splice_to_pipe() code, but it's also a good idea to not let that happen from trace_pipe either. 
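For context, the problematic path can be driven from user space roughly as follows (an illustrative sketch; the tracefs mount point and output file are assumptions):

#include <fcntl.h>
#include <sys/sendfile.h>
#include <unistd.h>

int main(void)
{
	/* Assumes tracefs is mounted at /sys/kernel/debug/tracing. */
	int in = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);
	int out = open("/tmp/trace.out", O_WRONLY | O_CREAT | O_TRUNC, 0600);

	if (in < 0 || out < 0)
		return 1;
	/*
	 * sendfile() reaches tracing_splice_read_pipe(); with the fix below,
	 * splice_to_pipe() is only called when at least one page was filled.
	 */
	sendfile(out, in, NULL, 65536);
	return 0;
}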
Link: http://lkml.kernel.org/r/1457641146-9068-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 2.6.30+ Reported-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0ae46048f724..cb2b708e4ea7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4954,7 +4954,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - ret = splice_to_pipe(pipe, &spd); + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; out: splice_shrink_spd(&spd); return ret; -- cgit v1.2.3 From 741f3a69f101250dc6b171b88e14ea51b099b1a9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Mon, 19 Oct 2015 21:10:26 +0300 Subject: tracing: Remove redundant reset per-CPU buff in irqsoff tracer There is no reason to do it twice: from commit b6f11df26fdc28 ("trace: Call tracing_reset_online_cpus before tracer->init()") resetting of per-CPU buffers done before tracer->init() call. tracer->init() calls {irqs,preempt,preemptirqs}off_tracer_init() and it calls __irqsoff_tracer_init(), which resets per-CPU ringbuffer second time. It's slowpath, but anyway. Link: http://lkml.kernel.org/r/1445278226-16187-1-git-send-email-0x7f454c46@gmail.com Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index be3222b7d72e..03cdff84d026 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -626,7 +626,6 @@ static int __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(&tr->trace_buffer); ftrace_init_array_ops(tr, irqsoff_tracer_call); -- cgit v1.2.3 From 2f5177f0fd7e531b26d54633be62d1d4cb94621c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Mar 2016 16:22:45 +0100 Subject: sched/cgroup: Fix/cleanup cgroup teardown/init The CPU controller hasn't kept up with the various changes in the whole cgroup initialization / destruction sequence, and commit: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") caused it to explode. The reason for this is that zombies do not inhibit css_offline() from being called, but do stall css_released(). Now we tear down the cfs_rq structures on css_offline() but zombies can run after that, leading to use-after-free issues. The solution is to move the tear-down to css_released(), which guarantees nobody (including no zombies) is still using our cgroup. Furthermore, a few simple cleanups are possible too. There doesn't appear to be any point to us using css_online() (anymore?) so fold that in css_alloc(). And since cgroup code guarantees an RCU grace period between css_released() and css_free() we can forgo using call_rcu() and free the stuff immediately. 
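The resulting callback placement follows the general pattern sketched below with an invented "foo" controller (editor's illustration; the real change to the CPU controller is in the diff that follows):

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>

struct foo_group {
	struct cgroup_subsys_state css;
};

static inline struct foo_group *css_foo(struct cgroup_subsys_state *css)
{
	return container_of(css, struct foo_group, css);
}

static struct cgroup_subsys_state *
foo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct foo_group *fg = kzalloc(sizeof(*fg), GFP_KERNEL);

	/* allocate and bring the group online here; no css_online() needed */
	return fg ? &fg->css : ERR_PTR(-ENOMEM);
}

static void foo_css_released(struct cgroup_subsys_state *css)
{
	/* nobody, zombies included, can still be using the css: tear down here */
}

static void foo_css_free(struct cgroup_subsys_state *css)
{
	/* an RCU grace period separates css_released() from css_free(),
	 * so a plain free is enough; no call_rcu() required */
	kfree(css_foo(css));
}

struct cgroup_subsys foo_cgrp_subsys = {
	.css_alloc    = foo_css_alloc,
	.css_released = foo_css_released,
	.css_free     = foo_css_free,
};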
Suggested-by: Tejun Heo Reported-by: Kazuki Yamaguchi Reported-by: Niklas Cassel Tested-by: Niklas Cassel Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 2e91fa7f6d45 ("cgroup: keep zombies associated with their original cgroups") Link: http://lkml.kernel.org/r/20160316152245.GY6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d872e19244b..2a87bdde8d4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7536,7 +7536,7 @@ void set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); -static void free_sched_group(struct task_group *tg) +static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); @@ -7562,7 +7562,7 @@ struct task_group *sched_create_group(struct task_group *parent) return tg; err: - free_sched_group(tg); + sched_free_group(tg); return ERR_PTR(-ENOMEM); } @@ -7582,17 +7582,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) +static void sched_free_group_rcu(struct rcu_head *rhp) { /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); + sched_free_group(container_of(rhp, struct task_group, rcu)); } -/* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); + call_rcu(&tg->rcu, sched_free_group_rcu); } void sched_offline_group(struct task_group *tg) @@ -8051,31 +8050,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - if (parent) - sched_online_group(tg, parent); - return 0; + sched_offline_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_destroy_group(tg); -} - -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); + /* + * Relies on the RCU grace period between css_released() and this. 
+ */ + sched_free_group(tg); } static void cpu_cgroup_fork(struct task_struct *task) @@ -8435,9 +8429,8 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, - .css_online = cpu_cgroup_css_online, - .css_offline = cpu_cgroup_css_offline, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, -- cgit v1.2.3 From 3a47d5124a957358274e9ca7b115b2f3a914f56d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Mar 2016 13:04:03 +0100 Subject: sched/fair: Fix fairness issue on migration Pavan reported that in the presence of very light tasks (or cgroups) the placement of migrated tasks can cause severe fairness issues. The problem is that enqueue_entity() places the task before it updates time, thereby it can place the task far in the past (remember that light tasks will shoot virtual time forward at a high speed, so in relation to the pre-existing light task, we can land far in the past). This is done because update_curr() needs the current task, and we might be placing the current task. The obvious solution is to differentiate between the current and any other task; placing the current before we update time, and placing any other task after, such that !curr tasks end up at the current moment in time, and not in the past. Reported-by: Pavan Kondeti Tested-by: Pavan Kondeti Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Linus Torvalds Cc: Matt Fleming Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: byungchul.park@lge.com Link: http://lkml.kernel.org/r/20160309120403.GK6344@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33130529e9b5..3c114d971d84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3157,17 +3157,25 @@ static inline void check_schedstat_required(void) static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); + bool curr = cfs_rq->curr == se; + /* - * Update the normalized vruntime before updating min_vruntime - * through calling update_curr(). + * If we're the current task, we must renormalise before calling + * update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) + if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; + update_curr(cfs_rq); + /* - * Update run-time statistics of the 'current'. + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. 
*/ - update_curr(cfs_rq); + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3183,7 +3191,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); } - if (se != cfs_rq->curr) + if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; -- cgit v1.2.3 From d4335581dc30ec6545999c7443bb9fead274a980 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 9 Mar 2016 14:59:08 +0000 Subject: sched/fair: Add comments to explain select_idle_sibling() It's not entirely obvious how the main loop in select_idle_sibling() works on first glance. Sprinkle a few comments to explain the design and intention behind the loop based on some conversations with Mike and Peter. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457535548-15329-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c114d971d84..303d6392b389 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5055,7 +5055,19 @@ static int select_idle_sibling(struct task_struct *p, int target) return i; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, iterate the domains and find an eligible idle cpu. + * + * A completely idle sched group at higher domains is more + * desirable than an idle group at a lower level, because lower + * domains have smaller groups and usually share hardware + * resources which causes tasks to contend on them, e.g. x86 + * hyperthread siblings in the lowest domain (SMT) can contend + * on the shared cpu pipeline. + * + * However, while we prefer idle groups at higher domains + * finding an idle cpu at the lowest domain is still better than + * returning 'target', which we've already established, isn't + * idle. */ sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { @@ -5065,11 +5077,16 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; + /* Ensure the entire group is idle */ for_each_cpu(i, sched_group_cpus(sg)) { if (i == target || !idle_cpu(i)) goto next; } + /* + * It doesn't matter which cpu we pick, the + * whole group is idle. + */ target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); goto done; -- cgit v1.2.3 From 1a736b77a3f50910843d076623204ba6e5057dc1 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 21 Dec 2015 19:14:42 +0800 Subject: sched/cpuacct: Rename parameter in cpuusage_write() for readability The name of the 'reset' parameter to cpuusage_write() is quite confusing, because the only valid value we allow is '0', so !reset is actually the case that resets ... Rename it to 'val' and explain it in a comment that we only allow 0. 
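From user space the interface behaves as in this sketch (illustrative; the cgroup v1 mount point is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/fs/cgroup/cpuacct/cpuacct.usage", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "0", 1) < 0)	/* "0" is the only accepted value: reset */
		perror("reset");
	if (write(fd, "42", 2) < 0)	/* any other value fails with EINVAL */
		perror("non-zero write");
	close(fd);
	return 0;
}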
Signed-off-by: Dongsheng Yang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: cgroups@vger.kernel.org Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1450696483-2864-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd7cbb55bbf2..9c2bbf7efa1a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 reset) + u64 val) { struct cpuacct *ca = css_ca(css); int err = 0; int i; - if (reset) { + /* + * Only allow '0' here to do a reset. + */ + if (val) { err = -EINVAL; goto out; } -- cgit v1.2.3 From 73e6aafd9ea81498d31361f01db84a0118da2d1c Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 17 Mar 2016 12:19:43 +0800 Subject: sched/cpuacct: Simplify the cpuacct code - Use for() instead of while() loop in some functions to make the code simpler. - Use this_cpu_ptr() instead of per_cpu_ptr() to make the code cleaner and a bit faster. Suggested-by: Peter Zijlstra Signed-off-by: Zhao Lei Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d8a7ef9592f55224630cb26dea239f05b6398a4e.1458187654.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 28 +++++----------------------- kernel/sched/cpuacct.h | 4 ++-- 2 files changed, 7 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9c2bbf7efa1a..434c2fa41352 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -238,23 +238,10 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int cpu; - - cpu = task_cpu(tsk); rcu_read_lock(); - - ca = task_ca(tsk); - - while (true) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - - ca = parent_ca(ca); - if (!ca) - break; - } - + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) + *this_cpu_ptr(ca->cpuusage) += cputime; rcu_read_unlock(); } @@ -263,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) * * Note: it's the caller that updates the account of the root cgroup. 
*/ -void cpuacct_account_field(struct task_struct *p, int index, u64 val) +void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { - struct kernel_cpustat *kcpustat; struct cpuacct *ca; rcu_read_lock(); - ca = task_ca(p); - while (ca != &root_cpuacct) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += val; - ca = parent_ca(ca); - } + for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) + this_cpu_ptr(ca->cpustat)->cpustat[index] += val; rcu_read_unlock(); } diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index ed605624a5e7..ba72807c73d4 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,7 +1,7 @@ #ifdef CONFIG_CGROUP_CPUACCT extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); +extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else @@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) } static inline void -cpuacct_account_field(struct task_struct *p, int index, u64 val) +cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { } -- cgit v1.2.3 From 3debb0a9ddb16526de8b456491b7db60114f7b5e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 22 Mar 2016 17:30:58 -0400 Subject: tracing: Fix trace_printk() to print when not using bprintk() The trace_printk() code will allocate extra buffers if the compile detects that a trace_printk() is used. To do this, the format of the trace_printk() is saved to the __trace_printk_fmt section, and if that section is bigger than zero, the buffers are allocated (along with a message that this has happened). If trace_printk() uses a format that is not a constant, and thus something not guaranteed to be around when the print happens, the compiler optimizes the fmt out, as it is not used, and the __trace_printk_fmt section is not filled. This means the kernel will not allocate the special buffers needed for the trace_printk() and the trace_printk() will not write anything to the tracing buffer. Adding a "__used" to the variable in the __trace_printk_fmt section will keep it around, even though it is set to NULL. This will keep the string from being printed in the debugfs/tracing/printk_formats section as it is not needed. Reported-by: Vlastimil Babka Fixes: 07d777fe8c398 "tracing: Add percpu buffers for trace_printk()" Cc: stable@vger.kernel.org # v3.5+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 060df67dbdd1..f96f0383f6c6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -296,6 +296,9 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; + if (!*fmt) + return 0; + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* -- cgit v1.2.3 From 7e6867bf831c71fe0e47438831ae3a94d4c7ab3c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Mar 2016 16:28:04 +0100 Subject: tracing: Record and show NMI state The latency tracer format has a nice column to indicate IRQ state, but this is not able to tell us about NMI state. When tracing perf interrupt handlers (which often run in NMI context) it is very useful to see how the events nest. 
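With the patch, the irq-state column of the latency format decodes as in this standalone restatement of the trace_print_lat_fmt() logic (illustrative only):

#include <stdbool.h>

static char irq_context_char(bool nmi, bool hardirq, bool softirq)
{
	if (nmi && hardirq)
		return 'Z';	/* NMI occurred inside a hardirq */
	if (nmi)
		return 'z';	/* NMI context */
	if (hardirq && softirq)
		return 'H';	/* hardirq interrupted a softirq */
	if (hardirq)
		return 'h';	/* hardirq */
	if (softirq)
		return 's';	/* softirq */
	return '.';		/* normal context */
}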
Link: http://lkml.kernel.org/r/20160318153022.105068893@infradead.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_output.c | 10 +++++++--- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb2b708e4ea7..7bdf8ba323ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1664,6 +1664,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #else TRACE_FLAG_IRQS_NOSUPPORT | #endif + ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 39588c23dd8b..3fff4adfd431 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -125,6 +125,7 @@ enum trace_flag_type { TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, }; #define TRACE_BUF_SIZE 1024 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 282982195e09..0bb9cf2d53e6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,7 +389,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int nmi; + nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -415,10 +417,12 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) } hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : (hardirq && softirq) ? 'H' : - hardirq ? 'h' : - softirq ? 's' : - '.'; + hardirq ? 'h' : + softirq ? 's' : + '.' ; trace_seq_printf(s, "%c%c%c", irqs_off, need_resched, hardsoft_irq); -- cgit v1.2.3 From b4aa14a63cb3194d8eab355fcee194838ab09121 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 22 Mar 2016 14:24:39 -0700 Subject: kernel/hung_task.c: use timeout diff when timeout is updated When new timeout is written to /proc/sys/kernel/hung_task_timeout_secs, khungtaskd is interrupted and again sleeps for full timeout duration. This means that hang task will not be checked if new timeout is written periodically within old timeout duration and/or checking of hang task will be delayed for up to previous timeout duration. Fix this by remembering last time khungtaskd checked hang task. This change will allow other watchdog tasks (if any) to share khungtaskd by sleeping for minimal timeout diff of all watchdog tasks. Doing more watchdog tasks from khungtaskd will reduce the possibility of printk() collisions by multiple watchdog threads. Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Cc: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index e0f90c2b57aa..d234022805dc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -185,10 +185,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); } -static unsigned long timeout_jiffies(unsigned long timeout) +static long hung_timeout_jiffies(unsigned long last_checked, + unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - return timeout ? 
timeout * HZ : MAX_SCHEDULE_TIMEOUT; + return timeout ? last_checked - jiffies + timeout * HZ : + MAX_SCHEDULE_TIMEOUT; } /* @@ -224,18 +226,21 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector); */ static int watchdog(void *dummy) { + unsigned long hung_last_checked = jiffies; + set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; + long t = hung_timeout_jiffies(hung_last_checked, timeout); - while (schedule_timeout_interruptible(timeout_jiffies(timeout))) - timeout = sysctl_hung_task_timeout_secs; - - if (atomic_xchg(&reset_hung_task, 0)) + if (t <= 0) { + if (!atomic_xchg(&reset_hung_task, 0)) + check_hung_uninterruptible_tasks(timeout); + hung_last_checked = jiffies; continue; - - check_hung_uninterruptible_tasks(timeout); + } + schedule_timeout_interruptible(t); } return 0; -- cgit v1.2.3 From 5c38065e021bc76f97fc08997f6d7fc7ea3fb7a7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:52 -0700 Subject: seccomp: check in_compat_syscall, not is_compat_task, in strict mode Seccomp wants to know the syscall bitness, not the caller task bitness, when it selects the syscall whitelist. As far as I know, this makes no difference on any architecture, so it's not a security problem. (It generates identical code everywhere except sparc, and, on sparc, the syscall numbering is the same for both ABIs.) Signed-off-by: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 15a1795bbba1..e1e5a354854e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,7 +395,7 @@ seccomp_prepare_user_filter(const char __user *user_filter) struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT - if (is_compat_task()) { + if (in_compat_syscall()) { struct compat_sock_fprog fprog32; if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) goto out; @@ -529,7 +529,7 @@ static void __secure_computing_strict(int this_syscall) { int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT - if (is_compat_task()) + if (in_compat_syscall()) syscall_whitelist = mode1_syscalls_32; #endif do { -- cgit v1.2.3 From 5c465217a930d4bbb7dd35a56bde1eea5bbd14d6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:55 -0700 Subject: ptrace: in PEEK_SIGINFO, check syscall bitness, not task bitness Users of the 32-bit ptrace() ABI expect the full 32-bit ABI. siginfo translation should check ptrace() ABI, not caller task ABI. This is an ABI change on SPARC. Let's hope that no one relied on the old buggy ABI. 
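The rule this and the neighbouring patches apply, as a sketch (the helper name is invented):

#include <linux/compat.h>
#include <linux/types.h>

/*
 * An ABI decision should follow the syscall entry path actually in use,
 * not the bitness of the calling task: a 64-bit task can legitimately
 * issue a 32-bit (compat) ptrace() call and expects the 32-bit layout.
 */
static bool peek_siginfo_wants_compat_layout(void)
{
	return in_compat_syscall();	/* not is_compat_task() */
}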
Signed-off-by: Andy Lutomirski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2341efe7fe02..c79b91d09e35 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -681,7 +681,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, break; #ifdef CONFIG_COMPAT - if (unlikely(is_compat_task())) { + if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); if (copy_siginfo_to_user32(uinfo, &info) || -- cgit v1.2.3 From efbc0fbf34927bd4d3d49b50b370990be82809c2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:58 -0700 Subject: auditsc: for seccomp events, log syscall compat state using in_compat_syscall Except on SPARC, this is what the code always did. SPARC compat seccomp was buggy, although the impact of the bug was limited because SPARC 32-bit and 64-bit syscall numbers are the same. Signed-off-by: Andy Lutomirski Cc: Paul Moore Cc: Eric Paris Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 195ffaee50b9..7d0e3cf8abe1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2412,8 +2412,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, is_compat_task(), - KSTK_EIP(current), code); + signr, syscall_get_arch(), syscall, + in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } -- cgit v1.2.3 From 1333ab03150478df8d6f5673a91df1e50dc6ab97 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Mar 2016 14:25:33 -0700 Subject: ptrace: change __ptrace_unlink() to clear ->ptrace under ->siglock This test-case (simplified version of generated by syzkaller) #include #include #include void test(void) { for (;;) { if (fork()) { wait(NULL); continue; } ptrace(PTRACE_SEIZE, getppid(), 0, 0); ptrace(PTRACE_INTERRUPT, getppid(), 0, 0); _exit(0); } } int main(void) { int np; for (np = 0; np < 8; ++np) if (!fork()) test(); while (wait(NULL) > 0) ; return 0; } triggers the 2nd WARN_ON_ONCE(!signr) warning in do_jobctl_trap(). The problem is that __ptrace_unlink() clears task->jobctl under siglock but task->ptrace is cleared without this lock held; this fools the "else" branch which assumes that !PT_SEIZED means PT_PTRACED. Note also that most of other PTRACE_SEIZE checks can race with detach from the exiting tracer too. Say, the callers of ptrace_trap_notify() assume that SEIZED can't go away after it was checked. Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Cc: Tejun Heo Cc: syzkaller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c79b91d09e35..d49bfa1e53e6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,12 +73,11 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); - child->ptrace = 0; child->parent = child->real_parent; list_del_init(&child->ptrace_entry); spin_lock(&child->sighand->siglock); - + child->ptrace = 0; /* * Clear all pending traps and TRAPPING. 
TRAPPING should be * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. -- cgit v1.2.3 From 378c6520e7d29280f400ef2ceaf155c86f05a71a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 22 Mar 2016 14:25:36 -0700 Subject: fs/coredump: prevent fsuid=0 dumps into user-controlled directories This commit fixes the following security hole affecting systems where all of the following conditions are fulfilled: - The fs.suid_dumpable sysctl is set to 2. - The kernel.core_pattern sysctl's value starts with "/". (Systems where kernel.core_pattern starts with "|/" are not affected.) - Unprivileged user namespace creation is permitted. (This is true on Linux >=3.8, but some distributions disallow it by default using a distro patch.) Under these conditions, if a program executes under secure exec rules, causing it to run with the SUID_DUMP_ROOT flag, then unshares its user namespace, changes its root directory and crashes, the coredump will be written using fsuid=0 and a path derived from kernel.core_pattern - but this path is interpreted relative to the root directory of the process, allowing the attacker to control where a coredump will be written with root privileges. To fix the security issue, always interpret core_pattern for dumps that are written under SUID_DUMP_ROOT relative to the root directory of init. Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Al Viro Cc: "Eric W. Biederman" Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7e7746a42a62..10a1d7dc9313 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, } mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; -- cgit v1.2.3 From ebc41f20d77f6ad91f1f2d2af5147dc9bb6b5eea Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 22 Mar 2016 14:27:17 -0700 Subject: panic: change nmi_panic from macro to function Commit 1717f2096b54 ("panic, x86: Fix re-entrance problem due to panic on NMI") and commit 58c5661f2144 ("panic, x86: Allow CPUs to save registers even if looping in NMI context") introduced nmi_panic() which prevents concurrent/recursive execution of panic(). It also saves registers for the crash dump on x86. However, there are some cases where NMI handlers still use panic(). This patch set partially replaces them with nmi_panic() in those cases. Even this patchset is applied, some NMI or similar handlers (e.g. MCE handler) continue to use panic(). This is because I can't test them well and actual problems won't happen. For example, the possibility that normal panic and panic on MCE happen simultaneously is very low. This patch (of 3): Convert nmi_panic() to a proper function and export it instead of exporting internal implementation details to modules, for obvious reasons. 
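A typical call site then looks like this sketch (hypothetical handler and predicate, shown only to illustrate the intended use):

#include <linux/kernel.h>
#include <linux/ptrace.h>

static void my_nmi_watchdog_check(struct pt_regs *regs)
{
	if (hard_lockup_detected())	/* hypothetical predicate */
		nmi_panic(regs, "Hard LOCKUP detected");
	/*
	 * nmi_panic() returns if this CPU is the one that already panicked,
	 * and loops in nmi_panic_self_stop() if another CPU won the race,
	 * instead of re-entering panic() concurrently.
	 */
}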
Signed-off-by: Hidehiro Kawai Acked-by: Borislav Petkov Acked-by: Michal Nazarewicz Cc: Michal Hocko Cc: Rasmus Villemoes Cc: Nicolas Iooss Cc: Javi Merino Cc: Gobinda Charan Maji Cc: "Steven Rostedt (Red Hat)" Cc: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: HATAYAMA Daisuke Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index fa400852bf6c..535c96510a44 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -73,6 +73,26 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. + */ +void nmi_panic(struct pt_regs *regs, const char *msg) +{ + int old_cpu, cpu; + + cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); + + if (old_cpu == PANIC_CPU_INVALID) + panic("%s", msg); + else if (old_cpu != cpu) + nmi_panic_self_stop(regs); +} +EXPORT_SYMBOL(nmi_panic); + /** * panic - halt the system * @fmt: The text string to print -- cgit v1.2.3 From ade356b99a4187578609f2a91c4d2ed88e4e70dc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Mar 2016 14:27:26 -0700 Subject: profile: hide unused functions when !CONFIG_PROC_FS A couple of functions and variables in the profile implementation are used only on SMP systems by the procfs code, but are unused if either procfs is disabled or in uniprocessor kernels. gcc prints a harmless warning about the unused symbols: kernel/profile.c:243:13: error: 'profile_flip_buffers' defined but not used [-Werror=unused-function] static void profile_flip_buffers(void) ^ kernel/profile.c:266:13: error: 'profile_discard_flip_buffers' defined but not used [-Werror=unused-function] static void profile_discard_flip_buffers(void) ^ kernel/profile.c:330:12: error: 'profile_cpu_callback' defined but not used [-Werror=unused-function] static int profile_cpu_callback(struct notifier_block *info, ^ This adds further #ifdef to the file, to annotate exactly in which cases they are used. I have done several thousand ARM randconfig kernels with this patch applied and no longer get any warnings in this file. Signed-off-by: Arnd Bergmann Cc: Vlastimil Babka Cc: Robin Holt Cc: Johannes Weiner Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/profile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 51369697466e..c2199e9901c9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -44,7 +44,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_var_t prof_cpu_mask; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); @@ -202,7 +202,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending * profile hits. 
read_profile() IPI's all cpus to request them -- cgit v1.2.3 From 5c9a8750a6409c63a0f01d51a9024861022f6593 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 22 Mar 2016 14:27:30 -0700 Subject: kernel: add kcov code coverage kcov provides code coverage collection for coverage-guided fuzzing (randomized testing). Coverage-guided fuzzing is a testing technique that uses coverage feedback to determine new interesting inputs to a system. A notable user-space example is AFL (http://lcamtuf.coredump.cx/afl/). However, this technique is not widely used for kernel testing due to missing compiler and kernel support. kcov does not aim to collect as much coverage as possible. It aims to collect more or less stable coverage that is function of syscall inputs. To achieve this goal it does not collect coverage in soft/hard interrupts and instrumentation of some inherently non-deterministic or non-interesting parts of kernel is disbled (e.g. scheduler, locking). Currently there is a single coverage collection mode (tracing), but the API anticipates additional collection modes. Initially I also implemented a second mode which exposes coverage in a fixed-size hash table of counters (what Quentin used in his original patch). I've dropped the second mode for simplicity. This patch adds the necessary support on kernel side. The complimentary compiler support was added in gcc revision 231296. We've used this support to build syzkaller system call fuzzer, which has found 90 kernel bugs in just 2 months: https://github.com/google/syzkaller/wiki/Found-Bugs We've also found 30+ bugs in our internal systems with syzkaller. Another (yet unexplored) direction where kcov coverage would greatly help is more traditional "blob mutation". For example, mounting a random blob as a filesystem, or receiving a random blob over wire. Why not gcov. Typical fuzzing loop looks as follows: (1) reset coverage, (2) execute a bit of code, (3) collect coverage, repeat. A typical coverage can be just a dozen of basic blocks (e.g. an invalid input). In such context gcov becomes prohibitively expensive as reset/collect coverage steps depend on total number of basic blocks/edges in program (in case of kernel it is about 2M). Cost of kcov depends only on number of executed basic blocks/edges. On top of that, kernel requires per-thread coverage because there are always background threads and unrelated processes that also produce coverage. With inlined gcov instrumentation per-thread coverage is not possible. kcov exposes kernel PCs and control flow to user-space which is insecure. But debugfs should not be mapped as user accessible. Based on a patch by Quentin Casasnovas. [akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode'] [akpm@linux-foundation.org: unbreak allmodconfig] [akpm@linux-foundation.org: follow x86 Makefile layout standards] Signed-off-by: Dmitry Vyukov Reviewed-by: Kees Cook Cc: syzkaller Cc: Vegard Nossum Cc: Catalin Marinas Cc: Tavis Ormandy Cc: Will Deacon Cc: Quentin Casasnovas Cc: Kostya Serebryany Cc: Eric Dumazet Cc: Alexander Potapenko Cc: Kees Cook Cc: Bjorn Helgaas Cc: Sasha Levin Cc: David Drysdale Cc: Ard Biesheuvel Cc: Andrey Ryabinin Cc: Kirill A. Shutemov Cc: Jiri Slaby Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 12 +++ kernel/exit.c | 2 + kernel/fork.c | 3 + kernel/kcov.c | 273 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/locking/Makefile | 3 + kernel/rcu/Makefile | 4 + kernel/sched/Makefile | 4 + 7 files changed, 301 insertions(+) create mode 100644 kernel/kcov.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index baa55e50a315..f0c40bf49d9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,6 +18,17 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() +# in coverage traces. +KCOV_INSTRUMENT_softirq.o := n +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_extable.o := n +# Don't self-instrument. +KCOV_INSTRUMENT_kcov.o := n +KASAN_SANITIZE_kcov.o := n + # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -68,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o diff --git a/kernel/exit.c b/kernel/exit.c index 10e088237fed..953d1a1c0387 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -655,6 +656,7 @@ void do_exit(long code) TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); + kcov_task_exit(tsk); WARN_ON(blk_needs_flush_plug(tsk)); diff --git a/kernel/fork.c b/kernel/fork.c index 5b8d1e7ceeea..d277e83ed3e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -392,6 +393,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) account_kernel_stack(ti, 1); + kcov_task_init(tsk); + return tsk; free_ti: diff --git a/kernel/kcov.c b/kernel/kcov.c new file mode 100644 index 000000000000..3efbee0834a8 --- /dev/null +++ b/kernel/kcov.c @@ -0,0 +1,273 @@ +#define pr_fmt(fmt) "kcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * kcov descriptor (one per opened debugfs file). + * State transitions of the descriptor: + * - initial state after open() + * - then there must be a single ioctl(KCOV_INIT_TRACE) call + * - then, mmap() call (several calls are allowed but not useful) + * - then, repeated enable/disable for a task (only one task a time allowed) + */ +struct kcov { + /* + * Reference counter. We keep one for: + * - opened file descriptor + * - task with enabled coverage (we can't unwire it from another task) + */ + atomic_t refcount; + /* The lock protects mode, size, area and t. */ + spinlock_t lock; + enum kcov_mode mode; + /* Size of arena (in long's for KCOV_MODE_TRACE). */ + unsigned size; + /* Coverage buffer shared with user space. */ + void *area; + /* Task for which we collect coverage, or NULL. */ + struct task_struct *t; +}; + +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. 
+ */ +void __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + enum kcov_mode mode; + + t = current; + /* + * We are interested in code coverage as a function of a syscall inputs, + * so we ignore code executed in interrupts. + */ + if (!t || in_interrupt()) + return; + mode = READ_ONCE(t->kcov_mode); + if (mode == KCOV_MODE_TRACE) { + unsigned long *area; + unsigned long pos; + + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + area = t->kcov_area; + /* The first word is number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = _RET_IP_; + WRITE_ONCE(area[0], pos); + } + } +} +EXPORT_SYMBOL(__sanitizer_cov_trace_pc); + +static void kcov_get(struct kcov *kcov) +{ + atomic_inc(&kcov->refcount); +} + +static void kcov_put(struct kcov *kcov) +{ + if (atomic_dec_and_test(&kcov->refcount)) { + vfree(kcov->area); + kfree(kcov); + } +} + +void kcov_task_init(struct task_struct *t) +{ + t->kcov_mode = KCOV_MODE_DISABLED; + t->kcov_size = 0; + t->kcov_area = NULL; + t->kcov = NULL; +} + +void kcov_task_exit(struct task_struct *t) +{ + struct kcov *kcov; + + kcov = t->kcov; + if (kcov == NULL) + return; + spin_lock(&kcov->lock); + if (WARN_ON(kcov->t != t)) { + spin_unlock(&kcov->lock); + return; + } + /* Just to not leave dangling references behind. */ + kcov_task_init(t); + kcov->t = NULL; + spin_unlock(&kcov->lock); + kcov_put(kcov); +} + +static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) +{ + int res = 0; + void *area; + struct kcov *kcov = vma->vm_file->private_data; + unsigned long size, off; + struct page *page; + + area = vmalloc_user(vma->vm_end - vma->vm_start); + if (!area) + return -ENOMEM; + + spin_lock(&kcov->lock); + size = kcov->size * sizeof(unsigned long); + if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + vma->vm_end - vma->vm_start != size) { + res = -EINVAL; + goto exit; + } + if (!kcov->area) { + kcov->area = area; + vma->vm_flags |= VM_DONTEXPAND; + spin_unlock(&kcov->lock); + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); + } + return 0; + } +exit: + spin_unlock(&kcov->lock); + vfree(area); + return res; +} + +static int kcov_open(struct inode *inode, struct file *filep) +{ + struct kcov *kcov; + + kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); + if (!kcov) + return -ENOMEM; + atomic_set(&kcov->refcount, 1); + spin_lock_init(&kcov->lock); + filep->private_data = kcov; + return nonseekable_open(inode, filep); +} + +static int kcov_close(struct inode *inode, struct file *filep) +{ + kcov_put(filep->private_data); + return 0; +} + +static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, + unsigned long arg) +{ + struct task_struct *t; + unsigned long size, unused; + + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + */ + if (kcov->mode != KCOV_MODE_DISABLED) + return -EBUSY; + /* + * Size must be at least 2 to hold current position and one PC. + * Later we allocate size * sizeof(unsigned long) memory, + * that must not overflow. 
+ */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->size = size; + kcov->mode = KCOV_MODE_TRACE; + return 0; + case KCOV_ENABLE: + /* + * Enable coverage for the current task. + * At this point user must have been enabled trace mode, + * and mmapped the file. Coverage collection is disabled only + * at task exit or voluntary by KCOV_DISABLE. After that it can + * be enabled for another task. + */ + unused = arg; + if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || + kcov->area == NULL) + return -EINVAL; + if (kcov->t != NULL) + return -EBUSY; + t = current; + /* Cache in task struct for performance. */ + t->kcov_size = kcov->size; + t->kcov_area = kcov->area; + /* See comment in __sanitizer_cov_trace_pc(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, kcov->mode); + t->kcov = kcov; + kcov->t = t; + /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + kcov_get(kcov); + return 0; + case KCOV_DISABLE: + /* Disable coverage for the current task. */ + unused = arg; + if (unused != 0 || current->kcov != kcov) + return -EINVAL; + t = current; + if (WARN_ON(kcov->t != t)) + return -EINVAL; + kcov_task_init(t); + kcov->t = NULL; + kcov_put(kcov); + return 0; + default: + return -ENOTTY; + } +} + +static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kcov *kcov; + int res; + + kcov = filep->private_data; + spin_lock(&kcov->lock); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock(&kcov->lock); + return res; +} + +static const struct file_operations kcov_fops = { + .open = kcov_open, + .unlocked_ioctl = kcov_ioctl, + .mmap = kcov_mmap, + .release = kcov_close, +}; + +static int __init kcov_init(void) +{ + if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + pr_err("failed to create kcov in debugfs\n"); + return -ENOMEM; + } + return 0; +} + +device_initcall(kcov_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8e96f6cc2a4a..31322a4275cd 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,6 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 61a16569ffbf..032b2c015beb 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,7 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n + obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 302d6ebd64f7..414d9c16da42 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. 
Why this used to be enabled for all architectures is beyond -- cgit v1.2.3 From 41b27154874b3a40d6673052d08c8e9fd0c6404f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 22 Mar 2016 14:27:54 -0700 Subject: kernel/signal.c: add compile-time check for __ARCH_SI_PREAMBLE_SIZE The value of __ARCH_SI_PREAMBLE_SIZE defines the size (including padding) of the part of the struct siginfo that is before the union, and it is then used to calculate the needed padding (SI_PAD_SIZE) to make the size of struct siginfo equal to 128 (SI_MAX_SIZE) bytes. Depending on the target architecture and word width it equals to either 3 or 4 times sizeof int. Since the very beginning we had __ARCH_SI_PREAMBLE_SIZE wrong on the parisc architecture for the 64bit kernel build. It's even more frustrating, because it can easily be checked at compile time if the value was defined correctly. This patch adds such a check for the correctness of __ARCH_SI_PREAMBLE_SIZE in the hope that it will prevent existing and future architectures from running into the same problem. I refrained from replacing __ARCH_SI_PREAMBLE_SIZE by offsetof() in copy_siginfo() in include/asm-generic/siginfo.h, because a) it doesn't make any difference and b) it's used in the Documentation/kmemcheck.txt example. I ran this patch through the 0-DAY kernel test infrastructure and only the parisc architecture triggered as expected. That means that this patch should be OK for all major architectures. Signed-off-by: Helge Deller Cc: Stephen Rothwell Cc: Michael Ellerman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index fe8ed298373c..aa9bf00749c1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3585,6 +3585,10 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { + /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ + BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE + != offsetof(struct siginfo, _sifields._pad)); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } -- cgit v1.2.3 From cf61e2a1487d833e4748dead4096584de70bf742 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Tue, 22 Mar 2016 14:27:57 -0700 Subject: memremap: don't modify flags These patches implement a MEMREMAP_WC flag for memremap(), which can be used to obtain writecombine mappings. This is then used for setting up dma_coherent_mem regions which use the DMA_MEMORY_MAP flag. The motivation is to fix an alignment fault on arm64, and the suggestion to implement MEMREMAP_WC for this case was made at [1]. That particular issue is handled in patch 4, which makes sure that the appropriate memset function is used when zeroing allocations mapped as IO memory. This patch (of 4): Don't modify the flags input argument to memremap(). MEMREMAP_WB is already a special case so we can check for it directly instead of clearing flag bits in each mapper. 
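A caller-side sketch of the unchanged semantics (invented caller; the size and fallback choice are just for illustration):

#include <linux/io.h>
#include <linux/sizes.h>

static void *map_stolen_memory(resource_size_t phys)
{
	/*
	 * Flags are tried in the order given. After this patch the flags
	 * argument is only tested (with MEMREMAP_WB handled as the special
	 * case it is) rather than having bits cleared as each type is tried.
	 */
	return memremap(phys, SZ_4K, MEMREMAP_WB | MEMREMAP_WT);
}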
Signed-off-by: Brian Starkey Cc: Catalin Marinas Cc: Dan Williams Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 584febd13e2e..e5e685e6ff2a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -64,6 +64,9 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); void *addr = NULL; + if (!flags) + return NULL; + if (is_ram == REGION_MIXED) { WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", &offset, (unsigned long) size); @@ -72,7 +75,6 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { - flags &= ~MEMREMAP_WB; /* * MEMREMAP_WB is special in that it can be satisifed * from the direct map. Some archs depend on the @@ -86,21 +88,19 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) } /* - * If we don't have a mapping yet and more request flags are - * pending then we will be attempting to establish a new virtual + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual * address mapping. Enforce that this mapping is not aliasing * System RAM. */ - if (!addr && is_ram == REGION_INTERSECTS && flags) { + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", &offset, (unsigned long) size); return NULL; } - if (!addr && (flags & MEMREMAP_WT)) { - flags &= ~MEMREMAP_WT; + if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); - } return addr; } -- cgit v1.2.3 From c907e0eb43a522de60fb651c011c553f87273222 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Tue, 22 Mar 2016 14:28:00 -0700 Subject: memremap: add MEMREMAP_WC flag Add a flag to memremap() for writecombine mappings. Mappings satisfied by this flag will not be cached, however writes may be delayed or combined into more efficient bursts. This is most suitable for buffers written sequentially by the CPU for use by other DMA devices. Signed-off-by: Brian Starkey Reviewed-by: Catalin Marinas Cc: Dan Williams Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index e5e685e6ff2a..a6d382312e6f 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -41,11 +41,13 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: either MEMREMAP_WB or MEMREMAP_WT + * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. * * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. 
@@ -57,6 +59,10 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * cache or are written through to memory and never exist in a * cache-dirty state with respect to program visibility. Attempts to * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. */ void *memremap(resource_size_t offset, size_t size, unsigned long flags) { @@ -102,6 +108,9 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); + return addr; } EXPORT_SYMBOL(memremap); -- cgit v1.2.3 From a395d6a7e3d6e3d1d316376db0c4c8b5d2995930 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 22 Mar 2016 14:28:09 -0700 Subject: kernel/...: convert pr_warning to pr_warn Use the more common logging method with the eventual goal of removing pr_warning altogether. Miscellanea: - Realign arguments - Coalesce formats - Add missing space between a few coalesced formats Signed-off-by: Joe Perches Acked-by: Rafael J. Wysocki [kernel/power/suspend.c] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 ++-- kernel/power/suspend.c | 3 +-- kernel/time/tick-sched.c | 8 ++++---- kernel/trace/blktrace.c | 4 ++-- kernel/trace/ftrace.c | 7 +++---- kernel/trace/trace.c | 40 ++++++++++++++++++------------------ kernel/trace/trace_functions_graph.c | 6 +++--- kernel/trace/trace_kprobe.c | 27 ++++++++++-------------- kernel/trace/trace_mmiotrace.c | 2 +- kernel/trace/trace_probe.c | 4 ++-- kernel/trace/trace_stat.c | 3 +-- kernel/trace/trace_uprobe.c | 2 +- kernel/tracepoint.c | 2 +- 13 files changed, 52 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 64731e84c982..cc1cc641d653 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1322,8 +1322,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + pr_warn("irq %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } *old_ptr = new; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 230a77225e2e..5b70d64b871e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -473,8 +473,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for suspend to idle," - "please choose none/freezer/devices/platform.\n"); + pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); return -EAGAIN; } #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 195fe7d2caad..084b79f5917e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -378,7 +378,7 @@ static int __init tick_nohz_full_setup(char *str) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } @@ 
-446,8 +446,7 @@ void __init tick_nohz_init(void) * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { - pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " - "support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; @@ -457,7 +456,8 @@ void __init tick_nohz_init(void) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", + cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2aeb6ffc0a1e..f94e7a21f52d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1437,12 +1437,12 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); + pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); + pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57a6eea84694..2ece9f1a3e5a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1058,8 +1058,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create tracefs " - "'function_profile_enabled' entry\n"); + pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -2314,8 +2313,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_curr(rec); if (FTRACE_WARN_ON(!ops)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9293402ee68..032b388bea66 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2071,20 +2071,20 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n"); - pr_warning("**********************************************************\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("** **\n"); - pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warning("** **\n"); - pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for production use. **\n"); - pr_warning("** **\n"); - pr_warning("** If you see this message and you are not debugging **\n"); - pr_warning("** the kernel, report this immediately to your vendor! 
**\n"); - pr_warning("** **\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("**********************************************************\n"); + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -4101,7 +4101,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warning("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace enum mapping\n"); return; } @@ -6131,7 +6131,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6318,7 +6318,7 @@ struct dentry *trace_create_file(const char *name, ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create tracefs '%s' entry\n", name); + pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6337,7 +6337,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create tracefs directory 'options'\n"); + pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } @@ -7248,8 +7248,8 @@ __init static int tracer_alloc_buffers(void) if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) - pr_warning("Trace clock %s not defined, going back to default\n", - trace_boot_clock); + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); } /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..91d6a63a2ea7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1350,7 +1350,7 @@ void graph_trace_open(struct trace_iterator *iter) out_err_free: kfree(data); out_err: - pr_warning("function graph tracer: not enough memory\n"); + pr_warn("function graph tracer: not enough memory\n"); } void graph_trace_close(struct trace_iterator *iter) @@ -1468,12 +1468,12 @@ static __init int init_graph_trace(void) max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } diff 
--git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 21b81a41dae5..919e0ddd8fcc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -459,16 +459,14 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + pr_warn("Could not insert probe at %s+%lu: %d\n", + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warning("This probe might be able to register after" - "target module is loaded. Continue.\n"); + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tk->rp.kp.addr); + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); ret = -EINVAL; } } @@ -529,7 +527,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } @@ -564,10 +562,9 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, __unregister_trace_kprobe(tk); ret = __register_trace_kprobe(tk); if (ret) - pr_warning("Failed to re-register probe %s on" - "%s: %d\n", - trace_event_name(&tk->tp.call), - mod->name, ret); + pr_warn("Failed to re-register probe %s on %s: %d\n", + trace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -1336,16 +1333,14 @@ static __init int init_kprobe_trace(void) /* Event list interface */ if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_events' entry\n"); + pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_profile' entry\n"); + pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); return 0; } fs_initcall(init_kprobe_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2be8c4f2403d..68f376ca6d3f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -146,7 +146,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, /* XXX: This is later than where events were lost. 
*/ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); + pr_warn("mmiotrace has lost events\n"); overrun_detected = true; goto print_out; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1769a81da8a7..1d372fa6fefb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -636,8 +636,8 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, *tmp = '\0'; size = tmp - kbuf + 1; } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6cf935316769..413ff108fbd0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -281,8 +281,7 @@ static int tracing_stat_init(void) stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create tracefs " - "'trace_stat' entry\n"); + pr_warn("Could not create tracefs 'trace_stat' entry\n"); return 0; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d2f6d0be3503..7915142c89e4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -334,7 +334,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ecd536de603a..d0639d917899 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -491,7 +491,7 @@ static __init int init_tracepoints(void) ret = register_module_notifier(&tracepoint_module_nb); if (ret) - pr_warning("Failed to register tracepoint module enter notifier\n"); + pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } -- cgit v1.2.3 From 276142730c39c9839465a36a90e5674a8c34e839 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 23 Mar 2016 00:11:20 +0100 Subject: PM / sleep: Clear pm_suspend_global_flags upon hibernate When suspending to RAM, waking up and later suspending to disk, we gratuitously runtime resume devices after the thaw phase. This does not occur if we always suspend to RAM or always to disk. pm_complete_with_resume_check(), which gets called from pci_pm_complete() among others, schedules a runtime resume if PM_SUSPEND_FLAG_FW_RESUME is set. The flag is set during a suspend-to-RAM cycle. It is cleared at the beginning of the suspend-to-RAM cycle but not afterwards and it is not cleared during a suspend-to-disk cycle at all. Fix it. Fixes: ef25ba047601 (PM / sleep: Add flags to indicate platform firmware involvement) Signed-off-by: Lukas Wunner Cc: 4.4+ # 4.4+ Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..b7dd5718836e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode) pm_message_t msg; int error; + pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; -- cgit v1.2.3 From 322cea2f41adb62c975f46a3242f4e3b43226fa1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 25 Mar 2016 00:30:25 +0100 Subject: bpf: add missing map_flags to bpf_map_show_fdinfo Add map_flags attribute to bpf_map_show_fdinfo(), so that tools like tc can check for them when loading objects from a pinned entry, e.g. if user intent wrt allocation (BPF_F_NO_PREALLOC) is different to the pinned object, it can bail out. Follow-up to 6c9059817432 ("bpf: pre-allocate hash map elements"), so that tc can still support this with v4.6. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2a2efe1bc76c..adc5e4bd74f8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -137,11 +137,13 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" - "max_entries:\t%u\n", + "max_entries:\t%u\n" + "map_flags:\t%#x\n", map->map_type, map->key_size, map->value_size, - map->max_entries); + map->max_entries, + map->map_flags); } #endif -- cgit v1.2.3 From 69b27baf00fa9b7b14b3263c105390d1683425b2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 25 Mar 2016 14:20:21 -0700 Subject: sched: add schedule_timeout_idle() This will be needed in the patch "mm, oom: introduce oom reaper". Acked-by: Michal Hocko Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/timer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index d1798fa0c743..73164c3aa56b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { -- cgit v1.2.3 From 36324a990cf578b57828c04cd85ac62cd25cf5a4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 25 Mar 2016 14:20:27 -0700 Subject: oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space When oom_reaper manages to unmap all the eligible vmas there shouldn't be much of the freable memory held by the oom victim left anymore so it makes sense to clear the TIF_MEMDIE flag for the victim and allow the OOM killer to select another task. 
The lack of TIF_MEMDIE also means that the victim cannot access memory reserves anymore but that shouldn't be a problem because it would get the access again if it needs to allocate and hits the OOM killer again due to the fatal_signal_pending resp. PF_EXITING check. We can safely hide the task from the OOM killer because it is clearly not a good candidate anymore as everyhing reclaimable has been torn down already. This patch will allow to cap the time an OOM victim can keep TIF_MEMDIE and thus hold off further global OOM killer actions granted the oom reaper is able to take mmap_sem for the associated mm struct. This is not guaranteed now but further steps should make sure that mmap_sem for write should be blocked killable which will help to reduce such a lock contention. This is not done by this patch. Note that exit_oom_victim might be called on a remote task from __oom_reap_task now so we have to check and clear the flag atomically otherwise we might race and underflow oom_victims or wake up waiters too early. Signed-off-by: Michal Hocko Suggested-by: Johannes Weiner Suggested-by: Tetsuo Handa Cc: Andrea Argangeli Cc: David Rientjes Cc: Hugh Dickins Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 953d1a1c0387..fd90195667e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(); + exit_oom_victim(tsk); } static struct task_struct *find_alive_thread(struct task_struct *p) -- cgit v1.2.3 From be7635e7287e0e8013af3c89a6354a9e0182594c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 25 Mar 2016 14:22:05 -0700 Subject: arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections KASAN needs to know whether the allocation happens in an IRQ handler. This lets us strip everything below the IRQ entry point to reduce the number of unique stack traces needed to be stored. Move the definition of __irq_entry to so that the users don't need to pull in . Also introduce the __softirq_entry macro which is similar to __irq_entry, but puts the corresponding functions to the .softirqentry.text section. 
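A hedged sketch of how the new annotation is meant to be applied; my_bottom_half() is an invented function, only the __softirq_entry/__irq_entry markers are real, and the header providing them is assumed to be linux/interrupt.h as described above:

#include <linux/interrupt.h>	/* assumed home of __irq_entry / __softirq_entry */

/* Placing the handler in .softirqentry.text lets KASAN recognise the
 * softirq entry point and strip everything below it from stack traces. */
static void __softirq_entry my_bottom_half(unsigned long data)
{
	/* ... actual softirq work would go here ... */
}
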
Signed-off-by: Alexander Potapenko Acked-by: Steven Rostedt Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- kernel/trace/trace_functions_graph.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 8aae49dd7da8..17caf4b63342 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage __visible void __do_softirq(void) +asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 91d6a63a2ea7..3a0244ff7ea8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include -- cgit v1.2.3 From f009a7a767e792d5ab0b46c08d46236ea5271dd9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Mar 2016 15:38:00 +0100 Subject: timers/nohz: Convert tick dependency mask to atomic_t The tick dependency mask was intially unsigned long because this is the type on which clear_bit() operates on and fetch_or() accepts it. But now that we have atomic_fetch_or(), we can instead use atomic_andnot() to clear the bit. This consolidates the type of our tick dependency mask, reduce its size on structures and benefit from possible architecture optimizations on atomic_t operations. 
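A small sketch of the set/clear pattern this patch converts to, mirroring the calls visible in the diff below; example_mask and the two helpers are invented names:

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/printk.h>

static atomic_t example_mask;

static void example_set_dep(int bit)
{
	/* atomic_fetch_or() returns the previous value, so only the
	 * 0 -> non-zero transition triggers the (stand-in) kick */
	int prev = atomic_fetch_or(&example_mask, BIT(bit));

	if (!prev)
		pr_info("first dependency bit set, kick CPUs here\n");
}

static void example_clear_dep(int bit)
{
	/* replaces clear_bit() on the old unsigned long mask */
	atomic_andnot(BIT(bit), &example_mask);
}
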
Suggested-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1458830281-4255-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 61 ++++++++++++++++++++++++------------------------ kernel/time/tick-sched.h | 2 +- 2 files changed, 31 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 084b79f5917e..58e3310c9b21 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -157,52 +157,50 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) cpumask_var_t tick_nohz_full_mask; cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; -static unsigned long tick_dep_mask; +static atomic_t tick_dep_mask; -static void trace_tick_dependency(unsigned long dep) +static bool check_tick_dependency(atomic_t *dep) { - if (dep & TICK_DEP_MASK_POSIX_TIMER) { + int val = atomic_read(dep); + + if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); - return; + return true; } - if (dep & TICK_DEP_MASK_PERF_EVENTS) { + if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); - return; + return true; } - if (dep & TICK_DEP_MASK_SCHED) { + if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); - return; + return true; } - if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE) + if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); + return true; + } + + return false; } static bool can_stop_full_tick(struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); - if (tick_dep_mask) { - trace_tick_dependency(tick_dep_mask); + if (check_tick_dependency(&tick_dep_mask)) return false; - } - if (ts->tick_dep_mask) { - trace_tick_dependency(ts->tick_dep_mask); + if (check_tick_dependency(&ts->tick_dep_mask)) return false; - } - if (current->tick_dep_mask) { - trace_tick_dependency(current->tick_dep_mask); + if (check_tick_dependency(¤t->tick_dep_mask)) return false; - } - if (current->signal->tick_dep_mask) { - trace_tick_dependency(current->signal->tick_dep_mask); + if (check_tick_dependency(¤t->signal->tick_dep_mask)) return false; - } return true; } @@ -259,12 +257,12 @@ static void tick_nohz_full_kick_all(void) preempt_enable(); } -static void tick_nohz_dep_set_all(unsigned long *dep, +static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { - unsigned long prev; + int prev; - prev = fetch_or(dep, BIT_MASK(bit)); + prev = atomic_fetch_or(dep, BIT(bit)); if (!prev) tick_nohz_full_kick_all(); } @@ -280,7 +278,7 @@ void tick_nohz_dep_set(enum tick_dep_bits bit) void tick_nohz_dep_clear(enum tick_dep_bits bit) { - clear_bit(bit, &tick_dep_mask); + atomic_andnot(BIT(bit), &tick_dep_mask); } /* @@ -289,12 +287,12 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit) */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { - unsigned long prev; + int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); - prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit)); + prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit)); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ @@ -313,7 +311,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); - clear_bit(bit, &ts->tick_dep_mask); + atomic_andnot(BIT(bit), &ts->tick_dep_mask); } /* @@ -331,7 +329,7 @@ void 
tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { - clear_bit(bit, &tsk->tick_dep_mask); + atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } /* @@ -345,7 +343,7 @@ void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { - clear_bit(bit, &sig->tick_dep_mask); + atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* @@ -366,7 +364,8 @@ void __tick_nohz_task_switch(void) ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { - if (current->tick_dep_mask || current->signal->tick_dep_mask) + if (atomic_read(¤t->tick_dep_mask) || + atomic_read(¤t->signal->tick_dep_mask)) tick_nohz_full_kick(); } out: diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index eb4e32566a83..bf38226e5c17 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -60,7 +60,7 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; int do_timer_last; - unsigned long tick_dep_mask; + atomic_t tick_dep_mask; }; extern struct tick_sched *tick_get_tick_sched(int cpu); -- cgit v1.2.3 From 5529578a27288d11d4d15635c258c6dde0f0fb10 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Mar 2016 15:38:01 +0100 Subject: locking/atomic, sched: Unexport fetch_or() This patch functionally reverts: 5fd7a09cfb8c ("atomic: Export fetch_or()") During the merge Linus observed that the generic version of fetch_or() was messy: " This makes the ugly "fetch_or()" macro that the scheduler used internally a new generic helper, and does a bad job at it. " e23604edac2a Merge branch 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Now that we have introduced atomic_fetch_or(), fetch_or() is only used by the scheduler in order to deal with thread_info flags which type can vary across architectures. Lets confine fetch_or() back to the scheduler so that we encourage future users to use the more robust and well typed atomic_t version instead. While at it, fetch_or() gets robustified, pasting improvements from a previous patch by Ingo Molnar that avoids needless expression re-evaluations in the loop. 
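For context, a rough sketch of the kind of scheduler call site that keeps fetch_or() alive, assuming it still operates on thread_info::flags (whose type varies across architectures); the function name is invented and the real scheduler code is not reproduced here:

#include <linux/sched.h>
#include <linux/thread_info.h>

static bool example_set_need_resched(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);

	/* ti->flags may be u32 or unsigned long depending on the arch,
	 * which is why the type-generic macro is used here rather than
	 * the atomic_t-only atomic_fetch_or(); _TIF_POLLING_NRFLAG only
	 * exists on arches that define TIF_POLLING_NRFLAG */
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
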
Reported-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1458830281-4255-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8465eeab8b3..8b489fcac37b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -321,6 +321,24 @@ static inline void init_hrtick(void) } #endif /* CONFIG_SCHED_HRTICK */ +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, mask) \ + ({ \ + typeof(ptr) _ptr = (ptr); \ + typeof(mask) _mask = (mask); \ + typeof(*_ptr) _old, _val = *_ptr; \ + \ + for (;;) { \ + _old = cmpxchg(_ptr, _val, _val | _mask); \ + if (_old == _val) \ + break; \ + _val = _old; \ + } \ + _old; \ +}) + #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -- cgit v1.2.3 From 39e2e173fb1f900959d3a25c21c65fa88b06c6ee Mon Sep 17 00:00:00 2001 From: Alfredo Alvarez Fernandez Date: Wed, 30 Mar 2016 19:03:36 +0200 Subject: locking/lockdep: Print chain_key collision information A sequence of pairs [class_idx -> corresponding chain_key iteration] is printed for both the current held_lock chain and the cached chain. That exposes the two different class_idx sequences that led to that particular hash value. This helps with debugging hash chain collision reports. Signed-off-by: Alfredo Alvarez Fernandez Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-fsdevel@vger.kernel.org Cc: sedat.dilek@gmail.com Cc: tytso@mit.edu Link: http://lkml.kernel.org/r/1459357416-19190-1-git-send-email-alfredoalvarezernandez@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 53ab2f85d77e..2324ba5310db 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1999,6 +1999,77 @@ static inline int get_first_held_lock(struct task_struct *curr, return ++i; } +/* + * Returns the next chain_key iteration + */ +static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +{ + u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + + printk(" class_idx:%d -> chain_key:%016Lx", + class_idx, + (unsigned long long)new_chain_key); + return new_chain_key; +} + +static void +print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) +{ + struct held_lock *hlock; + u64 chain_key = 0; + int depth = curr->lockdep_depth; + int i; + + printk("depth: %u\n", depth + 1); + for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { + hlock = curr->held_locks + i; + chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + + print_lock(hlock); + } + + print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_lock(hlock_next); +} + +static void print_chain_keys_chain(struct lock_chain *chain) +{ + int i; + u64 chain_key = 0; + int class_id; + + printk("depth: %u\n", chain->depth); + for (i = 0; i < chain->depth; i++) { + class_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(class_id + 1, chain_key); + + print_lock_name(lock_classes + class_id); + printk("\n"); + } +} + +static void 
print_collision(struct task_struct *curr, + struct held_lock *hlock_next, + struct lock_chain *chain) +{ + printk("\n"); + printk("======================\n"); + printk("[chain_key collision ]\n"); + print_kernel_ident(); + printk("----------------------\n"); + printk("%s/%d: ", current->comm, task_pid_nr(current)); + printk("Hash chain already cached but the contents don't match!\n"); + + printk("Held locks:"); + print_chain_keys_held_locks(curr, hlock_next); + + printk("Locks in cached chain:"); + print_chain_keys_chain(chain); + + printk("\nstack backtrace:\n"); + dump_stack(); +} + /* * Checks whether the chain and the current held locks are consistent * in depth and also in content. If they are not it most likely means @@ -2014,14 +2085,18 @@ static int check_no_collision(struct task_struct *curr, i = get_first_held_lock(curr, hlock); - if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) + if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) { + print_collision(curr, hlock, chain); return 0; + } for (j = 0; j < chain->depth - 1; j++, i++) { id = curr->held_locks[i].class_idx - 1; - if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) + if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { + print_collision(curr, hlock, chain); return 0; + } } #endif return 1; -- cgit v1.2.3 From 09cbfeaf1a5a67bfb3201e0c83c810cecb2efa5a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 1 Apr 2016 15:29:47 +0300 Subject: mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> ; - >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> ; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 220fc17b9718..7edc95edfaee 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -321,7 +321,7 @@ retry: copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - page_cache_release(new_page); + put_page(new_page); put_old: put_page(old_page); @@ -539,14 +539,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, * see uprobe_register(). */ if (mapping->a_ops->readpage) - page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else - page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT); + page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); copy_from_page(page, offset, insn, nbytes); - page_cache_release(page); + put_page(page); return 0; } -- cgit v1.2.3
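As a postscript to the PAGE_CACHE_* conversion above, a before/after sketch of the mechanical change (the helper functions are made up; the macros and page refcount helpers are the real names handled by the semantic patch):

#include <linux/pagemap.h>

/* old style, using the page-cache aliases */
static pgoff_t example_index_old(struct page *page, loff_t pos)
{
	page_cache_get(page);			/* becomes get_page(page) */
	page_cache_release(page);		/* becomes put_page(page) */
	return pos >> PAGE_CACHE_SHIFT;		/* becomes PAGE_SHIFT */
}

/* new style after this patch */
static pgoff_t example_index_new(struct page *page, loff_t pos)
{
	get_page(page);
	put_page(page);
	return pos >> PAGE_SHIFT;
}
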