Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/bpf/arraymap.c | 137
-rw-r--r--  kernel/bpf/core.c | 9
-rw-r--r--  kernel/bpf/syscall.c | 2
-rw-r--r--  kernel/bpf/verifier.c | 54
-rw-r--r--  kernel/cgroup.c | 126
-rw-r--r--  kernel/cgroup_freezer.c | 2
-rw-r--r--  kernel/cgroup_pids.c | 355
-rw-r--r--  kernel/cpu.c | 52
-rw-r--r--  kernel/cpuset.c | 2
-rw-r--r--  kernel/events/core.c | 278
-rw-r--r--  kernel/events/ring_buffer.c | 15
-rw-r--r--  kernel/events/uprobes.c | 228
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 60
-rw-r--r--  kernel/irq/chip.c | 43
-rw-r--r--  kernel/irq/generic-chip.c | 6
-rw-r--r--  kernel/irq/handle.c | 4
-rw-r--r--  kernel/irq/internals.h | 11
-rw-r--r--  kernel/irq/irqdesc.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 18
-rw-r--r--  kernel/irq/manage.c | 64
-rw-r--r--  kernel/irq/msi.c | 17
-rw-r--r--  kernel/irq/pm.c | 12
-rw-r--r--  kernel/irq/resend.c | 4
-rw-r--r--  kernel/irq/spurious.c | 26
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/kthread.c | 24
-rw-r--r--  kernel/livepatch/core.c | 6
-rw-r--r--  kernel/module.c | 8
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/pid.c | 5
-rw-r--r--  kernel/power/Kconfig | 10
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/power/swap.c | 12
-rw-r--r--  kernel/power/wakelock.c | 18
-rw-r--r--  kernel/rcu/rcutorture.c | 42
-rw-r--r--  kernel/rcu/srcu.c | 15
-rw-r--r--  kernel/rcu/tiny.c | 8
-rw-r--r--  kernel/rcu/tree.c | 681
-rw-r--r--  kernel/rcu/tree.h | 96
-rw-r--r--  kernel/rcu/tree_plugin.h | 130
-rw-r--r--  kernel/rcu/tree_trace.c | 19
-rw-r--r--  kernel/rcu/update.c | 90
-rw-r--r--  kernel/sched/core.c | 121
-rw-r--r--  kernel/sched/cputime.c | 101
-rw-r--r--  kernel/sched/deadline.c | 40
-rw-r--r--  kernel/sched/debug.c | 48
-rw-r--r--  kernel/sched/fair.c | 937
-rw-r--r--  kernel/sched/features.h | 18
-rw-r--r--  kernel/sched/idle.c | 14
-rw-r--r--  kernel/sched/idle_task.c | 1
-rw-r--r--  kernel/sched/rt.c | 42
-rw-r--r--  kernel/sched/sched.h | 39
-rw-r--r--  kernel/sched/stop_task.c | 1
-rw-r--r--  kernel/signal.c | 13
-rw-r--r--  kernel/stop_machine.c | 44
-rw-r--r--  kernel/sys.c | 3
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/hrtimer.c | 36
-rw-r--r--  kernel/time/ntp.c | 5
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 49
-rw-r--r--  kernel/time/tick-common.c | 3
-rw-r--r--  kernel/time/tick-sched.c | 72
-rw-r--r--  kernel/time/time.c | 53
-rw-r--r--  kernel/time/timekeeping.c | 19
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 10
-rw-r--r--  kernel/trace/bpf_trace.c | 63
-rw-r--r--  kernel/trace/trace_kprobe.c | 20
-rw-r--r--  kernel/trace/trace_sched_switch.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_uprobe.c | 22
-rw-r--r--  kernel/user_namespace.c | 4
-rw-r--r--  kernel/workqueue.c | 28
77 files changed, 2711 insertions, 1809 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..718fb8afab7a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
}
late_initcall(register_array_map);
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
- /* only bpf_prog file descriptors can be stored in prog_array map */
+ /* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
return array_map_alloc(attr);
}
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
- BUG_ON(array->prog[i] != NULL);
+ BUG_ON(array->ptrs[i] != NULL);
kvfree(array);
}
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
return NULL;
}
/* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct bpf_prog *prog, *old_prog;
+ void *new_ptr, *old_ptr;
u32 index = *(u32 *)key, ufd;
if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
return -E2BIG;
ufd = *(u32 *)value;
- prog = bpf_prog_get(ufd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if (!bpf_prog_array_compatible(array, prog)) {
- bpf_prog_put(prog);
- return -EINVAL;
- }
+ new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ if (IS_ERR(new_ptr))
+ return PTR_ERR(new_ptr);
- old_prog = xchg(array->prog + index, prog);
- if (old_prog)
- bpf_prog_put_rcu(old_prog);
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ if (old_ptr)
+ map->ops->map_fd_put_ptr(old_ptr);
return 0;
}
-static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct bpf_prog *old_prog;
+ void *old_ptr;
u32 index = *(u32 *)key;
if (index >= array->map.max_entries)
return -E2BIG;
- old_prog = xchg(array->prog + index, NULL);
- if (old_prog) {
- bpf_prog_put_rcu(old_prog);
+ old_ptr = xchg(array->ptrs + index, NULL);
+ if (old_ptr) {
+ map->ops->map_fd_put_ptr(old_ptr);
return 0;
} else {
return -ENOENT;
}
}
+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog = bpf_prog_get(fd);
+ if (IS_ERR(prog))
+ return prog;
+
+ if (!bpf_prog_array_compatible(array, prog)) {
+ bpf_prog_put(prog);
+ return ERR_PTR(-EINVAL);
+ }
+ return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+ struct bpf_prog *prog = ptr;
+
+ bpf_prog_put_rcu(prog);
+}
+
/* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_prog_array_map_clear(struct bpf_map *map)
+void bpf_fd_array_map_clear(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
- prog_array_map_delete_elem(map, &i);
+ fd_array_map_delete_elem(map, &i);
}
static const struct bpf_map_ops prog_array_ops = {
- .map_alloc = prog_array_map_alloc,
- .map_free = prog_array_map_free,
+ .map_alloc = fd_array_map_alloc,
+ .map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_lookup_elem = prog_array_map_lookup_elem,
- .map_update_elem = prog_array_map_update_elem,
- .map_delete_elem = prog_array_map_delete_elem,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = prog_fd_array_get_ptr,
+ .map_fd_put_ptr = prog_fd_array_put_ptr,
};
static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void)
return 0;
}
late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct perf_event *event;
+ const struct perf_event_attr *attr;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return event;
+
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr))
+ return (void *)attr;
+
+ if (attr->type != PERF_TYPE_RAW &&
+ attr->type != PERF_TYPE_HARDWARE) {
+ perf_event_release_kernel(event);
+ return ERR_PTR(-EINVAL);
+ }
+ return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+ struct perf_event *event = ptr;
+
+ perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = perf_event_array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+ .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);
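The arraymap hunks above turn the old prog_array into a generic fd-array map with per-type map_fd_get_ptr()/map_fd_put_ptr() callbacks and register BPF_MAP_TYPE_PERF_EVENT_ARRAY on top of it. As a rough illustration of how user space could feed such a map, here is a hypothetical sketch (not part of the patch; sys_bpf()/setup_perf_event_array() are made-up wrapper names and the uapi constants are assumed to come from headers matching this series):

/* Hypothetical user-space sketch: store a perf event fd in slot 0 of a
 * BPF_MAP_TYPE_PERF_EVENT_ARRAY. Error handling is trimmed. */
#include <linux/bpf.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(int cmd, union bpf_attr *attr)
{
    return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int setup_perf_event_array(int nr_cpus)
{
    union bpf_attr attr;
    struct perf_event_attr pe;
    __u32 key = 0, value;
    int map_fd, event_fd;

    memset(&attr, 0, sizeof(attr));
    attr.map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
    attr.key_size = sizeof(__u32);
    attr.value_size = sizeof(__u32);   /* fd-array maps only hold u32 fds */
    attr.max_entries = nr_cpus;
    map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
    if (map_fd < 0)
        return -1;

    memset(&pe, 0, sizeof(pe));
    pe.size = sizeof(pe);
    pe.type = PERF_TYPE_HARDWARE;      /* only RAW/HARDWARE types are accepted */
    pe.config = PERF_COUNT_HW_CPU_CYCLES;
    event_fd = syscall(__NR_perf_event_open, &pe, -1, 0, -1, 0);
    if (event_fd < 0)
        return -1;

    value = event_fd;
    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key = (__u64)(unsigned long)&key;
    attr.value = (__u64)(unsigned long)&value;
    attr.flags = BPF_ANY;              /* the only flag the update path accepts */
    return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);
}
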
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c5bedc82bc1c..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
return 0;
}
+EXPORT_SYMBOL_GPL(__bpf_call_base);
/**
* __bpf_prog_run - run eBPF program on a given context
@@ -449,11 +450,15 @@ select_insn:
tail_call_cnt++;
- prog = READ_ONCE(array->prog[index]);
+ prog = READ_ONCE(array->ptrs[index]);
if (unlikely(!prog))
goto out;
- ARG1 = BPF_R1;
+ /* ARG1 at this point is guaranteed to point to CTX from
+ * the verifier side due to the fact that the tail call is
+ * handled like a helper, that is, bpf_tail_call_proto,
+ * where arg1_type is ARG_PTR_TO_CTX.
+ */
insn = prog->insnsi;
goto select_insn;
out:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..dc9b464fefa9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
/* prog_array stores refcnt-ed bpf_prog pointers
* release them all when user space closes prog_array_fd
*/
- bpf_prog_array_map_clear(map);
+ bpf_fd_array_map_clear(map);
bpf_map_put(map);
return 0;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866fd36a..ed12e385fb75 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = "imm",
};
+static const struct {
+ int map_type;
+ int func_id;
+} func_limit[] = {
+ {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+ {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
static void print_verifier_state(struct verifier_env *env)
{
enum bpf_reg_type t;
@@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
struct verifier_state *state = &env->cur_state;
int size, err = 0;
+ if (state->regs[regno].type == PTR_TO_STACK)
+ off += state->regs[regno].imm;
+
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
@@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
- } else if (state->regs[regno].type == FRAME_PTR) {
+ } else if (state->regs[regno].type == FRAME_PTR ||
+ state->regs[regno].type == PTR_TO_STACK) {
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
@@ -833,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return err;
}
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+ bool bool_map, bool_func;
+ int i;
+
+ if (!map)
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
+ bool_map = (map->map_type == func_limit[i].map_type);
+ bool_func = (func_id == func_limit[i].func_id);
+ /* only when the map & func pair matches can it continue;
+ * don't allow any other map type to be passed into
+ * the special func;
+ */
+ if (bool_map != bool_func)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_call(struct verifier_env *env, int func_id)
{
struct verifier_state *state = &env->cur_state;
@@ -908,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
return -EINVAL;
}
- if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
- func_id != BPF_FUNC_tail_call)
- /* prog_array map type needs extra care:
- * only allow to pass it into bpf_tail_call() for now.
- * bpf_map_delete_elem() can be allowed in the future,
- * while bpf_map_update_elem() must only be done via syscall
- */
- return -EINVAL;
-
- if (func_id == BPF_FUNC_tail_call &&
- map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
- /* don't allow any other map type to be passed into
- * bpf_tail_call()
- */
- return -EINVAL;
+ err = check_map_func_compatibility(map, func_id);
+ if (err)
+ return err;
return 0;
}
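check_map_func_compatibility() replaces the two open-coded tests with a table-driven rule: each map type listed in func_limit[] may only be passed to its paired helper (prog_array to bpf_tail_call(), perf_event_array to bpf_perf_event_read()), and those helpers accept no other map type. A small stand-alone model of the bool_map != bool_func check, for illustration only (the enum values are stand-ins, not the kernel's):

/* Illustrative model of the pairing rule: a (map, helper) combination is
 * rejected unless, for every table row, both sides match or neither does. */
#include <assert.h>
#include <stddef.h>

enum { MAP_PROG_ARRAY, MAP_PERF_EVENT_ARRAY, MAP_HASH };
enum { FUNC_tail_call, FUNC_perf_event_read, FUNC_map_lookup };

static const struct { int map_type, func_id; } func_limit[] = {
    { MAP_PROG_ARRAY,       FUNC_tail_call },
    { MAP_PERF_EVENT_ARRAY, FUNC_perf_event_read },
};

static int compat(int map_type, int func_id)
{
    size_t i;

    for (i = 0; i < sizeof(func_limit) / sizeof(func_limit[0]); i++) {
        int m = (map_type == func_limit[i].map_type);
        int f = (func_id == func_limit[i].func_id);

        if (m != f)     /* special map with the wrong helper, or vice versa */
            return -1;
    }
    return 0;
}

int main(void)
{
    assert(compat(MAP_PROG_ARRAY, FUNC_tail_call) == 0);
    assert(compat(MAP_HASH, FUNC_map_lookup) == 0);
    assert(compat(MAP_HASH, FUNC_tail_call) < 0);
    assert(compat(MAP_PROG_ARRAY, FUNC_map_lookup) < 0);
    return 0;
}
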
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..f3f5cd5e2c0d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock);
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
#define cgroup_assert_mutex_or_rcu_locked() \
- rcu_lockdep_assert(rcu_read_lock_held() || \
- lockdep_is_held(&cgroup_mutex), \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
+ !lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required");
/*
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = {
* part of that cgroup.
*/
struct cgroup_root cgrp_dfl_root;
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
* The default hierarchy always exists but is hidden until mounted for the
@@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1;
static unsigned long have_fork_callback __read_mostly;
static unsigned long have_exit_callback __read_mostly;
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
+
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
@@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
idr_preload(gfp_mask);
spin_lock_bh(&cgroup_idr_lock);
- ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
spin_unlock_bh(&cgroup_idr_lock);
idr_preload_end();
return ret;
@@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
{
+ struct cgroup_subsys *ss = cft->ss;
+
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cft->ss->name, cft->name);
+ cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ cft->name);
else
strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
@@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq,
struct cgroup_subsys *ss;
int ssid;
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(seq, ",%s", ss->name);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(seq, ",%s", ss->legacy_name);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1447,7 +1455,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
}
for_each_subsys(ss, i) {
- if (strcmp(token, ss->name))
+ if (strcmp(token, ss->legacy_name))
continue;
if (ss->disabled)
continue;
@@ -1666,7 +1674,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
lockdep_assert_held(&cgroup_mutex);
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
if (ret < 0)
goto out;
root_cgrp->id = ret;
@@ -4579,7 +4587,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
if (err)
goto err_free_css;
- err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
if (err < 0)
goto err_free_percpu_ref;
css->id = err;
@@ -4656,7 +4664,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
goto out_cancel_ref;
@@ -4955,6 +4963,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
+ have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -4993,6 +5002,8 @@ int __init cgroup_init_early(void)
ss->id = i;
ss->name = cgroup_subsys_name[i];
+ if (!ss->legacy_name)
+ ss->legacy_name = cgroup_subsys_name[i];
if (ss->early_init)
cgroup_init_subsys(ss, true);
@@ -5136,9 +5147,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
continue;
seq_printf(m, "%d:", root->hierarchy_id);
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "",
+ ss->legacy_name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
@@ -5178,7 +5191,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->name, ss->root->hierarchy_id,
+ ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps), !ss->disabled);
mutex_unlock(&cgroup_mutex);
@@ -5197,6 +5210,19 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
+static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+ return &ss_priv[i - CGROUP_CANFORK_START];
+ return NULL;
+}
+
+static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ void **private = subsys_canfork_priv_p(ss_priv, i);
+ return private ? *private : NULL;
+}
+
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5212,6 +5238,57 @@ void cgroup_fork(struct task_struct *child)
}
/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i, j, ret;
+
+ for_each_subsys_which(ss, i, &have_canfork_callback) {
+ ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ if (ret)
+ goto out_revert;
+ }
+
+ return 0;
+
+out_revert:
+ for_each_subsys(ss, j) {
+ if (j >= i)
+ break;
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ }
+
+ return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+}
+
+/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
@@ -5221,7 +5298,8 @@ void cgroup_fork(struct task_struct *child)
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ void *old_ss_priv[CGROUP_CANFORK_COUNT])
{
struct cgroup_subsys *ss;
int i;
@@ -5266,7 +5344,7 @@ void cgroup_post_fork(struct task_struct *child)
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child);
+ ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
}
/**
@@ -5400,12 +5478,14 @@ static int __init cgroup_disable(char *str)
continue;
for_each_subsys(ss, i) {
- if (!strcmp(token, ss->name)) {
- ss->disabled = 1;
- printk(KERN_INFO "Disabling %s control group"
- " subsystem\n", ss->name);
- break;
- }
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ ss->disabled = 1;
+ printk(KERN_INFO "Disabling %s control group subsystem\n",
+ ss->name);
+ break;
}
}
return 1;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..806cd7693ac8
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,355 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * Use 64-bit types so that we can safely represent "max" as
+ * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+ */
+ atomic64_t counter;
+ int64_t limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+ return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pids_cgroup *pids;
+
+ pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ if (!pids)
+ return ERR_PTR(-ENOMEM);
+
+ pids->limit = PIDS_MAX;
+ atomic64_set(&pids->counter, 0);
+ return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+ /*
+ * A negative count (or overflow for that matter) is invalid,
+ * and indicates a bug in the `pids` controller proper.
+ */
+ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p, *q;
+
+ for (p = pids; p; p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ /*
+ * Since new is capped to the maximum number of pid_t, if
+ * p->limit is %PIDS_MAX then we know that this test will never
+ * fail.
+ */
+ if (new > p->limit)
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ for (q = pids; q != p; q = parent_pids(q))
+ pids_cancel(q, num);
+ pids_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ /*
+ * No need to pin @old_css between here and cancel_attach()
+ * because cgroup core protects it from being freed before
+ * the migration completes or fails.
+ */
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(pids, 1);
+ pids_uncharge(old_pids, 1);
+ }
+
+ return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(old_pids, 1);
+ pids_uncharge(pids, 1);
+ }
+}
+
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+ struct cgroup_subsys_state *css;
+ struct pids_cgroup *pids;
+ int err;
+
+ /*
+ * Use the "current" task_css for the pids subsystem as the tentative
+ * css. It is possible we will charge the wrong hierarchy, in which
+ * case we will forcefully revert/reapply the charge on the right
+ * hierarchy after it is committed to the task proper.
+ */
+ css = task_get_css(current, pids_cgrp_id);
+ pids = css_pids(css);
+
+ err = pids_try_charge(pids, 1);
+ if (err)
+ goto err_css_put;
+
+ *priv_p = css;
+ return 0;
+
+err_css_put:
+ css_put(css);
+ return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css = priv;
+ struct pids_cgroup *pids = css_pids(css);
+
+ pids_uncharge(pids, 1);
+ css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *old_css = priv;
+ struct pids_cgroup *pids;
+ struct pids_cgroup *old_pids = css_pids(old_css);
+
+ css = task_get_css(task, pids_cgrp_id);
+ pids = css_pids(css);
+
+ /*
+ * If the association has changed, we have to revert and reapply the
+ * charge/uncharge on the wrong hierarchy to the current one. Since
+ * the association can only change due to an organisation event, it's
+ * okay for us to ignore the limit in this case.
+ */
+ if (pids != old_pids) {
+ pids_uncharge(old_pids, 1);
+ pids_charge(pids, 1);
+ }
+
+ css_put(css);
+ css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+ struct cgroup_subsys_state *old_css,
+ struct task_struct *task)
+{
+ struct pids_cgroup *pids = css_pids(old_css);
+
+ pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit;
+ int err;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, PIDS_MAX_STR)) {
+ limit = PIDS_MAX;
+ goto set_limit;
+ }
+
+ err = kstrtoll(buf, 0, &limit);
+ if (err)
+ return err;
+
+ if (limit < 0 || limit >= PIDS_MAX)
+ return -EINVAL;
+
+set_limit:
+ /*
+ * Limit updates don't need to be mutex'd, since it isn't
+ * critical that any racing fork()s follow the new limit.
+ */
+ pids->limit = limit;
+ return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(sf);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit = pids->limit;
+
+ if (limit >= PIDS_MAX)
+ seq_printf(sf, "%s\n", PIDS_MAX_STR);
+ else
+ seq_printf(sf, "%lld\n", limit);
+
+ return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return atomic64_read(&pids->counter);
+}
+
+static struct cftype pids_files[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+ .css_alloc = pids_css_alloc,
+ .css_free = pids_css_free,
+ .can_attach = pids_can_attach,
+ .cancel_attach = pids_cancel_attach,
+ .can_fork = pids_can_fork,
+ .cancel_fork = pids_cancel_fork,
+ .fork = pids_fork,
+ .exit = pids_exit,
+ .legacy_cftypes = pids_files,
+ .dfl_cftypes = pids_files,
+};
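The header comment of the new controller describes its interface: write a number or "max" to pids.max, read the hierarchical task count from pids.current, and note that organisational moves are never blocked. A minimal user-space sketch of exercising it, assuming a legacy-hierarchy mount at /sys/fs/cgroup/pids and an existing group called "demo" (both assumptions, not taken from the patch):

/* Hypothetical sketch: cap a cgroup at 64 tasks via the pids controller,
 * move the current process into it, and read back pids.current. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
    int fd = open(path, O_WRONLY);

    if (fd < 0)
        return -1;
    if (write(fd, val, strlen(val)) < 0) {
        close(fd);
        return -1;
    }
    return close(fd);
}

int main(void)
{
    char buf[32];
    int fd, n;

    /* pids.max accepts a number or the literal string "max" */
    if (write_str("/sys/fs/cgroup/pids/demo/pids.max", "64"))
        perror("pids.max");

    /* move this task into the group; fork() beyond the limit now fails */
    snprintf(buf, sizeof(buf), "%d", getpid());
    if (write_str("/sys/fs/cgroup/pids/demo/cgroup.procs", buf))
        perror("cgroup.procs");

    /* pids.current reports tasks charged to this group and its children */
    fd = open("/sys/fs/cgroup/pids/demo/pids.current", O_RDONLY);
    if (fd >= 0) {
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
            buf[n] = '\0';
            printf("pids.current = %s", buf);
        }
        close(fd);
    }
    return 0;
}
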
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5644ec5582b9..82cf9dff4295 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -191,21 +191,22 @@ void cpu_hotplug_done(void)
void cpu_hotplug_disable(void)
{
cpu_maps_update_begin();
- cpu_hotplug_disabled = 1;
+ cpu_hotplug_disabled++;
cpu_maps_update_done();
}
+EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
+ WARN_ON(--cpu_hotplug_disabled < 0);
cpu_maps_update_done();
}
-
+EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
-int __ref register_cpu_notifier(struct notifier_block *nb)
+int register_cpu_notifier(struct notifier_block *nb)
{
int ret;
cpu_maps_update_begin();
@@ -214,7 +215,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
-int __ref __register_cpu_notifier(struct notifier_block *nb)
+int __register_cpu_notifier(struct notifier_block *nb)
{
return raw_notifier_chain_register(&cpu_chain, nb);
}
@@ -244,7 +245,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);
-void __ref unregister_cpu_notifier(struct notifier_block *nb)
+void unregister_cpu_notifier(struct notifier_block *nb)
{
cpu_maps_update_begin();
raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -252,7 +253,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
-void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+void __unregister_cpu_notifier(struct notifier_block *nb)
{
raw_notifier_chain_unregister(&cpu_chain, nb);
}
@@ -329,7 +330,7 @@ struct take_cpu_down_param {
};
/* Take this CPU down. */
-static int __ref take_cpu_down(void *_param)
+static int take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
int err;
@@ -348,7 +349,7 @@ static int __ref take_cpu_down(void *_param)
}
/* Requires cpu_add_remove_lock to be held */
-static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
+static int _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
@@ -381,14 +382,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
* will observe it.
*
* For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so explicitly call both.
+ * not imply sync_sched(), so wait for both.
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
-#ifdef CONFIG_PREEMPT
- synchronize_sched();
-#endif
- synchronize_rcu();
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
smpboot_park_threads(cpu);
@@ -401,7 +402,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
- err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+ err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
@@ -442,7 +443,7 @@ out_release:
return err;
}
-int __ref cpu_down(unsigned int cpu)
+int cpu_down(unsigned int cpu)
{
int err;
@@ -608,13 +609,18 @@ int disable_nonboot_cpus(void)
}
}
- if (!error) {
+ if (!error)
BUG_ON(num_online_cpus() > 1);
- /* Make sure the CPUs won't be enabled by someone else */
- cpu_hotplug_disabled = 1;
- } else {
+ else
pr_err("Non-boot CPUs are not disabled\n");
- }
+
+ /*
+ * Make sure the CPUs won't be enabled by someone else. We need to do
+ * this even in case of failure as all disable_nonboot_cpus() users are
+ * supposed to do enable_nonboot_cpus() on the failure path.
+ */
+ cpu_hotplug_disabled++;
+
cpu_maps_update_done();
return error;
}
@@ -627,13 +633,13 @@ void __weak arch_enable_nonboot_cpus_end(void)
{
}
-void __ref enable_nonboot_cpus(void)
+void enable_nonboot_cpus(void)
{
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
+ WARN_ON(--cpu_hotplug_disabled < 0);
if (cpumask_empty(frozen_cpus))
goto out;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee14e3a35a29..f0acff0f66c9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
- update_nodemasks_hier(cs, &cs->mems_allowed);
+ update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
return retval;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..e8183895691c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
+static atomic_t nr_switch_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -1868,8 +1869,6 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
perf_set_shadow_time(event, ctx, tstamp);
perf_log_itrace_start(event);
@@ -1881,6 +1880,8 @@ event_sched_in(struct perf_event *event,
goto out;
}
+ event->tstamp_running += tstamp - event->tstamp_stopped;
+
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
@@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev,
local_irq_restore(flags);
}
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool sched_in);
+
#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
@@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, next, false);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
@@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, prev, true);
+
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
}
@@ -3212,6 +3222,59 @@ static inline u64 perf_event_count(struct perf_event *event)
return __perf_event_count(event);
}
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ * - either for the current task, or for this CPU
+ * - does not have inherit set, for inherited task events
+ * will not be local and we cannot read them atomically
+ * - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+ unsigned long flags;
+ u64 val;
+
+ /*
+ * Disabling interrupts avoids all counter scheduling (context
+ * switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
+ /* If this is a per-task event, it must be for current */
+ WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current);
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id());
+
+ /*
+ * It must not be an event with inherit set, we cannot read
+ * all child counters from atomic context.
+ */
+ WARN_ON_ONCE(event->attr.inherit);
+
+ /*
+ * It must not have a pmu::count method, those are not
+ * NMI safe.
+ */
+ WARN_ON_ONCE(event->pmu->count);
+
+ /*
+ * If the event is currently on this CPU, it's either a per-task event,
+ * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
+ * oncpu == -1).
+ */
+ if (event->oncpu == smp_processor_id())
+ event->pmu->read(event);
+
+ val = local64_read(&event->count);
+ local_irq_restore(flags);
+
+ return val;
+}
+
static u64 perf_event_read(struct perf_event *event)
{
/*
@@ -3454,6 +3517,10 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_task_events);
if (event->attr.freq)
atomic_dec(&nr_freq_events);
+ if (event->attr.context_switch) {
+ static_key_slow_dec_deferred(&perf_sched_events);
+ atomic_dec(&nr_switch_events);
+ }
if (is_cgroup_event(event))
static_key_slow_dec_deferred(&perf_sched_events);
if (has_branch_stack(event))
@@ -3958,28 +4025,21 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(sibling, func);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
-{
- struct perf_event_context *ctx = event->ctx;
- int ret = 0, active;
+struct period_event {
+ struct perf_event *event;
u64 value;
+};
- if (!is_sampling_event(event))
- return -EINVAL;
-
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
-
- if (!value)
- return -EINVAL;
+static int __perf_event_period(void *info)
+{
+ struct period_event *pe = info;
+ struct perf_event *event = pe->event;
+ struct perf_event_context *ctx = event->ctx;
+ u64 value = pe->value;
+ bool active;
- raw_spin_lock_irq(&ctx->lock);
+ raw_spin_lock(&ctx->lock);
if (event->attr.freq) {
- if (value > sysctl_perf_event_sample_rate) {
- ret = -EINVAL;
- goto unlock;
- }
-
event->attr.sample_freq = value;
} else {
event->attr.sample_period = value;
@@ -3998,11 +4058,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
+ raw_spin_unlock(&ctx->lock);
-unlock:
+ return 0;
+}
+
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+ struct period_event pe = { .event = event, };
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task;
+ u64 value;
+
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ if (copy_from_user(&value, arg, sizeof(value)))
+ return -EFAULT;
+
+ if (!value)
+ return -EINVAL;
+
+ if (event->attr.freq && value > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+
+ task = ctx->task;
+ pe.value = value;
+
+ if (!task) {
+ cpu_function_call(event->cpu, __perf_event_period, &pe);
+ return 0;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_event_period, &pe))
+ return 0;
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ task = ctx->task;
+ goto retry;
+ }
+
+ __perf_event_period(&pe);
raw_spin_unlock_irq(&ctx->lock);
- return ret;
+ return 0;
}
static const struct file_operations perf_fops;
@@ -4740,12 +4842,20 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up.
*/
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+ /* only the parent has fasync state */
+ if (event->parent)
+ event = event->parent;
+ return &event->fasync;
+}
+
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
if (event->pending_kill) {
- kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+ kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
event->pending_kill = 0;
}
}
@@ -5982,6 +6092,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
}
/*
+ * context_switch tracking
+ */
+
+struct perf_switch_event {
+ struct task_struct *task;
+ struct task_struct *next_prev;
+
+ struct {
+ struct perf_event_header header;
+ u32 next_prev_pid;
+ u32 next_prev_tid;
+ } event_id;
+};
+
+static int perf_event_switch_match(struct perf_event *event)
+{
+ return event->attr.context_switch;
+}
+
+static void perf_event_switch_output(struct perf_event *event, void *data)
+{
+ struct perf_switch_event *se = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_switch_match(event))
+ return;
+
+ /* Only CPU-wide events are allowed to see next/prev pid/tid */
+ if (event->ctx->task) {
+ se->event_id.header.type = PERF_RECORD_SWITCH;
+ se->event_id.header.size = sizeof(se->event_id.header);
+ } else {
+ se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
+ se->event_id.header.size = sizeof(se->event_id);
+ se->event_id.next_prev_pid =
+ perf_event_pid(event, se->next_prev);
+ se->event_id.next_prev_tid =
+ perf_event_tid(event, se->next_prev);
+ }
+
+ perf_event_header__init_id(&se->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event, se->event_id.header.size);
+ if (ret)
+ return;
+
+ if (event->ctx->task)
+ perf_output_put(&handle, se->event_id.header);
+ else
+ perf_output_put(&handle, se->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool sched_in)
+{
+ struct perf_switch_event switch_event;
+
+ /* N.B. caller checks nr_switch_events != 0 */
+
+ switch_event = (struct perf_switch_event){
+ .task = task,
+ .next_prev = next_prev,
+ .event_id = {
+ .header = {
+ /* .type */
+ .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
+ /* .size */
+ },
+ /* .next_prev_pid */
+ /* .next_prev_tid */
+ },
+ };
+
+ perf_event_aux(perf_event_switch_output,
+ &switch_event,
+ NULL);
+}
+
+/*
* IRQ throttle logging
*/
@@ -6040,8 +6235,6 @@ static void perf_log_itrace_start(struct perf_event *event)
event->hw.itrace_started)
return;
- event->hw.itrace_started = 1;
-
rec.header.type = PERF_RECORD_ITRACE_START;
rec.header.misc = 0;
rec.header.size = sizeof(rec);
@@ -6124,7 +6317,7 @@ static int __perf_event_overflow(struct perf_event *event,
else
perf_event_output(event, data, regs);
- if (event->fasync && event->pending_kill) {
+ if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
irq_work_queue(&event->pending);
}
@@ -6749,8 +6942,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
if (event->tp_event->prog)
return -EEXIST;
- if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
- /* bpf programs can only be attached to kprobes */
+ if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
+ /* bpf programs can only be attached to u/kprobes */
return -EINVAL;
prog = bpf_prog_get(prog_fd);
@@ -7479,6 +7672,10 @@ static void account_event(struct perf_event *event)
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_full_kick_all();
}
+ if (event->attr.context_switch) {
+ atomic_inc(&nr_switch_events);
+ static_key_slow_inc(&perf_sched_events.key);
+ }
if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
if (is_cgroup_event(event))
@@ -8574,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
+struct perf_event *perf_event_get(unsigned int fd)
+{
+ int err;
+ struct fd f;
+ struct perf_event *event;
+
+ err = perf_fget_light(fd, &f);
+ if (err)
+ return ERR_PTR(err);
+
+ event = f.file->private_data;
+ atomic_long_inc(&event->refcount);
+ fdput(f);
+
+ return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+ if (!event)
+ return ERR_PTR(-EINVAL);
+
+ return &event->attr;
+}
+
/*
* inherit a event from parent task to child task:
*/
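The events/core.c hunks add PERF_RECORD_SWITCH / PERF_RECORD_SWITCH_CPU_WIDE side-band records, gated on the new attr.context_switch bit and the nr_switch_events counter. A rough user-space sketch of requesting them is below; it assumes perf_event_attr from headers matching this series (which carry the context_switch bitfield) and leaves out the mmap ring-buffer consumption:

/* Hypothetical sketch: open a CPU-wide dummy event that emits context
 * switch records. Ring-buffer setup and reading are omitted. */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int open_switch_events(int cpu)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = PERF_TYPE_SOFTWARE;
    attr.config = PERF_COUNT_SW_DUMMY;
    attr.sample_period = 1;
    attr.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME;
    attr.context_switch = 1;   /* request PERF_RECORD_SWITCH* records */

    /* pid == -1, cpu >= 0: a CPU-wide event, so the records are
     * PERF_RECORD_SWITCH_CPU_WIDE and carry next/prev pid/tid. */
    return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}
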
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index b2be01b1aa9d..182bc30899d5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order)
if (page && order) {
/*
- * Communicate the allocation size to the driver
+ * Communicate the allocation size to the driver:
+ * if we managed to secure a high-order allocation,
+ * set its first page's private to this order;
+ * !PagePrivate(page) means it's just a normal page.
*/
split_page(page, order);
SetPagePrivate(page);
@@ -559,11 +562,13 @@ static void __rb_free_aux(struct ring_buffer *rb)
rb->aux_priv = NULL;
}
- for (pg = 0; pg < rb->aux_nr_pages; pg++)
- rb_free_aux_page(rb, pg);
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
- kfree(rb->aux_pages);
- rb->aux_nr_pages = 0;
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
}
void rb_free_aux(struct ring_buffer *rb)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f26a22d..4e5e9798aa0c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -86,15 +86,6 @@ struct uprobe {
struct arch_uprobe arch;
};
-struct return_instance {
- struct uprobe *uprobe;
- unsigned long func;
- unsigned long orig_ret_vaddr; /* original return address */
- bool chained; /* true, if instance is nested */
-
- struct return_instance *next; /* keep as stack */
-};
-
/*
* Execute out of line area: anonymous executable mapping installed
* by the probed task to execute the copy of the original instruction
@@ -105,17 +96,18 @@ struct return_instance {
* allocated.
*/
struct xol_area {
- wait_queue_head_t wq; /* if all slots are busy */
- atomic_t slot_count; /* number of in-use slots */
- unsigned long *bitmap; /* 0 = free slot */
- struct page *page;
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct vm_special_mapping xol_mapping;
+ struct page *pages[2];
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
* the vma go away, and we must handle that reasonably gracefully.
*/
- unsigned long vaddr; /* Page(s) of instruction slots */
+ unsigned long vaddr; /* Page(s) of instruction slots */
};
/*
@@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
}
+static struct uprobe *get_uprobe(struct uprobe *uprobe)
+{
+ atomic_inc(&uprobe->ref);
+ return uprobe;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (atomic_dec_and_test(&uprobe->ref))
+ kfree(uprobe);
+}
+
static int match_uprobe(struct uprobe *l, struct uprobe *r)
{
if (l->inode < r->inode)
@@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
while (n) {
uprobe = rb_entry(n, struct uprobe, rb_node);
match = match_uprobe(&u, uprobe);
- if (!match) {
- atomic_inc(&uprobe->ref);
- return uprobe;
- }
+ if (!match)
+ return get_uprobe(uprobe);
if (match < 0)
n = n->rb_left;
@@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
parent = *p;
u = rb_entry(parent, struct uprobe, rb_node);
match = match_uprobe(uprobe, u);
- if (!match) {
- atomic_inc(&u->ref);
- return u;
- }
+ if (!match)
+ return get_uprobe(u);
if (match < 0)
p = &parent->rb_left;
@@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
return u;
}
-static void put_uprobe(struct uprobe *uprobe)
-{
- if (atomic_dec_and_test(&uprobe->ref))
- kfree(uprobe);
-}
-
static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
{
struct uprobe *uprobe, *cur_uprobe;
@@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode,
if (u->inode != inode || u->offset < min)
break;
list_add(&u->pending_list, head);
- atomic_inc(&u->ref);
+ get_uprobe(u);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
list_add(&u->pending_list, head);
- atomic_inc(&u->ref);
+ get_uprobe(u);
}
}
spin_unlock(&uprobes_treelock);
@@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
- int ret = -EALREADY;
+ struct vm_area_struct *vma;
+ int ret;
down_write(&mm->mmap_sem);
- if (mm->uprobes_state.xol_area)
+ if (mm->uprobes_state.xol_area) {
+ ret = -EALREADY;
goto fail;
+ }
if (!area->vaddr) {
/* Try to map as high as possible, this is only a hint. */
@@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
}
- ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
- VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
- if (ret)
+ vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
+ &area->xol_mapping);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
goto fail;
+ }
+ ret = 0;
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
fail:
@@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->page = alloc_page(GFP_HIGHUSER);
- if (!area->page)
+ area->xol_mapping.name = "[uprobes]";
+ area->xol_mapping.pages = area->pages;
+ area->pages[0] = alloc_page(GFP_HIGHUSER);
+ if (!area->pages[0])
goto free_bitmap;
+ area->pages[1] = NULL;
area->vaddr = vaddr;
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
- copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
if (!xol_add_vma(mm, area))
return area;
- __free_page(area->page);
+ __free_page(area->pages[0]);
free_bitmap:
kfree(area->bitmap);
free_area:
@@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm)
if (!area)
return;
- put_page(area->page);
+ put_page(area->pages[0]);
kfree(area->bitmap);
kfree(area);
}
@@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
if (unlikely(!xol_vaddr))
return 0;
- arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
return xol_vaddr;
@@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
clear_bit(slot_nr, area->bitmap);
atomic_dec(&area->slot_count);
+ smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
if (waitqueue_active(&area->wq))
wake_up(&area->wq);
@@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
+static struct return_instance *free_ret_instance(struct return_instance *ri)
+{
+ struct return_instance *next = ri->next;
+ put_uprobe(ri->uprobe);
+ kfree(ri);
+ return next;
+}
+
/*
* Called with no locks held.
* Called in context of a exiting or a exec-ing thread.
@@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
- struct return_instance *ri, *tmp;
+ struct return_instance *ri;
if (!utask)
return;
@@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t)
put_uprobe(utask->active_uprobe);
ri = utask->return_instances;
- while (ri) {
- tmp = ri;
- ri = ri->next;
-
- put_uprobe(tmp->uprobe);
- kfree(tmp);
- }
+ while (ri)
+ ri = free_ret_instance(ri);
xol_free_insn_slot(t);
kfree(utask);
@@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return -ENOMEM;
*n = *o;
- atomic_inc(&n->uprobe->ref);
+ get_uprobe(n->uprobe);
n->next = NULL;
*p = n;
@@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void)
return trampoline_vaddr;
}
+static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
+ struct pt_regs *regs)
+{
+ struct return_instance *ri = utask->return_instances;
+ enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
+
+ while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
+ ri = free_ret_instance(ri);
+ utask->depth--;
+ }
+ utask->return_instances = ri;
+}
+
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
struct return_instance *ri;
struct uprobe_task *utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
- bool chained = false;
+ bool chained;
if (!get_xol_area())
return;
@@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
return;
}
- ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
- goto fail;
+ return;
trampoline_vaddr = get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
goto fail;
+ /* drop the entries invalidated by longjmp() */
+ chained = (orig_ret_vaddr == trampoline_vaddr);
+ cleanup_return_instances(utask, chained, regs);
+
/*
* We don't want to keep trampoline address in stack, rather keep the
* original return address of first caller thru all the consequent
* instances. This also makes breakpoint unwrapping easier.
*/
- if (orig_ret_vaddr == trampoline_vaddr) {
+ if (chained) {
if (!utask->return_instances) {
/*
* This situation is not possible. Likely we have an
* attack from user-space.
*/
- pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
- current->pid, current->tgid);
+ uprobe_warn(current, "handle tail call");
goto fail;
}
-
- chained = true;
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- atomic_inc(&uprobe->ref);
- ri->uprobe = uprobe;
+ ri->uprobe = get_uprobe(uprobe);
ri->func = instruction_pointer(regs);
+ ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
-
- /* add instance to the stack */
ri->next = utask->return_instances;
utask->return_instances = ri;
return;
-
fail:
kfree(ri);
}
@@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
up_read(&uprobe->register_rwsem);
}
-static bool handle_trampoline(struct pt_regs *regs)
+static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
- struct uprobe_task *utask;
- struct return_instance *ri, *tmp;
bool chained;
+ do {
+ chained = ri->chained;
+ ri = ri->next; /* can't be NULL if chained */
+ } while (chained);
+
+ return ri;
+}
+
+static void handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *next;
+ bool valid;
+
utask = current->utask;
if (!utask)
- return false;
+ goto sigill;
ri = utask->return_instances;
if (!ri)
- return false;
-
- /*
- * TODO: we should throw out return_instance's invalidated by
- * longjmp(), currently we assume that the probed function always
- * returns.
- */
- instruction_pointer_set(regs, ri->orig_ret_vaddr);
-
- for (;;) {
- handle_uretprobe_chain(ri, regs);
-
- chained = ri->chained;
- put_uprobe(ri->uprobe);
-
- tmp = ri;
- ri = ri->next;
- kfree(tmp);
- utask->depth--;
+ goto sigill;
- if (!chained)
- break;
- BUG_ON(!ri);
- }
+ do {
+ /*
+ * We should throw out the frames invalidated by longjmp().
+ * If this chain is valid, then the next one should be alive
+ * or NULL; the latter case means that nobody but ri->func
+ * could hit this trampoline on return. TODO: sigaltstack().
+ */
+ next = find_next_ret_chain(ri);
+ valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
+
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+ do {
+ if (valid)
+ handle_uretprobe_chain(ri, regs);
+ ri = free_ret_instance(ri);
+ utask->depth--;
+ } while (ri != next);
+ } while (!valid);
utask->return_instances = ri;
+ return;
+
+ sigill:
+ uprobe_warn(current, "handle uretprobe, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
- return true;
}
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
@@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
return false;
}
+bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
+ return true;
+}
+
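The weak default above treats every return_instance as still alive, which preserves the old behaviour on architectures that do not opt in. An architecture that wants the longjmp() cleanup to actually fire would override it with a stack-depth comparison against the ri->stack value saved in prepare_uretprobe(); a minimal sketch of such an override (an illustrative assumption, not part of this diff, and it presumes a downward-growing user stack):

bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
			     struct pt_regs *regs)
{
	/* RP_CHECK_CALL: the new call has already pushed below the saved frame. */
	if (ctx == RP_CHECK_CALL)
		return user_stack_pointer(regs) < ret->stack;

	/* Otherwise the frame is alive while the stack has not unwound past it. */
	return user_stack_pointer(regs) <= ret->stack;
}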
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs)
int uninitialized_var(is_swbp);
bp_vaddr = uprobe_get_swbp_addr(regs);
- if (bp_vaddr == get_trampoline_vaddr()) {
- if (handle_trampoline(regs))
- return;
-
- pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
- current->pid, current->tgid);
- }
+ if (bp_vaddr == get_trampoline_vaddr())
+ return handle_trampoline(regs);
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
diff --git a/kernel/exit.c b/kernel/exit.c
index 031325e9acf9..ea95ee1b5ef7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1471,7 +1471,7 @@ static long do_wait(struct wait_opts *wo)
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
/*
- * If there is nothing that can match our critiera just get out.
+ * If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
* might later match our criteria, even if we are not able to reap
* it yet.
diff --git a/kernel/fork.c b/kernel/fork.c
index dbd9b8d7b7cc..03aa2e6de7a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1072,6 +1072,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
rcu_assign_pointer(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
+
atomic_set(&sig->count, 1);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
return 0;
@@ -1133,6 +1134,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
seqlock_init(&sig->stats_lock);
+ prev_cputime_init(&sig->prev_cputime);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
@@ -1244,6 +1246,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
+ void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1278,10 +1281,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/*
* If the new process will be in a different pid or user namespace
- * do not allow it to share a thread group or signal handlers or
- * parent with the forking task.
+ * do not allow it to share a thread group with the forking task.
*/
- if (clone_flags & CLONE_SIGHAND) {
+ if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
@@ -1340,9 +1342,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+ prev_cputime_init(&p->prev_cputime);
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_init(&p->vtime_seqlock);
p->vtime_snap = 0;
@@ -1518,6 +1519,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->task_works = NULL;
/*
+ * Ensure that the cgroup subsystem policies allow the new process to be
+ * forked. It should be noted that the new process's css_set can be changed
+ * between here and cgroup_post_fork() if an organisation operation is in
+ * progress.
+ */
+ retval = cgroup_can_fork(p, cgrp_ss_priv);
+ if (retval)
+ goto bad_fork_free_pid;
+
+ /*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
@@ -1553,7 +1564,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
+ goto bad_fork_cancel_cgroup;
}
if (likely(p->pid)) {
@@ -1595,7 +1606,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, cgrp_ss_priv);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1605,6 +1616,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
+bad_fork_cancel_cgroup:
+ cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
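The new hooks follow a charge/commit/cancel shape: cgroup_can_fork() charges before the task becomes visible, cgroup_cancel_fork() undoes the charge on the error path, and cgroup_post_fork() commits. A toy controller-side sketch of the same idea (all names below are made up for illustration; this is not the pids controller's actual code):

struct toy_limit {
	atomic_t count;
	int max;
};

static int toy_can_fork(struct toy_limit *l)
{
	/* Charge optimistically; back out and refuse if the limit is hit. */
	if (atomic_inc_return(&l->count) > l->max) {
		atomic_dec(&l->count);
		return -EAGAIN;
	}
	return 0;
}

static void toy_cancel_fork(struct toy_limit *l)
{
	/* copy_process() failed after the charge: give it back. */
	atomic_dec(&l->count);
}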
@@ -1871,13 +1884,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
- * Not implemented, but pretend it works if there is nothing to
- * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
- * needs to unshare vm.
+ * Not implemented, but pretend it works if there is nothing
+ * to unshare. Note that unsharing the address space or the
+ * signal handlers also needs to unshare the signal queues (aka
+ * CLONE_THREAD).
*/
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
- /* FIXME: get_task_mm() increments ->mm_users */
- if (atomic_read(&current->mm->mm_users) > 1)
+ if (!thread_group_empty(current))
+ return -EINVAL;
+ }
+ if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
+ if (atomic_read(&current->sighand->count) > 1)
+ return -EINVAL;
+ }
+ if (unshare_flags & CLONE_VM) {
+ if (!current_is_single_threaded())
return -EINVAL;
}
@@ -1941,21 +1962,22 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
int err;
/*
- * If unsharing a user namespace must also unshare the thread.
+ * If unsharing a user namespace, must also unshare the thread group
+ * and unshare the filesystem root and working directories.
*/
if (unshare_flags & CLONE_NEWUSER)
unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
- * If unsharing a thread from a thread group, must also unshare vm.
- */
- if (unshare_flags & CLONE_THREAD)
- unshare_flags |= CLONE_VM;
- /*
* If unsharing vm, must also unshare signal handlers.
*/
if (unshare_flags & CLONE_VM)
unshare_flags |= CLONE_SIGHAND;
/*
+ * If unsharing signal handlers, must also unshare the signal queues.
+ */
+ if (unshare_flags & CLONE_SIGHAND)
+ unshare_flags |= CLONE_THREAD;
+ /*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
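For user space, the practical effect of the new implication chain is that a single flag can now pull in several others, and check_unshare_flags() will then reject callers that cannot satisfy them (for example a multithreaded process asking for CLONE_NEWUSER). A hedged user-space sketch:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/*
	 * CLONE_NEWUSER now implies CLONE_THREAD and CLONE_FS, CLONE_VM
	 * implies CLONE_SIGHAND, and CLONE_SIGHAND implies CLONE_THREAD,
	 * so this fails with EINVAL when called from a multithreaded process.
	 */
	if (unshare(CLONE_NEWUSER) == -1) {
		perror("unshare(CLONE_NEWUSER)");
		return 1;
	}
	printf("now running in a new user namespace\n");
	return 0;
}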
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 27f4332c7f84..6e40a9539763 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -63,7 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
return -EINVAL;
type &= IRQ_TYPE_SENSE_MASK;
- ret = __irq_set_trigger(desc, irq, type);
+ ret = __irq_set_trigger(desc, type);
irq_put_desc_busunlock(desc, flags);
return ret;
}
@@ -187,7 +187,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
irq_enable(desc);
}
if (resend)
- check_irq_resend(desc, desc->irq_data.irq);
+ check_irq_resend(desc);
return ret;
}
@@ -315,7 +315,7 @@ void handle_nested_irq(unsigned int irq)
raw_spin_lock_irq(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
@@ -328,7 +328,7 @@ void handle_nested_irq(unsigned int irq)
action_ret = action->thread_fn(action->irq, action->dev_id);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ note_interrupt(desc, action_ret);
raw_spin_lock_irq(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -391,7 +391,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
@@ -443,7 +443,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/*
* If it's disabled or no action available
@@ -515,7 +515,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/*
* If it's disabled or no action available
@@ -583,7 +583,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
}
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
@@ -646,7 +646,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
goto out_eoi;
}
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
do {
if (unlikely(!desc->action))
@@ -675,7 +675,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -705,7 +705,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
irqreturn_t res;
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -985,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
}
/**
+ * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
+{
+ data = data->parent_data;
+
+ if (data->chip->irq_set_type)
+ return data->chip->irq_set_type(data, type);
+
+ return -ENOSYS;
+}
+
+/**
* irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
* @data: Pointer to interrupt specific data
*
@@ -997,13 +1014,13 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
if (data->chip && data->chip->irq_retrigger)
return data->chip->irq_retrigger(data);
- return -ENOSYS;
+ return 0;
}
/**
* irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
* @data: Pointer to interrupt specific data
- * @dest: The vcpu affinity information
+ * @vcpu_info: The vcpu affinity information
*/
int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
{
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 15b370daf234..abd286afbd27 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -553,6 +553,9 @@ static int irq_gc_suspend(void)
if (data)
ct->chip.irq_suspend(data);
}
+
+ if (gc->suspend)
+ gc->suspend(gc);
}
return 0;
}
@@ -564,6 +567,9 @@ static void irq_gc_resume(void)
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
+ if (gc->resume)
+ gc->resume(gc);
+
if (ct->chip.irq_resume) {
struct irq_data *data = irq_gc_get_irq_data(gc);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 635480270858..b6eeea8a80c5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,7 +30,7 @@
void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
{
print_irq_desc(irq, desc);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
ack_bad_irq(irq);
}
@@ -176,7 +176,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
add_interrupt_randomness(irq, flags);
if (!noirqdebug)
- note_interrupt(irq, desc, retval);
+ note_interrupt(desc, retval);
return retval;
}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 61008b8433ab..eee4b385cffb 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,10 +59,9 @@ enum {
#include "debug.h"
#include "settings.h"
-extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags);
-extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
-extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
+extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
+extern void __disable_irq(struct irq_desc *desc);
+extern void __enable_irq(struct irq_desc *desc);
extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
@@ -86,7 +85,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *act
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq);
+void check_irq_resend(struct irq_desc *desc);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
@@ -187,7 +186,7 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
return __irqd_to_state(d) & mask;
}
-static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4afc457613dd..0a2a4b697bcb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -582,7 +582,7 @@ int irq_set_percpu_devid(unsigned int irq)
void kstat_incr_irq_this_cpu(unsigned int irq)
{
- kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
+ kstat_incr_irqs_this_cpu(irq_to_desc(irq));
}
/**
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c3577fef78c..79baaf8a7813 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -187,10 +187,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
/**
- * irq_find_host() - Locates a domain for a given device node
+ * irq_find_matching_host() - Locates a domain for a given device node
* @node: device-tree node of the interrupt controller
+ * @bus_token: bus token to match against; DOMAIN_BUS_ANY matches any domain
*/
-struct irq_domain *irq_find_host(struct device_node *node)
+struct irq_domain *irq_find_matching_host(struct device_node *node,
+ enum irq_domain_bus_token bus_token)
{
struct irq_domain *h, *found = NULL;
int rc;
@@ -199,13 +201,19 @@ struct irq_domain *irq_find_host(struct device_node *node)
* it might potentially be set to match all interrupts in
* the absence of a device node. This hasn't been a problem
* so far, though...
+ *
+ * bus_token == DOMAIN_BUS_ANY matches any domain, any other
+ * values must generate an exact match for the domain to be
+ * selected.
*/
mutex_lock(&irq_domain_mutex);
list_for_each_entry(h, &irq_domain_list, link) {
if (h->ops->match)
- rc = h->ops->match(h, node);
+ rc = h->ops->match(h, node, bus_token);
else
- rc = (h->of_node != NULL) && (h->of_node == node);
+ rc = ((h->of_node != NULL) && (h->of_node == node) &&
+ ((bus_token == DOMAIN_BUS_ANY) ||
+ (h->bus_token == bus_token)));
if (rc) {
found = h;
@@ -215,7 +223,7 @@ struct irq_domain *irq_find_host(struct device_node *node)
mutex_unlock(&irq_domain_mutex);
return found;
}
-EXPORT_SYMBOL_GPL(irq_find_host);
+EXPORT_SYMBOL_GPL(irq_find_matching_host);
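Drivers that provide their own ->match callback need the extra bus_token argument; a hedged sketch of what such a callback might look like under the new signature (my_domain_match and my_domain_ops are illustrative names):

#include <linux/irqdomain.h>

static int my_domain_match(struct irq_domain *d, struct device_node *node,
			   enum irq_domain_bus_token bus_token)
{
	/* Match our DT node, and either any bus or exactly the registered one. */
	return d->of_node == node &&
	       (bus_token == DOMAIN_BUS_ANY || d->bus_token == bus_token);
}

static const struct irq_domain_ops my_domain_ops = {
	.match	= my_domain_match,
	/* .map / .alloc / .free as usual for the domain */
};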
/**
* irq_set_default_host() - Set a "default" irq domain
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f9744853b656..ad1b064f94fe 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,6 +115,14 @@ EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
+static int __irq_can_set_affinity(struct irq_desc *desc)
+{
+ if (!desc || !irqd_can_balance(&desc->irq_data) ||
+ !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
+ return 0;
+ return 1;
+}
+
/**
* irq_can_set_affinity - Check if the affinity of a given irq can be set
* @irq: Interrupt to check
@@ -122,13 +130,7 @@ cpumask_var_t irq_default_affinity;
*/
int irq_can_set_affinity(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (!desc || !irqd_can_balance(&desc->irq_data) ||
- !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
- return 0;
-
- return 1;
+ return __irq_can_set_affinity(irq_to_desc(irq));
}
/**
@@ -359,14 +361,13 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
/*
* Generic version of the affinity autoselector.
*/
-static int
-setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
{
struct cpumask *set = irq_default_affinity;
int node = irq_desc_get_node(desc);
/* Excludes PER_CPU and NO_BALANCE interrupts */
- if (!irq_can_set_affinity(irq))
+ if (!__irq_can_set_affinity(desc))
return 0;
/*
@@ -393,10 +394,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
return 0;
}
#else
-static inline int
-setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
+/* Wrapper for ALPHA specific affinity selector magic */
+static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask)
{
- return irq_select_affinity(irq);
+ return irq_select_affinity(irq_desc_get_irq(d));
}
#endif
@@ -410,20 +411,20 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
int ret;
raw_spin_lock_irqsave(&desc->lock, flags);
- ret = setup_affinity(irq, desc, mask);
+ ret = setup_affinity(desc, mask);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
#else
static inline int
-setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+setup_affinity(struct irq_desc *desc, struct cpumask *mask)
{
return 0;
}
#endif
-void __disable_irq(struct irq_desc *desc, unsigned int irq)
+void __disable_irq(struct irq_desc *desc)
{
if (!desc->depth++)
irq_disable(desc);
@@ -436,7 +437,7 @@ static int __disable_irq_nosync(unsigned int irq)
if (!desc)
return -EINVAL;
- __disable_irq(desc, irq);
+ __disable_irq(desc);
irq_put_desc_busunlock(desc, flags);
return 0;
}
@@ -503,12 +504,13 @@ bool disable_hardirq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_hardirq);
-void __enable_irq(struct irq_desc *desc, unsigned int irq)
+void __enable_irq(struct irq_desc *desc)
{
switch (desc->depth) {
case 0:
err_out:
- WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+ WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n",
+ irq_desc_get_irq(desc));
break;
case 1: {
if (desc->istate & IRQS_SUSPENDED)
@@ -516,7 +518,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq)
/* Prevent probing on this irq: */
irq_settings_set_noprobe(desc);
irq_enable(desc);
- check_irq_resend(desc, irq);
+ check_irq_resend(desc);
/* fall-through */
}
default:
@@ -546,7 +548,7 @@ void enable_irq(unsigned int irq)
KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
goto out;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
out:
irq_put_desc_busunlock(desc, flags);
}
@@ -637,8 +639,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
return canrequest;
}
-int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags)
+int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
{
struct irq_chip *chip = desc->irq_data.chip;
int ret, unmask = 0;
@@ -648,7 +649,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_debug("No set_type function for IRQ %d (%s)\n", irq,
+ pr_debug("No set_type function for IRQ %d (%s)\n",
+ irq_desc_get_irq(desc),
chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -685,7 +687,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
break;
default:
pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
- flags, irq, chip->irq_set_type);
+ flags, irq_desc_get_irq(desc), chip->irq_set_type);
}
if (unmask)
unmask_irq(desc);
@@ -1221,8 +1223,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
- ret = __irq_set_trigger(desc, irq,
- new->flags & IRQF_TRIGGER_MASK);
+ ret = __irq_set_trigger(desc,
+ new->flags & IRQF_TRIGGER_MASK);
if (ret)
goto out_mask;
@@ -1253,7 +1255,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
/* Set default affinity mask once everything is setup */
- setup_affinity(irq, desc, mask);
+ setup_affinity(desc, mask);
} else if (new->flags & IRQF_TRIGGER_MASK) {
unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
@@ -1280,7 +1282,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1650,7 +1652,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
if (type != IRQ_TYPE_NONE) {
int ret;
- ret = __irq_set_trigger(desc, irq, type);
+ ret = __irq_set_trigger(desc, type);
if (ret) {
WARN(1, "failed to set type for IRQ%d\n", irq);
@@ -1875,6 +1877,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
irq_put_desc_busunlock(desc, flags);
return err;
}
+EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
/**
* irq_set_irqchip_state - set the state of a forwarded interrupt.
@@ -1920,3 +1923,4 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
irq_put_desc_busunlock(desc, flags);
return err;
}
+EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7bf1f1bbb7fa..7e6512b9dc1f 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,6 +18,23 @@
/* Temporary solution for building, will be removed later */
#include <linux/pci.h>
+struct msi_desc *alloc_msi_entry(struct device *dev)
+{
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+ if (!desc)
+ return NULL;
+
+ INIT_LIST_HEAD(&desc->list);
+ desc->dev = dev;
+
+ return desc;
+}
+
+void free_msi_entry(struct msi_desc *entry)
+{
+ kfree(entry);
+}
+
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
*msg = entry->msg;
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d22786a6dbde..21c62617a35a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -68,7 +68,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
desc->cond_suspend_depth--;
}
-static bool suspend_device_irq(struct irq_desc *desc, int irq)
+static bool suspend_device_irq(struct irq_desc *desc)
{
if (!desc->action || desc->no_suspend_depth)
return false;
@@ -85,7 +85,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq)
}
desc->istate |= IRQS_SUSPENDED;
- __disable_irq(desc, irq);
+ __disable_irq(desc);
/*
* Hardware which has no wakeup source configuration facility
@@ -126,7 +126,7 @@ void suspend_device_irqs(void)
if (irq_settings_is_nested_thread(desc))
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- sync = suspend_device_irq(desc, irq);
+ sync = suspend_device_irq(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (sync)
@@ -135,7 +135,7 @@ void suspend_device_irqs(void)
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
-static void resume_irq(struct irq_desc *desc, int irq)
+static void resume_irq(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
@@ -150,7 +150,7 @@ static void resume_irq(struct irq_desc *desc, int irq)
desc->depth++;
resume:
desc->istate &= ~IRQS_SUSPENDED;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
}
static void resume_irqs(bool want_early)
@@ -169,7 +169,7 @@ static void resume_irqs(bool want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- resume_irq(desc, irq);
+ resume_irq(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 7a5237a1bce5..dd95f44f99b2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
*
* Is called with interrupts disabled and desc->lock held.
*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq)
+void check_irq_resend(struct irq_desc *desc)
{
/*
* We do not resend level type interrupts. Level type
@@ -74,6 +74,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
if (!desc->irq_data.chip->irq_retrigger ||
!desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
+ unsigned int irq = irq_desc_get_irq(desc);
+
/*
* If the interrupt is running in the thread
* context of the parent irq we need to be
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index e2514b0e439e..32144175458d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc)
/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(int irq, struct irq_desc *desc, bool force)
+static int try_one_irq(struct irq_desc *desc, bool force)
{
irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
@@ -133,7 +133,7 @@ static int misrouted_irq(int irq)
if (i == irq) /* Already tried */
continue;
- if (try_one_irq(i, desc, false))
+ if (try_one_irq(desc, false))
ok = 1;
}
out:
@@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy)
continue;
local_irq_disable();
- try_one_irq(i, desc, true);
+ try_one_irq(desc, true);
local_irq_enable();
}
out:
@@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret)
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
*/
-static void
-__report_bad_irq(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
+ unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
unsigned long flags;
@@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
-static void
-report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
static int count = 100;
if (count > 0) {
count--;
- __report_bad_irq(irq, desc, action_ret);
+ __report_bad_irq(desc, action_ret);
}
}
@@ -272,15 +270,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
#define SPURIOUS_DEFERRED 0x80000000
-void note_interrupt(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
{
+ unsigned int irq;
+
if (desc->istate & IRQS_POLL_INPROGRESS ||
irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
- report_bad_irq(irq, desc, action_ret);
+ report_bad_irq(desc, action_ret);
return;
}
@@ -398,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
desc->last_unhandled = jiffies;
}
+ irq = irq_desc_get_irq(desc);
if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
@@ -413,7 +413,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
/*
* The interrupt is stuck
*/
- __report_bad_irq(irq, desc, action_ret);
+ __report_bad_irq(desc, action_ret);
/*
* Now kill the IRQ
*/
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c90e417bb963..d10ab6b9b5e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
addr < (unsigned long)__kprobes_text_end;
}
-static bool within_kprobe_blacklist(unsigned long addr)
+bool within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c448fe..490924cc9e7c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -97,6 +97,7 @@ bool kthread_should_park(void)
{
return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
}
+EXPORT_SYMBOL_GPL(kthread_should_park);
/**
* kthread_freezable_should_stop - should this freezable kthread return now?
@@ -171,6 +172,7 @@ void kthread_parkme(void)
{
__kthread_parkme(to_kthread(current));
}
+EXPORT_SYMBOL_GPL(kthread_parkme);
static int kthread(void *_create)
{
@@ -325,16 +327,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
+ unsigned long flags;
+
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+
/* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+ __kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+ __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}
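kthread_bind_mask() mirrors kthread_bind() but takes a full cpumask, and like kthread_bind() it must run before the thread is first woken. A hedged usage sketch (start_pinned_worker is an illustrative helper, not a kernel API):

static struct task_struct *start_pinned_worker(int (*fn)(void *), void *data,
					       int node)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(fn, data, node, "pinned-worker/%d", node);
	if (IS_ERR(tsk))
		return tsk;

	/* Restrict the new thread to one node's CPUs before its first wakeup. */
	kthread_bind_mask(tsk, cpumask_of_node(node));
	wake_up_process(tsk);
	return tsk;
}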
/**
@@ -411,6 +427,7 @@ void kthread_unpark(struct task_struct *k)
if (kthread)
__kthread_unpark(k, kthread);
}
+EXPORT_SYMBOL_GPL(kthread_unpark);
/**
* kthread_park - park a thread created by kthread_create().
@@ -441,6 +458,7 @@ int kthread_park(struct task_struct *k)
}
return ret;
}
+EXPORT_SYMBOL_GPL(kthread_park);
/**
* kthread_stop - stop a thread created by kthread_create().
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c40ebcca0495..6e5344112419 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -348,8 +348,10 @@ static void klp_disable_func(struct klp_func *func)
{
struct klp_ops *ops;
- WARN_ON(func->state != KLP_ENABLED);
- WARN_ON(!func->old_addr);
+ if (WARN_ON(func->state != KLP_ENABLED))
+ return;
+ if (WARN_ON(!func->old_addr))
+ return;
ops = klp_find_ops(func->old_addr);
if (WARN_ON(!ops))
diff --git a/kernel/module.c b/kernel/module.c
index 4d2b82e610e2..b86b7bf1be38 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -602,13 +602,16 @@ const struct kernel_symbol *find_symbol(const char *name,
}
EXPORT_SYMBOL_GPL(find_symbol);
-/* Search for module by name: must hold module_mutex. */
+/*
+ * Search for module by name: must hold module_mutex (or preempt disabled
+ * for read-only access).
+ */
static struct module *find_module_all(const char *name, size_t len,
bool even_unformed)
{
struct module *mod;
- module_assert_mutex();
+ module_assert_mutex_or_preempt();
list_for_each_entry(mod, &modules, list) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
@@ -621,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len,
struct module *find_module(const char *name)
{
+ module_assert_mutex();
return find_module_all(name, strlen(name), false);
}
EXPORT_SYMBOL_GPL(find_module);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 980e4330fb59..fd2c9acbcc19 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -544,7 +544,7 @@ int notrace notify_die(enum die_val val, const char *str,
.signr = sig,
};
- rcu_lockdep_assert(rcu_is_watching(),
+ RCU_LOCKDEP_WARN(!rcu_is_watching(),
"notify_die called but RCU thinks we're quiescent");
return atomic_notifier_call_chain(&die_chain, val, &args);
}
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5b7baf..ca368793808e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task);
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
- rcu_lockdep_assert(rcu_read_lock_held(),
- "find_task_by_pid_ns() needs rcu_read_lock()"
- " protection");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+ "find_task_by_pid_ns() needs rcu_read_lock() protection");
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9e302315e33d..02e8dfaa1ce2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,16 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config SUSPEND_SKIP_SYNC
+ bool "Skip kernel's sys_sync() on suspend to RAM/standby"
+ depends on SUSPEND
+ depends on EXPERT
+ help
+ Skip the kernel sys_sync() before freezing user processes.
+ Some systems prefer not to pay this cost on every invocation
+ of suspend, or they are content with invoking sync() from
+ user-space before invoking suspend. Say Y if that's your case.
+
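With SUSPEND_SKIP_SYNC=y the sync() moves entirely into user space; a hedged sketch of what the suspend path then looks like from there (the sysfs path is the standard one, error handling trimmed):

#include <fcntl.h>
#include <unistd.h>

static int suspend_to_ram(void)
{
	int fd;

	sync();		/* the kernel no longer syncs on our behalf */

	fd = open("/sys/power/state", O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, "mem", 3) != 3) {
		close(fd);
		return -1;
	}
	return close(fd);
}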
config HIBERNATE_CALLBACKS
bool
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 53266b729fd9..7e4cda4a8dd9 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -484,11 +484,13 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_FREEZE)
freeze_begin();
+#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+#endif
pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
error = suspend_prepare(state);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2f30ca91e4fa..b2066fb5b10f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb)
hb->error = 0;
}
-static void hib_end_io(struct bio *bio, int error)
+static void hib_end_io(struct bio *bio)
{
struct hib_bio_batch *hb = bio->bi_private;
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
- if (!uptodate || error) {
+ if (bio->bi_error) {
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
(unsigned long long)bio->bi_iter.bi_sector);
-
- if (!error)
- error = -EIO;
}
if (bio_data_dir(bio) == WRITE)
put_page(page);
- if (error && !hb->error)
- hb->error = error;
+ if (bio->bi_error && !hb->error)
+ hb->error = bio->bi_error;
if (atomic_dec_and_test(&hb->count))
wake_up(&hb->wait);
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 019069c84ff6..1896386e16bb 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -17,6 +17,7 @@
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "power.h"
@@ -83,7 +84,9 @@ static inline void decrement_wakelocks_number(void) {}
#define WL_GC_COUNT_MAX 100
#define WL_GC_TIME_SEC 300
+static void __wakelocks_gc(struct work_struct *work);
static LIST_HEAD(wakelocks_lru_list);
+static DECLARE_WORK(wakelock_work, __wakelocks_gc);
static unsigned int wakelocks_gc_count;
static inline void wakelocks_lru_add(struct wakelock *wl)
@@ -96,13 +99,12 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl)
list_move(&wl->lru, &wakelocks_lru_list);
}
-static void wakelocks_gc(void)
+static void __wakelocks_gc(struct work_struct *work)
{
struct wakelock *wl, *aux;
ktime_t now;
- if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
- return;
+ mutex_lock(&wakelocks_lock);
now = ktime_get();
list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
@@ -127,6 +129,16 @@ static void wakelocks_gc(void)
}
}
wakelocks_gc_count = 0;
+
+ mutex_unlock(&wakelocks_lock);
+}
+
+static void wakelocks_gc(void)
+{
+ if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
+ return;
+
+ schedule_work(&wakelock_work);
}
#else /* !CONFIG_PM_WAKELOCKS_GC */
static inline void wakelocks_lru_add(struct wakelock *wl) {}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59e32684c23b..77192953dee5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = {
.deferred_free = rcu_sched_torture_deferred_free,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
+ .get_state = get_state_synchronize_sched,
+ .cond_sync = cond_synchronize_sched,
.call = call_rcu_sched,
.cb_barrier = rcu_barrier_sched,
.fqs = rcu_sched_force_quiescent_state,
@@ -684,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = {
#define RCUTORTURE_TASKS_OPS &tasks_ops,
+static bool __maybe_unused torturing_tasks(void)
+{
+ return cur_ops == &tasks_ops;
+}
+
#else /* #ifdef CONFIG_TASKS_RCU */
#define RCUTORTURE_TASKS_OPS
+static bool torturing_tasks(void)
+{
+ return false;
+}
+
#endif /* #else #ifdef CONFIG_TASKS_RCU */
/*
@@ -823,9 +835,7 @@ rcu_torture_cbflood(void *arg)
}
if (err) {
VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
- while (!torture_must_stop())
- schedule_timeout_interruptible(HZ);
- return 0;
+ goto wait_for_stop;
}
VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
do {
@@ -844,6 +854,7 @@ rcu_torture_cbflood(void *arg)
stutter_wait("rcu_torture_cbflood");
} while (!torture_must_stop());
vfree(rhp);
+wait_for_stop:
torture_kthread_stopping("rcu_torture_cbflood");
return 0;
}
@@ -1088,7 +1099,8 @@ static void rcu_torture_timer(unsigned long unused)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp));
+ srcu_read_lock_held(srcu_ctlp) ||
+ torturing_tasks());
if (p == NULL) {
/* Leave because rcu_torture_writer is not yet underway */
cur_ops->readunlock(idx);
@@ -1162,7 +1174,8 @@ rcu_torture_reader(void *arg)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp));
+ srcu_read_lock_held(srcu_ctlp) ||
+ torturing_tasks());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
cur_ops->readunlock(idx);
@@ -1507,7 +1520,7 @@ static int rcu_torture_barrier_init(void)
int i;
int ret;
- if (n_barrier_cbs == 0)
+ if (n_barrier_cbs <= 0)
return 0;
if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
pr_alert("%s" TORTURE_FLAG
@@ -1786,12 +1799,15 @@ rcu_torture_init(void)
writer_task);
if (firsterr)
goto unwind;
- fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
- GFP_KERNEL);
- if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ if (nfakewriters > 0) {
+ fakewriter_tasks = kzalloc(nfakewriters *
+ sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
for (i = 0; i < nfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
@@ -1818,7 +1834,7 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
}
- if (test_no_idle_hz) {
+ if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
if (firsterr)
goto unwind;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index fb33d35ee0b7..d3fcb2ec8536 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
}
/**
- * srcu_readers_active - returns approximate number of readers.
+ * srcu_readers_active - returns true if there are readers, and false
+ * otherwise.
* @sp: which srcu_struct to count active readers (holding srcu_read_lock).
*
* Note that this is not an atomic primitive, and can therefore suffer
* severe errors when invoked on an active srcu_struct. That said, it
* can be useful as an error check at cleanup time.
*/
-static int srcu_readers_active(struct srcu_struct *sp)
+static bool srcu_readers_active(struct srcu_struct *sp)
{
int cpu;
unsigned long sum = 0;
@@ -414,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
struct rcu_head *head = &rcu.head;
bool done = false;
- rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
- !lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
might_sleep();
init_completion(&rcu.completion);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c291bd65d2cb..d0471056d0af 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
void synchronize_sched(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU read-side critical section");
cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 65137bc28b2b..9f75f25cc5d9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree");
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
/*
* In order to export the rcu_state name to the tracing tools, it
@@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444);
static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
-static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
- NUM_RCU_LVL_0,
- NUM_RCU_LVL_1,
- NUM_RCU_LVL_2,
- NUM_RCU_LVL_3,
- NUM_RCU_LVL_4,
-};
+/* Number of rcu_nodes at specified level. */
+static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/*
@@ -649,12 +646,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
* It is illegal to enter an extended quiescent state while
* in an RCU read-side critical section.
*/
- rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
- "Illegal idle entry in RCU read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
- "Illegal idle entry in RCU-bh read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
- "Illegal idle entry in RCU-sched read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+ "Illegal idle entry in RCU read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
+ "Illegal idle entry in RCU-bh read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
+ "Illegal idle entry in RCU-sched read-side critical section.");
}
/*
@@ -701,7 +698,7 @@ void rcu_idle_enter(void)
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
/**
* rcu_user_enter - inform RCU that we are resuming userspace.
*
@@ -714,7 +711,7 @@ void rcu_user_enter(void)
{
rcu_eqs_enter(1);
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -828,7 +825,7 @@ void rcu_idle_exit(void)
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
/**
* rcu_user_exit - inform RCU that we are exiting userspace.
*
@@ -839,7 +836,7 @@ void rcu_user_exit(void)
{
rcu_eqs_exit(1);
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -978,9 +975,9 @@ bool notrace rcu_is_watching(void)
{
bool ret;
- preempt_disable();
+ preempt_disable_notrace();
ret = __rcu_is_watching();
- preempt_enable();
+ preempt_enable_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(rcu_is_watching);
@@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
rsp->name, j - gpa,
- rsp->gpnum, rsp->completed, rsp->gp_flags);
+ rsp->gpnum, rsp->completed,
+ rsp->gp_flags, rsp->gp_state,
+ rsp->gp_kthread ? rsp->gp_kthread->state : 0);
}
/*
@@ -1906,6 +1905,26 @@ static int rcu_gp_init(struct rcu_state *rsp)
}
/*
+ * Helper function for wait_event_interruptible_timeout() wakeup
+ * at force-quiescent-state time.
+ */
+static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Someone like call_rcu() requested a force-quiescent-state scan. */
+ *gfp = READ_ONCE(rsp->gp_flags);
+ if (*gfp & RCU_GP_FLAG_FQS)
+ return true;
+
+ /* The current grace period has completed. */
+ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+ return true;
+
+ return false;
+}
+
+/*
* Do one round of quiescent-state forcing.
*/
static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
@@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
wait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
+ rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
@@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = READ_ONCE(rsp->gp_flags)) &
- RCU_GP_FLAG_FQS) ||
- (!READ_ONCE(rnp->qsmask) &&
- !rcu_preempt_blocked_readers_cgp(rnp)),
- j);
+ rcu_gp_fqs_check_wake(rsp, &gf), j);
+ rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
if (!READ_ONCE(rnp->qsmask) &&
@@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/* Handle grace-period end. */
+ rsp->gp_state = RCU_GP_CLEANUP;
rcu_gp_cleanup(rsp);
+ rsp->gp_state = RCU_GP_CLEANED;
}
}
@@ -3161,10 +3180,10 @@ static inline int rcu_blocking_is_gp(void)
*/
void synchronize_sched(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU-sched read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
@@ -3188,10 +3207,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
*/
void synchronize_rcu_bh(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
@@ -3253,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
-static int synchronize_sched_expedited_cpu_stop(void *data)
+/**
+ * get_state_synchronize_sched - Snapshot current RCU-sched state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_sched()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_sched(void)
{
/*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
+ * Any prior manipulation of RCU-protected data must happen
+ * before the load from ->gpnum.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Make sure this load happens before the purportedly
+ * time-consuming work between get_state_synchronize_sched()
+ * and cond_synchronize_sched().
+ */
+ return smp_load_acquire(&rcu_sched_state.gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
+
+/**
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_sched()
+ *
+ * If a full RCU-sched grace period has elapsed since the earlier call to
+ * get_state_synchronize_sched(), just return. Otherwise, invoke
+ * synchronize_sched() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account. But
+ * counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_sched(unsigned long oldstate)
+{
+ unsigned long newstate;
+
+ /*
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
*/
- smp_mb(); /* See above comment block. */
+ newstate = smp_load_acquire(&rcu_sched_state.completed);
+ if (ULONG_CMP_GE(oldstate, newstate))
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_sched);
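Together the two exports let an updater snapshot the grace-period state, do other (ideally slow) work, and block only if no full RCU-sched grace period happened to elapse in the meantime. A hedged sketch (my_obj and do_expensive_teardown are made-up names):

static void free_after_sched_gp(struct my_obj *obj)
{
	unsigned long cookie = get_state_synchronize_sched();

	do_expensive_teardown(obj);		/* hopefully spans a grace period */

	/* Sleeps only if no full grace period has elapsed since the snapshot. */
	cond_synchronize_sched(cookie);
	kfree(obj);
}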
+
+/* Adjust sequence number for start of update-side operation. */
+static void rcu_seq_start(unsigned long *sp)
+{
+ WRITE_ONCE(*sp, *sp + 1);
+ smp_mb(); /* Ensure update-side operation after counter increment. */
+ WARN_ON_ONCE(!(*sp & 0x1));
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static void rcu_seq_end(unsigned long *sp)
+{
+ smp_mb(); /* Ensure update-side operation before counter increment. */
+ WRITE_ONCE(*sp, *sp + 1);
+ WARN_ON_ONCE(*sp & 0x1);
+}
+
+/* Take a snapshot of the update side's sequence number. */
+static unsigned long rcu_seq_snap(unsigned long *sp)
+{
+ unsigned long s;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = (READ_ONCE(*sp) + 3) & ~0x1;
+ smp_mb(); /* Above access must not bleed into critical section. */
+ return s;
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
+
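The three helpers form a small sequence-count protocol: an odd value means an update is in flight, rcu_seq_snap() rounds up to the value that guarantees a full start/end pass after the snapshot, and rcu_seq_done() is the wrap-safe completion test. A hedged sketch of the intended use, in the spirit of the expedited path below (exp_seq, exp_mutex and do_one_expedited_pass are made-up names):

static unsigned long exp_seq;
static DEFINE_MUTEX(exp_mutex);

static void expedited_or_piggyback(void)
{
	unsigned long s = rcu_seq_snap(&exp_seq);

	mutex_lock(&exp_mutex);
	if (!rcu_seq_done(&exp_seq, s)) {	/* nobody did our work meanwhile */
		rcu_seq_start(&exp_seq);	/* odd: pass in flight */
		do_one_expedited_pass();
		rcu_seq_end(&exp_seq);		/* even: pass complete */
	}
	mutex_unlock(&exp_mutex);
}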
+/* Wrapper functions for expedited grace periods. */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+ rcu_seq_start(&rsp->expedited_sequence);
+}
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+ rcu_seq_end(&rsp->expedited_sequence);
+ smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+ return rcu_seq_snap(&rsp->expedited_sequence);
+}
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+ return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
+static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp,
+ atomic_long_t *stat, unsigned long s)
+{
+ if (rcu_exp_gp_seq_done(rsp, s)) {
+ if (rnp)
+ mutex_unlock(&rnp->exp_funnel_mutex);
+ else if (rdp)
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ /* Ensure test happens before caller kfree(). */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(stat);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods. Returns a
+ * pointer to the root rcu_node structure, or NULL if some other
+ * task did the expedited grace period for us.
+ */
+static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp0;
+ struct rcu_node *rnp1 = NULL;
+
+ /*
+ * First try directly acquiring the root lock in order to reduce
+ * latency in the common case where expedited grace periods are
+ * rare. We check mutex_is_locked() to avoid pathological levels of
+ * memory contention on ->exp_funnel_mutex in the heavy-load case.
+ */
+ rnp0 = rcu_get_root(rsp);
+ if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
+ if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
+ if (sync_exp_work_done(rsp, rnp0, NULL,
+ &rsp->expedited_workdone0, s))
+ return NULL;
+ return rnp0;
+ }
+ }
+
+ /*
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work, or otherwise falling through while holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
+ */
+ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+ return NULL;
+ mutex_lock(&rdp->exp_funnel_mutex);
+ rnp0 = rdp->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone2, s))
+ return NULL;
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ else
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone3, s))
+ return NULL;
+ return rnp1;
+}
+
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ struct rcu_data *rdp = data;
+ struct rcu_state *rsp = rdp->rsp;
+
+ /* We are here: If we are last, do the wakeup. */
+ rdp->exp_done = true;
+ if (atomic_dec_and_test(&rsp->expedited_need_qs))
+ wake_up(&rsp->expedited_wq);
return 0;
}
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long jiffies_stall;
+ unsigned long jiffies_start;
+ struct rcu_data *rdp;
+ int ret;
+
+ jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_start = jiffies;
+
+ for (;;) {
+ ret = wait_event_interruptible_timeout(
+ rsp->expedited_wq,
+ !atomic_read(&rsp->expedited_need_qs),
+ jiffies_stall);
+ if (ret > 0)
+ return;
+ if (ret < 0) {
+ /* Hit a signal, disable CPU stall warnings. */
+ wait_event(rsp->expedited_wq,
+ !atomic_read(&rsp->expedited_need_qs));
+ return;
+ }
+ pr_err("INFO: %s detected expedited stalls on CPUs: {",
+ rsp->name);
+ for_each_online_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (rdp->exp_done)
+ continue;
+ pr_cont(" %d", cpu);
+ }
+ pr_cont(" } %lu jiffies s: %lu\n",
+ jiffies - jiffies_start, rsp->expedited_sequence);
+ for_each_online_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (rdp->exp_done)
+ continue;
+ dump_cpu_task(cpu);
+ }
+ jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ }
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3281,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ unsigned long s;
+ struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap(rsp);
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,100 +3547,38 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ rnp = exp_funnel_lock(rsp, s);
+ if (rnp == NULL) {
+ put_online_cpus();
+ return; /* Someone else did our work for us. */
}
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ rcu_exp_gp_seq_start(rsp);
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ init_waitqueue_head(&rsp->expedited_wq);
+ atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
+ for_each_online_cpu(cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ rdp->exp_done = false;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Skip our CPU and any idle CPUs. */
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+ atomic_inc(&rsp->expedited_need_qs);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+ rdp, &rdp->exp_stop_work);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
+ /* Remove extra count and, if necessary, wait for CPUs to stop. */
+ if (!atomic_dec_and_test(&rsp->expedited_need_qs))
+ synchronize_sched_expedited_wait(rsp);
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp->exp_funnel_mutex);
put_online_cpus();
}
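
The rcu_exp_gp_seq_snap()/rcu_exp_gp_seq_start()/rcu_exp_gp_seq_end() helpers wrap an even/odd sequence counter whose semantics can be modelled in a few lines of stand-alone C. The names below deliberately mirror the kernel's rcu_seq_*() helpers, but this is an illustration rather than the in-tree code; the same counter scheme backs ->barrier_sequence in the _rcu_barrier() changes further down.

#include <stdbool.h>

static unsigned long seq;	/* even: idle, odd: operation in progress */

static void seq_start(void) { seq++; }	/* now odd */
static void seq_end(void)   { seq++; }	/* now even */

/* First counter value that guarantees a full operation after this call:
 * round the counter up to an even value and add two. */
static unsigned long seq_snap(void)
{
	return (seq + 3) & ~0x1UL;
}

/* Has a full operation completed since the snapshot was taken? */
static bool seq_done(unsigned long snap)
{
	return (long)(seq - snap) >= 0;	/* wrap-safe, like ULONG_CMP_GE() */
}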
@@ -3571,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
}
}
@@ -3586,7 +3730,7 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
atomic_inc(&rsp->barrier_cpu_count);
rsp->call(&rdp->barrier_head, rcu_barrier_callback);
}
@@ -3599,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
struct rcu_data *rdp;
- unsigned long snap = READ_ONCE(rsp->n_barrier_done);
- unsigned long snap_done;
+ unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, snap);
+ _rcu_barrier_trace(rsp, "Begin", -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
- /*
- * Ensure that all prior references, including to ->n_barrier_done,
- * are ordered before the _rcu_barrier() machinery.
- */
- smp_mb(); /* See above block comment. */
-
- /*
- * Recheck ->n_barrier_done to see if others did our work for us.
- * This means checking ->n_barrier_done for an even-to-odd-to-even
- * transition. The "if" expression below therefore rounds the old
- * value up to the next even number and adds two before comparing.
- */
- snap_done = rsp->n_barrier_done;
- _rcu_barrier_trace(rsp, "Check", -1, snap_done);
-
- /*
- * If the value in snap is odd, we needed to wait for the current
- * rcu_barrier() to complete, then wait for the next one, in other
- * words, we need the value of snap_done to be three larger than
- * the value of snap. On the other hand, if the value in snap is
- * even, we only had to wait for the next rcu_barrier() to complete,
- * in other words, we need the value of snap_done to be only two
- * greater than the value of snap. The "(snap + 3) & ~0x1" computes
- * this for us (thank you, Linus!).
- */
- if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ /* Did someone else do our work for us? */
+ if (rcu_seq_done(&rsp->barrier_sequence, s)) {
+ _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
}
- /*
- * Increment ->n_barrier_done to avoid duplicate work. Use
- * WRITE_ONCE() to prevent the compiler from speculating
- * the increment to precede the early-exit check.
- */
- WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
- smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
+ /* Mark the start of the barrier operation. */
+ rcu_seq_start(&rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3671,10 +3784,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
_rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
} else {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
__call_rcu(&rdp->barrier_head,
@@ -3682,11 +3795,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
}
} else if (READ_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
}
}
put_online_cpus();
@@ -3698,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp)
if (atomic_dec_and_test(&rsp->barrier_cpu_count))
complete(&rsp->barrier_completion);
- /* Increment ->n_barrier_done to prevent duplicate work. */
- smp_mb(); /* Keep increment after above mechanism. */
- WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
- smp_mb(); /* Keep increment before caller's subsequent code. */
-
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
wait_for_completion(&rsp->barrier_completion);
+ /* Mark the end of the barrier operation. */
+ _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ rcu_seq_end(&rsp->barrier_sequence);
+
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rsp->barrier_mutex);
}
@@ -3770,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
rdp->cpu = cpu;
rdp->rsp = rsp;
+ mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -3961,22 +4072,22 @@ void rcu_scheduler_starting(void)
* Compute the per-level fanout, either using the exact fanout specified
* or balancing the tree, depending on the rcu_fanout_exact boot parameter.
*/
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
+static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
{
int i;
if (rcu_fanout_exact) {
- rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = RCU_FANOUT;
+ levelspread[i] = RCU_FANOUT;
} else {
int ccur;
int cprv;
cprv = nr_cpu_ids;
for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = rsp->levelcnt[i];
- rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+ ccur = levelcnt[i];
+ levelspread[i] = (cprv + ccur - 1) / ccur;
cprv = ccur;
}
}
@@ -3988,23 +4099,20 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static const char * const buf[] = {
- "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static const char * const fqs[] = {
- "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = RCU_NODE_NAME_INIT;
+ static const char * const fqs[] = RCU_FQS_NAME_INIT;
+ static const char * const exp[] = RCU_EXP_NAME_INIT;
+ static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
static u8 fl_mask = 0x1;
+
+ int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
+ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
int cpustride = 1;
int i;
int j;
struct rcu_node *rnp;
- BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+ BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
/* Silence gcc 4.8 false positive about array index out of range. */
if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
@@ -4013,19 +4121,19 @@ static void __init rcu_init_one(struct rcu_state *rsp,
/* Initialize the level-tracking arrays. */
for (i = 0; i < rcu_num_lvls; i++)
- rsp->levelcnt[i] = num_rcu_lvl[i];
+ levelcnt[i] = num_rcu_lvl[i];
for (i = 1; i < rcu_num_lvls; i++)
- rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
- rcu_init_levelspread(rsp);
+ rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
+ rcu_init_levelspread(levelspread, levelcnt);
rsp->flavor_mask = fl_mask;
fl_mask <<= 1;
/* Initialize the elements themselves, starting from the leaves. */
for (i = rcu_num_lvls - 1; i >= 0; i--) {
- cpustride *= rsp->levelspread[i];
+ cpustride *= levelspread[i];
rnp = rsp->level[i];
- for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+ for (j = 0; j < levelcnt[i]; j++, rnp++) {
raw_spin_lock_init(&rnp->lock);
lockdep_set_class_and_name(&rnp->lock,
&rcu_node_class[i], buf[i]);
@@ -4045,14 +4153,23 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->grpmask = 0;
rnp->parent = NULL;
} else {
- rnp->grpnum = j % rsp->levelspread[i - 1];
+ rnp->grpnum = j % levelspread[i - 1];
rnp->grpmask = 1UL << rnp->grpnum;
rnp->parent = rsp->level[i - 1] +
- j / rsp->levelspread[i - 1];
+ j / levelspread[i - 1];
}
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
+ mutex_init(&rnp->exp_funnel_mutex);
+ if (rsp == &rcu_sched_state)
+ lockdep_set_class_and_name(
+ &rnp->exp_funnel_mutex,
+ &rcu_exp_sched_class[i], exp_sched[i]);
+ else
+ lockdep_set_class_and_name(
+ &rnp->exp_funnel_mutex,
+ &rcu_exp_class[i], exp[i]);
}
}
@@ -4076,9 +4193,7 @@ static void __init rcu_init_geometry(void)
{
ulong d;
int i;
- int j;
- int n = nr_cpu_ids;
- int rcu_capacity[MAX_RCU_LVLS + 1];
+ int rcu_capacity[RCU_NUM_LVLS];
/*
* Initialize any unspecified boot parameters.
@@ -4101,47 +4216,49 @@ static void __init rcu_init_geometry(void)
rcu_fanout_leaf, nr_cpu_ids);
/*
- * Compute number of nodes that can be handled an rcu_node tree
- * with the given number of levels. Setting rcu_capacity[0] makes
- * some of the arithmetic easier.
- */
- rcu_capacity[0] = 1;
- rcu_capacity[1] = rcu_fanout_leaf;
- for (i = 2; i <= MAX_RCU_LVLS; i++)
- rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
-
- /*
* The boot-time rcu_fanout_leaf parameter is only permitted
* to increase the leaf-level fanout, not decrease it. Of course,
* the leaf-level fanout cannot exceed the number of bits in
- * the rcu_node masks. Finally, the tree must be able to accommodate
- * the configured number of CPUs. Complain and fall back to the
- * compile-time values if these limits are exceeded.
+ * the rcu_node masks. Complain and fall back to the compile-
+ * time values if these limits are exceeded.
*/
if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
- rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
- n > rcu_capacity[MAX_RCU_LVLS]) {
+ rcu_fanout_leaf > sizeof(unsigned long) * 8) {
+ rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1);
return;
}
+ /*
+ * Compute number of nodes that can be handled by an rcu_node tree
+ * with the given number of levels.
+ */
+ rcu_capacity[0] = rcu_fanout_leaf;
+ for (i = 1; i < RCU_NUM_LVLS; i++)
+ rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
+
+ /*
+ * The tree must be able to accommodate the configured number of CPUs.
+ * If this limit is exceeded then we have a serious problem elsewhere.
+ */
+ if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
+ panic("rcu_init_geometry: rcu_capacity[] is too small");
+
+ /* Calculate the number of levels in the tree. */
+ for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
+ }
+ rcu_num_lvls = i + 1;
+
/* Calculate the number of rcu_nodes at each level of the tree. */
- for (i = 1; i <= MAX_RCU_LVLS; i++)
- if (n <= rcu_capacity[i]) {
- for (j = 0; j <= i; j++)
- num_rcu_lvl[j] =
- DIV_ROUND_UP(n, rcu_capacity[i - j]);
- rcu_num_lvls = i;
- for (j = i + 1; j <= MAX_RCU_LVLS; j++)
- num_rcu_lvl[j] = 0;
- break;
- }
+ for (i = 0; i < rcu_num_lvls; i++) {
+ int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
+ num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
+ }
/* Calculate the total number of rcu_node structures. */
rcu_num_nodes = 0;
- for (i = 0; i <= MAX_RCU_LVLS; i++)
+ for (i = 0; i < rcu_num_lvls; i++)
rcu_num_nodes += num_rcu_lvl[i];
- rcu_num_nodes -= n;
}
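
The new geometry computation is easy to check with concrete numbers. The stand-alone program below assumes a 64-way fanout, a 16-way leaf fanout and 4096 possible CPUs (illustrative values only, not a claim about any particular configuration) and reproduces the per-level node counts the loop above would generate: 1, 4 and 256 nodes, 261 rcu_node structures in total.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	const int fanout = 64, fanout_leaf = 16, nr_cpu_ids = 4096;
	const int max_lvls = 4;
	long cap[4];
	int lvls, i, total = 0;

	cap[0] = fanout_leaf;			/* CPUs one leaf can cover */
	for (i = 1; i < max_lvls; i++)
		cap[i] = cap[i - 1] * fanout;	/* CPUs a level-i subtree covers */

	for (i = 0; nr_cpu_ids > cap[i]; i++)
		;
	lvls = i + 1;				/* 3 levels for this example */

	for (i = 0; i < lvls; i++) {
		long n = DIV_ROUND_UP(nr_cpu_ids, cap[(lvls - 1) - i]);

		printf("level %d: %ld node(s)\n", i, n);	/* 1, 4, 256 */
		total += n;
	}
	printf("total rcu_node structures: %d\n", total);	/* 261 */
	return 0;
}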
/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..2e991f8361e4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/stop_machine.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -36,8 +37,6 @@
* Of course, your mileage may vary.
*/
-#define MAX_RCU_LVLS 4
-
#ifdef CONFIG_RCU_FANOUT
#define RCU_FANOUT CONFIG_RCU_FANOUT
#else /* #ifdef CONFIG_RCU_FANOUT */
@@ -66,38 +65,53 @@
#if NR_CPUS <= RCU_FANOUT_1
# define RCU_NUM_LVLS 1
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 (NR_CPUS)
-# define NUM_RCU_LVL_2 0
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES NUM_RCU_LVL_0
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_2 (NR_CPUS)
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_3 (NR_CPUS)
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_4 (NR_CPUS)
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
+# define RCU_EXP_SCHED_NAME_INIT \
+ { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
-#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
-#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
-
extern int rcu_num_lvls;
extern int rcu_num_nodes;
@@ -236,6 +250,8 @@ struct rcu_node {
int need_future_gp[2];
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+ struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
/*
@@ -287,12 +303,13 @@ struct rcu_data {
bool gpwrap; /* Possible gpnum/completed wrap. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned long ticks_this_gp; /* The number of scheduling-clock */
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+ struct cpu_stop_work exp_stop_work;
+ /* Expedited grace-period control */
+ /* for CPU stopping. */
/* 2) batch handling */
/*
@@ -355,11 +372,13 @@ struct rcu_data {
unsigned long n_rp_nocb_defer_wakeup;
unsigned long n_rp_need_nothing;
- /* 6) _rcu_barrier() and OOM callbacks. */
+ /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+ struct mutex exp_funnel_mutex;
+ bool exp_done; /* Expedited QS for this CPU? */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -387,9 +406,7 @@ struct rcu_data {
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned int softirq_snap; /* Snapshot of softirq activity. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
int cpu;
struct rcu_state *rsp;
@@ -442,9 +459,9 @@ do { \
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
- u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ struct rcu_node *level[RCU_NUM_LVLS + 1];
+ /* Hierarchy levels (+1 to */
+ /* shut bogus gcc warning) */
u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer to percpu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
@@ -479,21 +496,18 @@ struct rcu_state {
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
- unsigned long n_barrier_done; /* ++ at start and end of */
+ unsigned long barrier_sequence; /* ++ at start and end of */
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
+ unsigned long expedited_sequence; /* Take a ticket. */
+ atomic_long_t expedited_workdone0; /* # done by others #0. */
atomic_long_t expedited_workdone1; /* # done by others #1. */
atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_workdone3; /* # done by others #3. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ atomic_t expedited_need_qs; /* # CPUs left to check in. */
+ wait_queue_head_t expedited_wq; /* Wait for check-ins. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
@@ -527,7 +541,11 @@ struct rcu_state {
/* Values for rcu_state structure's gp_flags field. */
#define RCU_GP_WAIT_INIT 0 /* Initial state. */
#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
-#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
+#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */
+#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
+#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
extern struct list_head rcu_struct_flavors;
@@ -635,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
}
#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Place this after a lock-acquisition primitive to guarantee that
+ * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies
+ * if the UNLOCK and LOCK are executed by the same CPU or if the
+ * UNLOCK and LOCK operate on the same lock variable.
+ */
+#ifdef CONFIG_PPC
+#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
+#else /* #ifdef CONFIG_PPC */
+#define smp_mb__after_unlock_lock() do { } while (0)
+#endif /* #else #ifdef CONFIG_PPC */
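
Usage pattern for the helper defined above, as it is expected to appear at call sites: the barrier sits immediately after the lock acquisition whose ordering it strengthens. The caller below is hypothetical, not code from this patch.

static void example_rnp_update(struct rcu_node *rnp)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rnp->lock, flags);
	smp_mb__after_unlock_lock();	/* prior UNLOCK + this LOCK == full barrier */
	/* ... updates that must observe everything before the prior unlock ... */
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}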
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 013485fb2b06..b2bf3963a0ae 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -82,10 +82,8 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU lockdep checking is enabled.\n");
if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
pr_info("\tRCU torture testing starts during boot.\n");
- if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
- pr_info("\tAdditional per-CPU info printed with stalls.\n");
- if (NUM_RCU_LVL_4 != 0)
- pr_info("\tFour-level hierarchy is enabled.\n");
+ if (RCU_NUM_LVLS >= 4)
+ pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
RCU_FANOUT_LEAF);
@@ -418,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
rcu_print_detail_task_stall_rnp(rnp);
}
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
@@ -431,18 +427,6 @@ static void rcu_print_task_stall_end(void)
pr_cont("\n");
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
-}
-
-static void rcu_print_task_stall_end(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each.
@@ -538,10 +522,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
void synchronize_rcu(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
if (!rcu_scheduler_active)
return;
if (rcu_gp_is_expedited())
@@ -552,8 +536,6 @@ void synchronize_rcu(void)
EXPORT_SYMBOL_GPL(synchronize_rcu);
static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static unsigned long sync_rcu_preempt_exp_count;
-static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
/*
* Return non-zero if there are any tasks in RCU read-side critical
@@ -573,7 +555,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementation use other means.
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
@@ -589,7 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake)
@@ -628,7 +610,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
* set the ->expmask bits on the leaf rcu_node structures to tell phase 2
* that work is needed here.
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void
sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -671,7 +653,7 @@ sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
* invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
* enabling rcu_read_unlock_special() to do the bit-clearing.
*
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void
sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -719,51 +701,17 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
struct rcu_node *rnp;
+ struct rcu_node *rnp_unlock;
struct rcu_state *rsp = rcu_state_p;
- unsigned long snap;
- int trycount = 0;
+ unsigned long s;
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
- snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
- smp_mb(); /* Above access cannot bleed into critical section. */
+ s = rcu_exp_gp_seq_snap(rsp);
- /*
- * Block CPU-hotplug operations. This means that any CPU-hotplug
- * operation that finds an rcu_node structure with tasks in the
- * process of being boosted will know that all tasks blocking
- * this expedited grace period will already be in the process of
- * being boosted. This simplifies the process of moving tasks
- * from leaf to root rcu_node structures.
- */
- if (!try_get_online_cpus()) {
- /* CPU-hotplug operation in flight, fall back to normal GP. */
- wait_rcu_gp(call_rcu);
- return;
- }
+ rnp_unlock = exp_funnel_lock(rsp, s);
+ if (rnp_unlock == NULL)
+ return; /* Someone else did our work for us. */
- /*
- * Acquire lock, falling back to synchronize_rcu() if too many
- * lock-acquisition failures. Of course, if someone does the
- * expedited grace period for us, just leave.
- */
- while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
- if (ULONG_CMP_LT(snap,
- READ_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto mb_ret; /* Others did our work for us. */
- }
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- put_online_cpus();
- wait_rcu_gp(call_rcu);
- return;
- }
- }
- if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto unlock_mb_ret; /* Others did our work for us. */
- }
+ rcu_exp_gp_seq_start(rsp);
/* force all RCU readers onto ->blkd_tasks lists. */
synchronize_sched_expedited();
@@ -779,20 +727,14 @@ void synchronize_rcu_expedited(void)
rcu_for_each_leaf_node(rsp, rnp)
sync_rcu_preempt_exp_init2(rsp, rnp);
- put_online_cpus();
-
/* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp);
wait_event(sync_rcu_preempt_exp_wq,
sync_rcu_preempt_exp_done(rnp));
/* Clean up and exit. */
- smp_mb(); /* ensure expedited GP seen before counter increment. */
- WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
-unlock_mb_ret:
- mutex_unlock(&sync_rcu_preempt_exp_mutex);
-mb_ret:
- smp_mb(); /* ensure subsequent action seen after grace period. */
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp_unlock->exp_funnel_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -1061,8 +1003,7 @@ static int rcu_boost(struct rcu_node *rnp)
}
/*
- * Priority-boosting kthread. One per leaf rcu_node and one for the
- * root rcu_node.
+ * Priority-boosting kthread, one per leaf rcu_node.
*/
static int rcu_boost_kthread(void *arg)
{
@@ -1680,12 +1621,10 @@ static int rcu_oom_notify(struct notifier_block *self,
*/
atomic_set(&oom_callback_count, 1);
- get_online_cpus();
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
cond_resched_rcu_qs();
}
- put_online_cpus();
/* Unconditionally decrement: no need to wake ourselves up. */
atomic_dec(&oom_callback_count);
@@ -1706,8 +1645,6 @@ early_initcall(rcu_register_oom_notifier);
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
#ifdef CONFIG_RCU_FAST_NO_HZ
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
@@ -1796,33 +1733,6 @@ static void increment_cpu_stall_ticks(void)
raw_cpu_inc(rsp->rda->ticks_this_gp);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont(" {");
-}
-
-static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
-{
- pr_cont(" %d", cpu);
-}
-
-static void print_cpu_stall_info_end(void)
-{
- pr_cont("} ");
-}
-
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
-}
-
-static void increment_cpu_stall_ticks(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
#ifdef CONFIG_RCU_NOCB_CPU
/*
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3ea7ffc7d5c4..6fc4c5ff3bb5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v)
static int show_rcubarrier(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "bcc: %d nbd: %lu\n",
+ seq_printf(m, "bcc: %d bseq: %lu\n",
atomic_read(&rsp->barrier_cpu_count),
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
return 0;
}
@@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
- atomic_long_read(&rsp->expedited_start),
- atomic_long_read(&rsp->expedited_done),
- atomic_long_read(&rsp->expedited_wrap),
- atomic_long_read(&rsp->expedited_tryfail),
+ seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ rsp->expedited_sequence,
+ atomic_long_read(&rsp->expedited_workdone0),
atomic_long_read(&rsp->expedited_workdone1),
atomic_long_read(&rsp->expedited_workdone2),
+ atomic_long_read(&rsp->expedited_workdone3),
atomic_long_read(&rsp->expedited_normal),
- atomic_long_read(&rsp->expedited_stoppedcpus),
- atomic_long_read(&rsp->expedited_done_tries),
- atomic_long_read(&rsp->expedited_done_lost),
- atomic_long_read(&rsp->expedited_done_exit));
+ atomic_read(&rsp->expedited_need_qs),
+ rsp->expedited_sequence / 2);
return 0;
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index afaecb7a799a..7a0b3bc7c5ed 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate");
module_param(rcu_expedited, int, 0);
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
+/**
+ * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * RCU-sched read-side critical section. In the absence of
+ * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
+ * critical section unless it can prove otherwise. Note that disabling
+ * of preemption (including disabling irqs) counts as an RCU-sched
+ * read-side critical section. This is useful for debug checks in functions
+ * that require that they be called within an RCU-sched read-side
+ * critical section.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that if the CPU is in the idle loop from an RCU point of
+ * view (ie: that we are in the section between rcu_idle_enter() and
+ * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
+ * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
+ * that are in such a section, considering these as in extended quiescent
+ * state, so such a CPU is effectively never in an RCU read-side critical
+ * section regardless of what RCU primitives it invokes. This state of
+ * affairs is required --- we need to keep an RCU-free window in idle
+ * where the CPU may possibly enter into low power mode. This way we can
+ * report an extended quiescent state to other CPUs that started a grace
+ * period. Otherwise we would delay any grace period as long as we run in
+ * the idle task.
+ *
+ * Similarly, we avoid claiming an SRCU read lock held if the current
+ * CPU is offline.
+ */
+int rcu_read_lock_sched_held(void)
+{
+ int lockdep_opinion = 0;
+
+ if (!debug_lockdep_rcu_enabled())
+ return 1;
+ if (!rcu_is_watching())
+ return 0;
+ if (!rcu_lockdep_current_cpu_online())
+ return 0;
+ if (debug_locks)
+ lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
+ return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
+}
+EXPORT_SYMBOL(rcu_read_lock_sched_held);
+#endif
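
A hypothetical consumer of rcu_read_lock_sched_held(), of the kind the kernel-doc above has in mind. The names example_get_foo(), struct foo and foo_ptr are made up for illustration.

struct foo *foo_ptr;	/* updated under RCU-sched protection, hypothetical */

static struct foo *example_get_foo(void)
{
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
			 "example_get_foo() called outside RCU-sched reader");
	return rcu_dereference_sched(foo_ptr);
}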
+
#ifndef CONFIG_TINY_RCU
static atomic_t rcu_expedited_nesting =
@@ -269,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head)
rcu = container_of(head, struct rcu_synchronize, head);
complete(&rcu->completion);
}
+EXPORT_SYMBOL_GPL(wakeme_after_rcu);
-void wait_rcu_gp(call_rcu_func_t crf)
+void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+ struct rcu_synchronize *rs_array)
{
- struct rcu_synchronize rcu;
+ int i;
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- crf(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
+ /* Initialize and register callbacks for each flavor specified. */
+ for (i = 0; i < n; i++) {
+ if (checktiny &&
+ (crcu_array[i] == call_rcu ||
+ crcu_array[i] == call_rcu_bh)) {
+ might_sleep();
+ continue;
+ }
+ init_rcu_head_on_stack(&rs_array[i].head);
+ init_completion(&rs_array[i].completion);
+ (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ }
+
+ /* Wait for all callbacks to be invoked. */
+ for (i = 0; i < n; i++) {
+ if (checktiny &&
+ (crcu_array[i] == call_rcu ||
+ crcu_array[i] == call_rcu_bh))
+ continue;
+ wait_for_completion(&rs_array[i].completion);
+ destroy_rcu_head_on_stack(&rs_array[i].head);
+ }
}
-EXPORT_SYMBOL_GPL(wait_rcu_gp);
+EXPORT_SYMBOL_GPL(__wait_rcu_gp);
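
A hypothetical caller of the generalized __wait_rcu_gp(), waiting for both an RCU-sched and an RCU-bh grace period in one go; the wait_rcu_gp() and synchronize_rcu_mult() wrappers are expected to expand to essentially this.

static void example_wait_sched_and_bh(void)
{
	call_rcu_func_t crcu[] = { call_rcu_sched, call_rcu_bh };
	struct rcu_synchronize rs[ARRAY_SIZE(crcu)];

	__wait_rcu_gp(false, ARRAY_SIZE(crcu), crcu, rs);
}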
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
@@ -523,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
void synchronize_rcu_tasks(void)
{
/* Complain if the scheduler has not started. */
- rcu_lockdep_assert(!rcu_scheduler_active,
- "synchronize_rcu_tasks called too soon");
+ RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+ "synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
wait_rcu_gp(call_rcu_tasks);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 66ae8baf42fe..3595403921bd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1149,15 +1149,45 @@ static int migration_cpu_stop(void *data)
return 0;
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
cpumask_copy(&p->cpus_allowed, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct rq *rq = task_rq(p);
+ bool queued, running;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+
+ if (queued) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(&rq->lock);
+ dequeue_task(rq, p, 0);
+ }
+ if (running)
+ put_prev_task(rq, p);
+
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (queued)
+ enqueue_task(rq, p, 0);
+}
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -1167,7 +1197,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
{
unsigned long flags;
struct rq *rq;
@@ -1176,6 +1207,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
rq = task_rq_lock(p, &flags);
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
@@ -1212,6 +1252,11 @@ out:
return ret;
}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1593,6 +1638,15 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+
#endif /* CONFIG_SMP */
static void
@@ -1652,9 +1706,9 @@ static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
check_preempt_curr(rq, p, wake_flags);
- trace_sched_wakeup(p, true);
-
p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
@@ -1872,6 +1926,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (!(p->state & state))
goto out;
+ trace_sched_waking(p);
+
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
@@ -1947,6 +2003,8 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;
+ trace_sched_waking(p);
+
if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
@@ -2014,9 +2072,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
-#ifdef CONFIG_SMP
- p->se.avg.decay_count = 0;
-#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -2198,8 +2253,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
#ifdef CONFIG_SMP
inline struct dl_bw *dl_bw_of(int i)
{
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
return &cpu_rq(i)->rd->dl_bw;
}
@@ -2208,8 +2263,8 @@ static inline int dl_bw_cpus(int i)
struct root_domain *rd = cpu_rq(i)->rd;
int cpus = 0;
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
@@ -2301,11 +2356,11 @@ void wake_up_new_task(struct task_struct *p)
#endif
/* Initialize new task's runnable average */
- init_task_runnable_average(p);
+ init_entity_runnable_average(&p->se);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
- trace_sched_wakeup_new(p, true);
+ trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
@@ -2467,7 +2522,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
*/
prev_state = prev->state;
vtime_task_switch(prev);
- finish_arch_switch(prev);
perf_event_task_sched_in(prev, current);
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -2487,7 +2541,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
put_task_struct(prev);
}
- tick_nohz_task_switch(current);
+ tick_nohz_task_switch();
return rq;
}
@@ -4338,7 +4392,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = set_cpus_allowed_ptr(p, new_mask);
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -4490,7 +4544,7 @@ SYSCALL_DEFINE0(sched_yield)
int __sched _cond_resched(void)
{
- if (should_resched()) {
+ if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
@@ -4508,7 +4562,7 @@ EXPORT_SYMBOL(_cond_resched);
*/
int __cond_resched_lock(spinlock_t *lock)
{
- int resched = should_resched();
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held(lock);
@@ -4530,7 +4584,7 @@ int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (should_resched()) {
+ if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
local_bh_enable();
preempt_schedule_common();
local_bh_disable();
@@ -4863,7 +4917,8 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
@@ -4889,7 +4944,8 @@ void init_idle(struct task_struct *idle, int cpu)
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
@@ -5309,8 +5365,7 @@ static void register_sched_domain_sysctl(void)
/* may be called multiple times per register */
static void unregister_sched_domain_sysctl(void)
{
- if (sd_sysctl_header)
- unregister_sysctl_table(sd_sysctl_header);
+ unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
if (sd_ctl_dir[0].child)
sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@ -5431,6 +5486,14 @@ static int sched_cpu_active(struct notifier_block *nfb,
case CPU_STARTING:
set_cpu_rq_start_time();
return NOTIFY_OK;
+ case CPU_ONLINE:
+ /*
+ * At this point a starting CPU has marked itself as online via
+ * set_cpu_online(). But it might not yet have marked itself
+ * as active, which is essential from here on.
+ *
+ * Thus, fall-through and help the starting CPU along.
+ */
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -6443,8 +6506,10 @@ static void init_numa_topology_type(void)
n = sched_max_numa_distance;
- if (n <= 1)
+ if (sched_domains_numa_levels <= 1) {
sched_numa_topology_type = NUMA_DIRECT;
+ return;
+ }
for_each_online_node(a) {
for_each_online_node(b) {
@@ -8066,7 +8131,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
sched_move_task(task);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..8cbc3db671df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
}
/*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
*
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
- cputime_t old;
-
- while (new > (old = READ_ONCE(*counter)))
- cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depends on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ * stime + utime == rtime
+ * stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
*/
static void cputime_adjust(struct task_cputime *curr,
- struct cputime *prev,
+ struct prev_cputime *prev,
cputime_t *ut, cputime_t *st)
{
cputime_t rtime, stime, utime;
+ unsigned long flags;
- /*
- * Tick based cputime accounting depend on random scheduling
- * timeslices of a task to be interrupted or not by the timer.
- * Depending on these circumstances, the number of these interrupts
- * may be over or under-optimistic, matching the real user and system
- * cputime with a variable precision.
- *
- * Fix this by scaling these tick based values against the total
- * runtime accounted by the CFS scheduler.
- */
+ /* Serialize concurrent callers such that we can honour our guarantees */
+ raw_spin_lock_irqsave(&prev->lock, flags);
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
/*
- * Update userspace visible utime/stime values only if actual execution
- * time is bigger than already exported. Note that can happen, that we
- * provided bigger values due to scaling inaccuracy on big numbers.
+ * This is possible under two circumstances:
+ * - rtime isn't monotonic after all (a bug);
+ * - we got reordered by the lock.
+ *
+ * In both cases this acts as a filter such that the rest of the code
+ * can assume it is monotonic regardless of anything else.
*/
if (prev->stime + prev->utime >= rtime)
goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
if (utime == 0) {
stime = rtime;
- } else if (stime == 0) {
- utime = rtime;
- } else {
- cputime_t total = stime + utime;
+ goto update;
+ }
- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}
- cputime_advance(&prev->stime, stime);
- cputime_advance(&prev->utime, utime);
+ stime = scale_stime((__force u64)stime, (__force u64)rtime,
+ (__force u64)(stime + utime));
+
+ /*
+ * Make sure stime doesn't go backwards; this preserves monotonicity
+ * for utime because rtime is monotonic.
+ *
+ * utime_i+1 = rtime_i+1 - stime_i
+ * = rtime_i+1 - (rtime_i - utime_i)
+ * = (rtime_i+1 - rtime_i) + utime_i
+ * >= utime_i
+ */
+ if (stime < prev->stime)
+ stime = prev->stime;
+ utime = rtime - stime;
+
+ /*
+ * Make sure utime doesn't go backwards; this still preserves
+ * monotonicity for stime, analogous argument to above.
+ */
+ if (utime < prev->utime) {
+ utime = prev->utime;
+ stime = rtime - utime;
+ }
+update:
+ prev->stime = stime;
+ prev->utime = utime;
out:
*ut = prev->utime;
*st = prev->stime;
+ raw_spin_unlock_irqrestore(&prev->lock, flags);
}
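
The clamping logic above is easy to sanity-check numerically. The stand-alone sketch below uses made-up sample values and omits the utime==0/stime==0 special cases and the early-exit test; it shows that stime + utime always equals rtime while neither component ever decreases across successive samples.

#include <assert.h>
#include <stdio.h>

struct prev { unsigned long stime, utime; };

static void adjust(struct prev *p, unsigned long rtime,
		   unsigned long stime, unsigned long utime)
{
	/* Scale tick-based stime against the precise rtime, as scale_stime() does. */
	stime = (unsigned long)((unsigned long long)stime * rtime /
				(stime + utime));

	if (stime < p->stime)		/* stime must not go backwards... */
		stime = p->stime;
	utime = rtime - stime;

	if (utime < p->utime) {		/* ...and neither may utime */
		utime = p->utime;
		stime = rtime - utime;
	}
	p->stime = stime;
	p->utime = utime;
	assert(p->stime + p->utime == rtime);
}

int main(void)
{
	struct prev p = { 0, 0 };

	adjust(&p, 100, 30, 60);	/* scaled stime 33, utime 67 */
	printf("stime=%lu utime=%lu\n", p.stime, p.utime);
	adjust(&p, 140, 20, 90);	/* scaled stime 25 < 33, clamped to 33/107 */
	printf("stime=%lu utime=%lu\n", p.stime, p.utime);
	return 0;
}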
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0a17af35670a..fc8f01083527 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -953,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
/*
* Use the scheduling parameters of the top pi-waiter
- * task if we have one and its (relative) deadline is
+ * task if we have one and its (absolute) deadline is
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
@@ -1563,7 +1563,7 @@ out:
static void push_dl_tasks(struct rq *rq)
{
- /* Terminates as it moves a -deadline task */
+ /* push_dl_task() will return true if it moved a -deadline task */
while (push_dl_task(rq))
;
}
@@ -1657,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- has_pushable_dl_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
@@ -1669,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
- struct rq *rq;
struct root_domain *src_rd;
- int weight;
+ struct rq *rq;
BUG_ON(!dl_task(p));
@@ -1697,37 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- /*
- * Update only if the task is actually running (i.e.,
- * it is on the rq AND it is not throttled).
- */
- if (!on_dl_rq(&p->dl))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_dl_task(rq, p);
- BUG_ON(!rq->dl.dl_nr_migratory);
- rq->dl.dl_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_dl_task(rq, p);
- rq->dl.dl_nr_migratory++;
- }
-
- update_dl_migration(&rq->dl);
+ set_cpus_allowed_common(p, new_mask);
}
/* Assumes rq->lock is held */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4222ec50ab88..641511771ae6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
- if (!se) {
- struct sched_avg *avg = &cpu_rq(cpu)->avg;
- P(avg->runnable_avg_sum);
- P(avg->avg_period);
+ if (!se)
return;
- }
-
PN(se->exec_start);
PN(se->vruntime);
@@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
- P(se->avg.runnable_avg_sum);
- P(se->avg.running_avg_sum);
- P(se->avg.avg_period);
- P(se->avg.load_avg_contrib);
- P(se->avg.utilization_avg_contrib);
- P(se->avg.decay_count);
+ P(se->avg.load_avg);
+ P(se->avg.util_avg);
#endif
#undef PN
#undef P
@@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
+ SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
+ cfs_rq->avg.load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
- cfs_rq->blocked_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
- cfs_rq->utilization_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ cfs_rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
+ atomic_long_read(&cfs_rq->removed_load_avg));
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
+ atomic_long_read(&cfs_rq->removed_util_avg));
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
- cfs_rq->tg_load_contrib);
- SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
- cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
+ cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
- SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
- atomic_read(&cfs_rq->tg->runnable_avg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
- P(se.avg.runnable_avg_sum);
- P(se.avg.running_avg_sum);
- P(se.avg.avg_period);
- P(se.avg.load_avg_contrib);
- P(se.avg.utilization_avg_contrib);
- P(se.avg.decay_count);
+ P(se.avg.load_sum);
+ P(se.avg.util_sum);
+ P(se.avg.load_avg);
+ P(se.avg.util_avg);
+ P(se.avg.last_update_time);
#endif
P(policy);
P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d113c3ba8bc4..6e2e3483b1ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update);
-
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
- /* We should have no load, but we need to update last_decay. */
- update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
*/
static u64 __sched_period(unsigned long nr_running)
{
- u64 period = sysctl_sched_latency;
- unsigned long nr_latency = sched_nr_latency;
-
- if (unlikely(nr_running > nr_latency)) {
- period = sysctl_sched_min_granularity;
- period *= nr_running;
- }
-
- return period;
+ if (unlikely(nr_running > sched_nr_latency))
+ return nr_running * sysctl_sched_min_granularity;
+ else
+ return sysctl_sched_latency;
}
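
A minimal user-space sketch of the simplified __sched_period() logic above; the constants are assumed illustrative defaults (6 ms latency, 0.75 ms minimum granularity, sched_nr_latency = 8), not values taken from this patch:

#include <stdio.h>

/* Illustrative defaults; the real values are sysctls, assumed here. */
#define SYSCTL_SCHED_LATENCY          6000000ULL  /* 6 ms in ns    */
#define SYSCTL_SCHED_MIN_GRANULARITY   750000ULL  /* 0.75 ms in ns */
#define SCHED_NR_LATENCY                     8UL

static unsigned long long sched_period(unsigned long nr_running)
{
	if (nr_running > SCHED_NR_LATENCY)
		return nr_running * SYSCTL_SCHED_MIN_GRANULARITY;
	return SYSCTL_SCHED_LATENCY;
}

int main(void)
{
	/* 4 runnable tasks fit in one latency period; 16 stretch it. */
	printf("%llu %llu\n", sched_period(4), sched_period(16));
	return 0;
}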
/*
@@ -669,22 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
-static inline void __update_task_entity_contrib(struct sched_entity *se);
-static inline void __update_task_entity_utilization(struct sched_entity *se);
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
-/* Give new task start runnable values to heavy its load in infant time */
-void init_task_runnable_average(struct task_struct *p)
+/* Give a new sched_entity initial runnable values to weight its load heavily during its infancy */
+void init_entity_runnable_average(struct sched_entity *se)
{
- u32 slice;
+ struct sched_avg *sa = &se->avg;
- slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
- p->se.avg.avg_period = slice;
- __update_task_entity_contrib(&p->se);
- __update_task_entity_utilization(&p->se);
+ sa->last_update_time = 0;
+ /*
+ * sched_avg's period_contrib should be strictly less than 1024, so
+ * we give it 1023 to make sure it is almost a full period (1024us), and
+ * will definitely be updated once the entity is enqueued.
+ */
+ sa->period_contrib = 1023;
+ sa->load_avg = scale_load_down(se->load.weight);
+ sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+ sa->util_sum = LOAD_AVG_MAX;
+ /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
-void init_task_runnable_average(struct task_struct *p)
+void init_entity_runnable_average(struct sched_entity *se)
{
}
#endif
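
A rough user-space sketch of how the new init_entity_runnable_average() seeds the PELT fields; LOAD_AVG_MAX is taken from the defines added above, a NICE_0 weight of 1024 is assumed, and scale_load_down() is treated as a no-op for simplicity:

#include <stdio.h>

#define LOAD_AVG_MAX   47742	/* from the hunk above */
#define NICE_0_LOAD     1024	/* assumed nice-0 weight */

struct sched_avg_sketch {
	unsigned long long last_update_time, load_sum;
	unsigned int util_sum, period_contrib;
	unsigned long load_avg, util_avg;
};

static void init_entity_runnable_average_sketch(struct sched_avg_sketch *sa,
						unsigned long weight)
{
	sa->last_update_time = 0;	/* flags "never updated / just migrated" */
	sa->period_contrib = 1023;	/* almost a full 1024us period */
	sa->load_avg = weight;
	sa->load_sum = (unsigned long long)sa->load_avg * LOAD_AVG_MAX;
	sa->util_avg = NICE_0_LOAD;	/* start out fully "busy" */
	sa->util_sum = LOAD_AVG_MAX;
}

int main(void)
{
	struct sched_avg_sketch sa;

	init_entity_runnable_average_sketch(&sa, NICE_0_LOAD);
	printf("load_avg=%lu load_sum=%llu util_avg=%lu\n",
	       sa.load_avg, sa.load_sum, sa.util_avg);
	return 0;
}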
@@ -1415,8 +1420,9 @@ static bool numa_has_capacity(struct task_numa_env *env)
* --------------------- vs ---------------------
* src->compute_capacity dst->compute_capacity
*/
- if (src->load * dst->compute_capacity >
- dst->load * src->compute_capacity)
+ if (src->load * dst->compute_capacity * env->imbalance_pct >
+ dst->load * src->compute_capacity * 100)
return true;
return false;
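
The added condition is an integer cross-multiplication of two load/capacity ratios, biased by the domain's imbalance_pct; a small sketch comparing it against the equivalent floating-point form, with made-up numbers (imbalance_pct = 125 is assumed here, it is not stated in this hunk):

#include <stdbool.h>
#include <stdio.h>

static bool ratio_cmp(unsigned long src_load, unsigned long src_cap,
		      unsigned long dst_load, unsigned long dst_cap,
		      unsigned int imbalance_pct)
{
	/* integer form of: src_load/src_cap * imbalance_pct/100 > dst_load/dst_cap */
	return src_load * dst_cap * imbalance_pct > dst_load * src_cap * 100;
}

int main(void)
{
	unsigned long src_load = 90, dst_load = 100, cap = 1024;

	/* floating-point equivalent, for comparison only */
	bool f = (double)src_load / cap * 1.25 > (double)dst_load / cap;

	printf("int=%d float=%d\n",
	       ratio_cmp(src_load, cap, dst_load, cap, 125), f);
	return 0;
}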
@@ -1702,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.avg_period;
+ delta = p->se.avg.load_sum / p->se.load.weight;
+ *period = LOAD_AVG_MAX;
}
p->last_sum_exec_runtime = runtime;
@@ -2351,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
long tg_weight;
/*
- * Use this CPU's actual weight instead of the last load_contribution
- * to gain a more accurate current total weight. See
- * update_cfs_rq_load_contribution().
+ * Use this CPU's real-time load instead of the last load contribution,
+ * as the update of the contribution is delayed and we want the
+ * real-time load to calculate the shares. See update_tg_load_avg().
*/
tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_contrib;
- tg_weight += cfs_rq->load.weight;
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += cfs_rq_load_avg(cfs_rq);
return tg_weight;
}
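
A rough arithmetic sketch of the group-shares math these two hunks switch over to load averages; the numbers are made up purely to show how this CPU's stale published contribution is swapped for its real-time load:

#include <stdio.h>

int main(void)
{
	long tg_shares = 1024;		/* group's configured shares (assumed) */
	long tg_load_avg = 3000;	/* sum over all CPUs, last published    */
	long my_contrib = 1200;		/* what this CPU last published         */
	long my_load_avg = 900;		/* this CPU's current cfs_rq load_avg   */

	/* replace the stale published contribution with the fresh local load */
	long tg_weight = tg_load_avg - my_contrib + my_load_avg;
	long shares = tg_shares * my_load_avg;

	if (tg_weight)
		shares /= tg_weight;

	printf("tg_weight=%ld shares=%ld\n", tg_weight, shares);
	return 0;
}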
@@ -2367,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
long tg_weight, load, shares;
tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ load = cfs_rq_load_avg(cfs_rq);
shares = (tg->shares * load);
if (tg_weight)
@@ -2429,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -2485,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
local_n %= LOAD_AVG_PERIOD;
}
- val *= runnable_avg_yN_inv[local_n];
- /* We don't use SRR here since we always want to round down. */
- return val >> 32;
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ return val;
}
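
What decay_load() computes is val * y^n with y chosen so that y^32 = 1/2; a user-space sketch of the same 32.32 fixed-point trick, assuming a compiler with __uint128_t (the first table entries are copied from the hunk above, the rest of the table is truncated for brevity):

#include <stdio.h>
#include <stdint.h>

#define LOAD_AVG_PERIOD 32

/* runnable_avg_yN_inv[n] ~= y^n * 2^32, first entries as in the patch */
static const uint32_t runnable_avg_yN_inv[] = {
	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
};

static uint64_t decay_load(uint64_t val, unsigned int n)
{
	/* every 32 periods the value simply halves */
	val >>= n / LOAD_AVG_PERIOD;
	n %= LOAD_AVG_PERIOD;

	/* fractional part: multiply by y^n in 32.32 fixed point (table truncated) */
	if (n < sizeof(runnable_avg_yN_inv) / sizeof(runnable_avg_yN_inv[0]))
		val = ((__uint128_t)val * runnable_avg_yN_inv[n]) >> 32;
	return val;
}

int main(void)
{
	printf("%llu %llu\n",
	       (unsigned long long)decay_load(47742, 32),	/* halves     */
	       (unsigned long long)decay_load(47742, 1));	/* one period */
	return 0;
}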
/*
@@ -2546,23 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
- struct sched_avg *sa,
- int runnable,
- int running)
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
u64 delta, periods;
- u32 runnable_contrib;
+ u32 contrib;
int delta_w, decayed = 0;
unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
- delta = now - sa->last_runnable_update;
+ delta = now - sa->last_update_time;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
return 0;
}
@@ -2573,26 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
delta >>= 10;
if (!delta)
return 0;
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->avg_period % 1024;
+ delta_w = sa->period_contrib;
if (delta + delta_w >= 1024) {
- /* period roll-over */
decayed = 1;
+ /* whatever is left over for the next period starts from zero; we don't know it yet */
+ sa->period_contrib = 0;
+
/*
* Now that we know we're crossing a period boundary, figure
* out how much from delta we need to complete the current
* period and accrue it.
*/
delta_w = 1024 - delta_w;
- if (runnable)
- sa->runnable_avg_sum += delta_w;
+ if (weight) {
+ sa->load_sum += weight * delta_w;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * delta_w;
+ }
if (running)
- sa->running_avg_sum += delta_w * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += delta_w;
+ sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
delta -= delta_w;
@@ -2600,341 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
periods = delta / 1024;
delta %= 1024;
- sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
- periods + 1);
- sa->running_avg_sum = decay_load(sa->running_avg_sum,
- periods + 1);
- sa->avg_period = decay_load(sa->avg_period,
- periods + 1);
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods + 1);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
- runnable_contrib = __compute_runnable_contrib(periods);
- if (runnable)
- sa->runnable_avg_sum += runnable_contrib;
+ contrib = __compute_runnable_contrib(periods);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
if (running)
- sa->running_avg_sum += runnable_contrib * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += runnable_contrib;
+ sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
}
/* Remainder of delta accrued against u_0` */
- if (runnable)
- sa->runnable_avg_sum += delta;
+ if (weight) {
+ sa->load_sum += weight * delta;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * delta;
+ }
if (running)
- sa->running_avg_sum += delta * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += delta;
-
- return decayed;
-}
+ sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
-/* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline u64 __synchronize_entity_decay(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 decays = atomic64_read(&cfs_rq->decay_counter);
-
- decays -= se->avg.decay_count;
- se->avg.decay_count = 0;
- if (!decays)
- return 0;
+ sa->period_contrib += delta;
- se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.utilization_avg_contrib =
- decay_load(se->avg.utilization_avg_contrib, decays);
+ if (decayed) {
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ }
+ sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
+ }
- return decays;
+ return decayed;
}
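
The shape of the new __update_load_avg() in miniature: accumulate weight per elapsed microsecond, decay once per crossed 1024 us period, and derive the average by dividing the geometric sum by LOAD_AVG_MAX. This user-space sketch ignores frequency scaling and the cfs_rq side and only shows the sum-to-average relationship; the 0.978572 factor approximates 2^(-1/32):

#include <stdio.h>

#define LOAD_AVG_MAX 47742

/* crude y^n with y^32 = 1/2, doubles are fine for the sketch */
static double decay(double v, unsigned int n)
{
	while (n--)
		v *= 0.978572;
	return v;
}

int main(void)
{
	double load_sum = 0;
	unsigned long weight = 1024;	/* nice-0 weight, assumed */

	/* runnable for 200 full periods of 1024us each, decaying as we go */
	for (int p = 0; p < 200; p++)
		load_sum = decay(load_sum, 1) + weight * 1024;

	/* the average saturates towards weight as load_sum -> weight * LOAD_AVG_MAX */
	printf("load_avg ~= %.0f (weight %lu)\n", load_sum / LOAD_AVG_MAX, weight);
	return 0;
}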
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long tg_contrib;
-
- tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
- tg_contrib -= cfs_rq->tg_load_contrib;
-
- if (!tg_contrib)
- return;
-
- if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
- atomic_long_add(tg_contrib, &tg->load_avg);
- cfs_rq->tg_load_contrib += tg_contrib;
- }
-}
-
/*
- * Aggregate cfs_rq runnable averages into an equivalent task_group
- * representation for computing load contributions.
+ * The tg's load_avg needs to be updated before update_cfs_shares() uses it
+ * (which is done here) and before effective_load() uses it (which is not
+ * done, because it would be too costly).
*/
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
- struct task_group *tg = cfs_rq->tg;
- long contrib;
+ long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
- /* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->avg_period + 1);
- contrib -= cfs_rq->tg_runnable_contrib;
-
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
- atomic_add(contrib, &tg->runnable_avg);
- cfs_rq->tg_runnable_contrib += contrib;
+ if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
-static inline void __update_group_entity_contrib(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg = cfs_rq->tg;
- int runnable_avg;
-
- u64 contrib;
-
- contrib = cfs_rq->tg_load_contrib * tg->shares;
- se->avg.load_avg_contrib = div_u64(contrib,
- atomic_long_read(&tg->load_avg) + 1);
-
- /*
- * For group entities we need to compute a correction term in the case
- * that they are consuming <1 cpu so that we would contribute the same
- * load as a task of equal weight.
- *
- * Explicitly co-ordinating this measurement would be expensive, but
- * fortunately the sum of each cpus contribution forms a usable
- * lower-bound on the true value.
- *
- * Consider the aggregate of 2 contributions. Either they are disjoint
- * (and the sum represents true value) or they are disjoint and we are
- * understating by the aggregate of their overlap.
- *
- * Extending this to N cpus, for a given overlap, the maximum amount we
- * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
- * cpus that overlap for this interval and w_i is the interval width.
- *
- * On a small machine; the first term is well-bounded which bounds the
- * total error since w_i is a subset of the period. Whereas on a
- * larger machine, while this first term can be larger, if w_i is the
- * of consequential size guaranteed to see n_i*w_i quickly converge to
- * our upper bound of 1-cpu.
- */
- runnable_avg = atomic_read(&tg->runnable_avg);
- if (runnable_avg < NICE_0_LOAD) {
- se->avg.load_avg_contrib *= runnable_avg;
- se->avg.load_avg_contrib >>= NICE_0_SHIFT;
- }
-}
-
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
-{
- __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
- runnable, runnable);
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
-}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update) {}
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq) {}
-static inline void __update_group_entity_contrib(struct sched_entity *se) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_task_entity_contrib(struct sched_entity *se)
-{
- u32 contrib;
-
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
- contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.avg_period + 1);
- se->avg.load_avg_contrib = scale_load(contrib);
-}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-/* Compute the current contribution to load_avg by se, return any delta */
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
+/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- long old_contrib = se->avg.load_avg_contrib;
+ int decayed;
+ struct sched_avg *sa = &cfs_rq->avg;
- if (entity_is_task(se)) {
- __update_task_entity_contrib(se);
- } else {
- __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
- __update_group_entity_contrib(se);
+ if (atomic_long_read(&cfs_rq->removed_load_avg)) {
+ long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ sa->load_avg = max_t(long, sa->load_avg - r, 0);
+ sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
}
- return se->avg.load_avg_contrib - old_contrib;
-}
-
-
-static inline void __update_task_entity_utilization(struct sched_entity *se)
-{
- u32 contrib;
+ if (atomic_long_read(&cfs_rq->removed_util_avg)) {
+ long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ sa->util_avg = max_t(long, sa->util_avg - r, 0);
+ sa->util_sum = max_t(s32, sa->util_sum -
+ ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
+ }
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
- contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
- contrib /= (se->avg.avg_period + 1);
- se->avg.utilization_avg_contrib = scale_load(contrib);
-}
+ decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
-static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
-{
- long old_contrib = se->avg.utilization_avg_contrib;
-
- if (entity_is_task(se))
- __update_task_entity_utilization(se);
- else
- se->avg.utilization_avg_contrib =
- group_cfs_rq(se)->utilization_load_avg;
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->load_last_update_time_copy = sa->last_update_time;
+#endif
- return se->avg.utilization_avg_contrib - old_contrib;
+ return decayed;
}
-static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
- long load_contrib)
-{
- if (likely(load_contrib < cfs_rq->blocked_load_avg))
- cfs_rq->blocked_load_avg -= load_contrib;
- else
- cfs_rq->blocked_load_avg = 0;
-}
-
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
-/* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq)
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta, utilization_delta;
int cpu = cpu_of(rq_of(cfs_rq));
- u64 now;
+ u64 now = cfs_rq_clock_task(cfs_rq);
/*
- * For a group entity we need to use their owned cfs_rq_clock_task() in
- * case they are the parent of a throttled hierarchy.
+ * Track the task's load average so it can be carried to a new CPU after it
+ * migrates, and track the group sched_entity's load average for the
+ * task_h_load() calculation used during migration.
*/
- if (entity_is_task(se))
- now = cfs_rq_clock_task(cfs_rq);
- else
- now = cfs_rq_clock_task(group_cfs_rq(se));
-
- if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
- cfs_rq->curr == se))
- return;
-
- contrib_delta = __update_entity_load_avg_contrib(se);
- utilization_delta = __update_entity_utilization_avg_contrib(se);
-
- if (!update_cfs_rq)
- return;
+ __update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
- if (se->on_rq) {
- cfs_rq->runnable_load_avg += contrib_delta;
- cfs_rq->utilization_load_avg += utilization_delta;
- } else {
- subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
- }
+ if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ update_tg_load_avg(cfs_rq, 0);
}
-/*
- * Decay the load contributed by all blocked children and account this so that
- * their contribution may appropriately discounted when they wake up.
- */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+/* Add the load generated by se into cfs_rq's load average */
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
- u64 decays;
-
- decays = now - cfs_rq->last_decay;
- if (!decays && !force_update)
- return;
+ struct sched_avg *sa = &se->avg;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int migrated = 0, decayed;
- if (atomic_long_read(&cfs_rq->removed_load)) {
- unsigned long removed_load;
- removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
- subtract_blocked_load_contrib(cfs_rq, removed_load);
+ if (sa->last_update_time == 0) {
+ sa->last_update_time = now;
+ migrated = 1;
}
+ else {
+ __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
+ }
+
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
- if (decays) {
- cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
- decays);
- atomic64_add(decays, &cfs_rq->decay_counter);
- cfs_rq->last_decay = now;
+ cfs_rq->runnable_load_avg += sa->load_avg;
+ cfs_rq->runnable_load_sum += sa->load_sum;
+
+ if (migrated) {
+ cfs_rq->avg.load_avg += sa->load_avg;
+ cfs_rq->avg.load_sum += sa->load_sum;
+ cfs_rq->avg.util_avg += sa->util_avg;
+ cfs_rq->avg.util_sum += sa->util_sum;
}
- __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ if (decayed || migrated)
+ update_tg_load_avg(cfs_rq, 0);
}
-/* Add the load generated by se into cfs_rq's child load-average */
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup)
+/* Remove the runnable load generated by se from cfs_rq's runnable load average */
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * We track migrations using entity decay_count <= 0, on a wake-up
- * migration we use a negative decay count to track the remote decays
- * accumulated while sleeping.
- *
- * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
- * are seen by enqueue_entity_load_avg() as a migration with an already
- * constructed load_avg_contrib.
- */
- if (unlikely(se->avg.decay_count <= 0)) {
- se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
- if (se->avg.decay_count) {
- /*
- * In a wake-up migration we have to approximate the
- * time sleeping. This is because we can't synchronize
- * clock_task between the two cpus, and it is not
- * guaranteed to be read-safe. Instead, we can
- * approximate this using our carried decays, which are
- * explicitly atomically readable.
- */
- se->avg.last_runnable_update -= (-se->avg.decay_count)
- << 20;
- update_entity_load_avg(se, 0);
- /* Indicate that we're now synchronized and on-rq */
- se->avg.decay_count = 0;
- }
- wakeup = 0;
- } else {
- __synchronize_entity_decay(se);
- }
+ update_load_avg(se, 1);
- /* migrated tasks did not contribute to our blocked load */
- if (wakeup) {
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- update_entity_load_avg(se, 0);
- }
-
- cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
- cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+ cfs_rq->runnable_load_avg =
+ max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
+ cfs_rq->runnable_load_sum =
+ max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
/*
- * Remove se's load from this cfs_rq child load-average, if the entity is
- * transitioning to a blocked state we track its projected decay using
- * blocked_load_avg.
+ * The task first catches up with the cfs_rq, and then subtracts
+ * itself from the cfs_rq (the task must be off the queue by now).
*/
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep)
+void remove_entity_load_avg(struct sched_entity *se)
{
- update_entity_load_avg(se, 1);
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !sleep);
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 last_update_time_copy;
- cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
- cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
- if (sleep) {
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+ do {
+ last_update_time_copy = cfs_rq->load_last_update_time_copy;
+ smp_rmb();
+ last_update_time = cfs_rq->avg.last_update_time;
+ } while (last_update_time != last_update_time_copy);
+#else
+ last_update_time = cfs_rq->avg.last_update_time;
+#endif
+
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+ atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
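
On 32-bit kernels the 64-bit last_update_time cannot be read atomically, so the writer publishes a copy after a write barrier and the lockless reader loops until both values agree, which is the pattern remove_entity_load_avg() uses above. A stripped-down sketch of that read/write pairing, with C11 fences standing in for the kernel's smp_wmb()/smp_rmb() and the field names invented for the example:

#include <stdint.h>
#include <stdatomic.h>

struct u64_publish {
	_Atomic uint64_t value;		/* updated first, under the updater's lock */
	_Atomic uint64_t value_copy;	/* published second, for lockless readers  */
};

static void publish(struct u64_publish *p, uint64_t v)
{
	atomic_store_explicit(&p->value, v, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
	atomic_store_explicit(&p->value_copy, v, memory_order_relaxed);
}

static uint64_t read_consistent(struct u64_publish *p)
{
	uint64_t copy, val;

	do {
		copy = atomic_load_explicit(&p->value_copy, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
		val = atomic_load_explicit(&p->value, memory_order_relaxed);
	} while (val != copy);	/* raced with publish(); retry */

	return val;
}

int main(void)
{
	static struct u64_publish p;

	publish(&p, 12345);
	return read_consistent(&p) != 12345;
}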
/*
@@ -2944,7 +2788,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
*/
void idle_enter_fair(struct rq *this_rq)
{
- update_rq_runnable_avg(this_rq, 1);
}
/*
@@ -2954,24 +2797,28 @@ void idle_enter_fair(struct rq *this_rq)
*/
void idle_exit_fair(struct rq *this_rq)
{
- update_rq_runnable_avg(this_rq, 0);
+}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->runnable_load_avg;
+}
+
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.load_avg;
}
static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup) {}
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update) {}
+static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline int idle_balance(struct rq *rq)
{
@@ -3103,7 +2950,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+ enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3178,7 +3025,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
+ dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -3268,7 +3115,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_entity_load_avg(se, 1);
+ update_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -3368,7 +3215,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_entity_load_avg(prev, 1);
+ update_load_avg(prev, 0);
}
cfs_rq->curr = NULL;
}
@@ -3384,8 +3231,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_entity_load_avg(curr, 1);
- update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_load_avg(curr, 1);
update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
@@ -4258,14 +4104,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
+ update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
}
- if (!se) {
- update_rq_runnable_avg(rq, rq->nr_running);
+ if (!se)
add_nr_running(rq, 1);
- }
+
hrtick_update(rq);
}
@@ -4319,14 +4164,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
+ update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
}
- if (!se) {
+ if (!se)
sub_nr_running(rq, 1);
- update_rq_runnable_avg(rq, 1);
- }
+
hrtick_update(rq);
}
@@ -4439,6 +4283,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
sched_avg_update(this_rq);
}
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+}
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
@@ -4460,7 +4310,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
static void update_idle_cpu_load(struct rq *this_rq)
{
unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long load = this_rq->cfs.runnable_load_avg;
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
unsigned long pending_updates;
/*
@@ -4506,7 +4356,7 @@ void update_cpu_load_nohz(void)
*/
void update_cpu_load_active(struct rq *this_rq)
{
- unsigned long load = this_rq->cfs.runnable_load_avg;
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
/*
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
@@ -4514,12 +4364,6 @@ void update_cpu_load_active(struct rq *this_rq)
__update_cpu_load(this_rq, load, 1);
}
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
- return cpu_rq(cpu)->cfs.runnable_load_avg;
-}
-
/*
* Return a low guess at the load of a migration-source cpu weighted
* according to the scheduling class and "nice" value.
@@ -4567,7 +4411,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = rq->cfs.runnable_load_avg;
+ unsigned long load_avg = weighted_cpuload(cpu);
if (nr_running)
return load_avg / nr_running;
@@ -4686,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
/*
* w = rw_i + @wl
*/
- w = se->my_q->load.weight + wl;
+ w = cfs_rq_load_avg(se->my_q) + wl;
/*
* wl = S * s'_i; see (2)
@@ -4707,7 +4551,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
/*
* wl = dw_i = S * (s'_i - s_i); see (3)
*/
- wl -= se->load.weight;
+ wl -= se->avg.load_avg;
/*
* Recursively apply this logic to all parent groups to compute
@@ -4730,26 +4574,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
#endif
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees. In order
+ * to determine whether we should let the load spread vs consolidating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher flip frequency in the other.
+ * With both conditions met, we can be relatively sure that the relationship
+ * is non-monogamous, with partner count exceeding socket size. Whether the
+ * waker/wakee pair is client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant: the spread criterion is simply that the apparent
+ * partner count exceeds the socket size.
+ */
static int wake_wide(struct task_struct *p)
{
+ unsigned int master = current->wakee_flips;
+ unsigned int slave = p->wakee_flips;
int factor = this_cpu_read(sd_llc_size);
- /*
- * Yeah, it's the switching-frequency, could means many wakee or
- * rapidly switch, use factor here will just help to automatically
- * adjust the loose-degree, so bigger node will lead to more pull.
- */
- if (p->wakee_flips > factor) {
- /*
- * wakee is somewhat hot, it needs certain amount of cpu
- * resource, so if waker is far more hot, prefer to leave
- * it alone.
- */
- if (current->wakee_flips > (factor * p->wakee_flips))
- return 1;
- }
-
- return 0;
+ if (master < slave)
+ swap(master, slave);
+ if (slave < factor || master < slave * factor)
+ return 0;
+ return 1;
}
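
A compact sketch of the reworked wake_wide() decision, with an LLC size of 8 assumed for the example (the real factor comes from the scheduler domain topology):

#include <stdio.h>

static int wake_wide(unsigned int waker_flips, unsigned int wakee_flips,
		     unsigned int llc_size)
{
	unsigned int master = waker_flips, slave = wakee_flips;

	if (master < slave) {		/* make master the higher flipper */
		unsigned int tmp = master;
		master = slave;
		slave = tmp;
	}
	/* wide wake-up only if both flip often enough and the ratio is ~N:1 */
	if (slave < llc_size || master < slave * llc_size)
		return 0;
	return 1;
}

int main(void)
{
	/* a dispatcher flipping 100x against a worker flipping 10x, llc of 8 */
	printf("%d\n", wake_wide(100, 10, 8));	/* 1: spread, don't wake-affine */
	/* a 1:1 pair of partners */
	printf("%d\n", wake_wide(12, 10, 8));	/* 0: keep them together */
	return 0;
}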
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4761,13 +4608,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long weight;
int balanced;
- /*
- * If we wake multiple tasks be careful to not bounce
- * ourselves around too much.
- */
- if (wake_wide(p))
- return 0;
-
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -4781,14 +4621,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
*/
if (sync) {
tg = task_group(current);
- weight = current->se.load.weight;
+ weight = current->se.avg.load_avg;
this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
- weight = p->se.load.weight;
+ weight = p->se.avg.load_avg;
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -4981,12 +4821,12 @@ done:
* tasks. The unit of the return value must be the one of capacity so we can
* compare the usage with the capacity of the CPU that is available for CFS
* task (ie cpu_capacity).
- * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
+ * cfs.avg.util_avg is the sum of running time of runnable tasks on a
* CPU. It represents the amount of utilization of a CPU in the range
* [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
* capacity of the CPU because it's about the running time on this CPU.
- * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
- * because of unfortunate rounding in avg_period and running_load_avg or just
+ * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
+ * because of unfortunate rounding in util_avg or just
* after migrating tasks until the average stabilizes with the new running
* time. So we need to check that the usage stays into the range
* [0..cpu_capacity_orig] and cap if necessary.
@@ -4995,7 +4835,7 @@ done:
*/
static int get_cpu_usage(int cpu)
{
- unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+ unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
if (usage >= SCHED_LOAD_SCALE)
@@ -5021,17 +4861,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int new_cpu = cpu;
+ int new_cpu = prev_cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE)
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
- continue;
+ break;
/*
* If both cpu and prev_cpu are part of this domain,
@@ -5045,17 +4885,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (tmp->flags & sd_flag)
sd = tmp;
+ else if (!want_affine)
+ break;
}
- if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
-
- if (sd_flag & SD_BALANCE_WAKE) {
- new_cpu = select_idle_sibling(p, prev_cpu);
- goto unlock;
+ if (affine_sd) {
+ sd = NULL; /* Prefer wake_affine over balance flags */
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ new_cpu = cpu;
}
- while (sd) {
+ if (!sd) {
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ new_cpu = select_idle_sibling(p, new_cpu);
+
+ } else while (sd) {
struct sched_group *group;
int weight;
@@ -5089,7 +4933,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
/* while loop will break here if sd == NULL */
}
-unlock:
rcu_read_unlock();
return new_cpu;
@@ -5101,26 +4944,27 @@ unlock:
* previous cpu. However, the caller only guarantees p->pi_lock is held; no
* other assumptions, including the state of rq->lock, should be made.
*/
-static void
-migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
/*
- * Load tracking: accumulate removed load so that it can be processed
- * when we next update owning cfs_rq under rq->lock. Tasks contribute
- * to blocked load iff they have a positive decay-count. It can never
- * be negative here since on-rq tasks have decay-count == 0.
+ * We are supposed to update the task to the "current" time, so that it is
+ * up to date and ready to go to the new CPU/cfs_rq. But we have no easy way
+ * of knowing what the current time is here, so simply throw away the
+ * out-of-date time. This leaves the wakee task slightly less decayed, but
+ * giving the wakee a bit more load is not a bad trade-off.
*/
- if (se->avg.decay_count) {
- se->avg.decay_count = -__synchronize_entity_decay(se);
- atomic_long_add(se->avg.load_avg_contrib,
- &cfs_rq->removed_load);
- }
+ remove_entity_load_avg(&p->se);
+
+ /* Tell new CPU we are migrated */
+ p->se.avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
- se->exec_start = 0;
+ p->se.exec_start = 0;
+}
+
+static void task_dead_fair(struct task_struct *p)
+{
+ remove_entity_load_avg(&p->se);
}
#endif /* CONFIG_SMP */
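
The "last_update_time == 0" convention ties migrate_task_rq_fair() above to enqueue_entity_load_avg() earlier in this patch: the departing CPU hands the residual averages to removed_load_avg/removed_util_avg, and the zeroed timestamp tells the destination to attach the entity's averages instead of decaying them. A tiny user-space sketch of that hand-off, with field names mirroring the patch but the structs and numbers made up:

#include <stdio.h>

struct avg   { unsigned long load_avg, util_avg; unsigned long long last_update_time; };
struct cfsrq { struct avg avg; long removed_load_avg, removed_util_avg; };

static void migrate_out(struct cfsrq *src, struct avg *se)
{
	/* kernel: remove_entity_load_avg() followed by last_update_time = 0 */
	src->removed_load_avg += se->load_avg;
	src->removed_util_avg += se->util_avg;
	se->last_update_time = 0;
}

static void enqueue_in(struct cfsrq *dst, struct avg *se, unsigned long long now)
{
	if (se->last_update_time == 0) {	/* freshly migrated (or new) */
		se->last_update_time = now;
		dst->avg.load_avg += se->load_avg;	/* attach, don't decay */
		dst->avg.util_avg += se->util_avg;
	}
}

int main(void)
{
	struct cfsrq src = { { 900, 300, 0 }, 0, 0 }, dst = { { 0, 0, 0 }, 0, 0 };
	struct avg se = { 200, 100, 5000 };

	migrate_out(&src, &se);
	enqueue_in(&dst, &se, 6000);
	printf("src removed=%ld dst load_avg=%lu\n",
	       src.removed_load_avg, dst.avg.load_avg);
	return 0;
}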
@@ -5670,72 +5514,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
#ifdef CONFIG_NUMA_BALANCING
/*
- * Returns true if the destination node is the preferred node.
- * Needs to match fbq_classify_rq(): if there is a runnable task
- * that is not on its preferred node, we should identify it.
+ * Returns 1 if task migration degrades locality.
+ * Returns 0 if task migration improves locality, i.e. migration is preferred.
+ * Returns -1 if task migration is not affected by locality.
*/
-static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
- !(env->sd->flags & SD_NUMA)) {
- return false;
- }
-
- src_nid = cpu_to_node(env->src_cpu);
- dst_nid = cpu_to_node(env->dst_cpu);
-
- if (src_nid == dst_nid)
- return false;
-
- /* Encourage migration to the preferred node. */
- if (dst_nid == p->numa_preferred_nid)
- return true;
-
- /* Migrating away from the preferred node is bad. */
- if (src_nid == p->numa_preferred_nid)
- return false;
-
- if (numa_group) {
- src_faults = group_faults(p, src_nid);
- dst_faults = group_faults(p, dst_nid);
- } else {
- src_faults = task_faults(p, src_nid);
- dst_faults = task_faults(p, dst_nid);
- }
-
- return dst_faults > src_faults;
-}
-
-
-static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
-{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
- unsigned long src_faults, dst_faults;
- int src_nid, dst_nid;
-
- if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
- return false;
-
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return false;
+ return -1;
+
+ if (!sched_feat(NUMA))
+ return -1;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
- return false;
+ return -1;
- /* Migrating away from the preferred node is bad. */
- if (src_nid == p->numa_preferred_nid)
- return true;
+ /* Migrating away from the preferred node is always bad. */
+ if (src_nid == p->numa_preferred_nid) {
+ if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
+ return 1;
+ else
+ return -1;
+ }
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return false;
+ return 0;
if (numa_group) {
src_faults = group_faults(p, src_nid);
@@ -5749,16 +5560,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
}
#else
-static inline bool migrate_improves_locality(struct task_struct *p,
+static inline int migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return false;
-}
-
-static inline bool migrate_degrades_locality(struct task_struct *p,
- struct lb_env *env)
-{
- return false;
+ return -1;
}
#endif
@@ -5768,7 +5573,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot = 0;
+ int tsk_cache_hot;
lockdep_assert_held(&env->src_rq->lock);
@@ -5826,13 +5631,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, env);
- if (!tsk_cache_hot)
- tsk_cache_hot = migrate_degrades_locality(p, env);
+ tsk_cache_hot = migrate_degrades_locality(p, env);
+ if (tsk_cache_hot == -1)
+ tsk_cache_hot = task_hot(p, env);
- if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+ if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot) {
+ if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
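
The locality check now answers with a tri-state (-1 "don't care", 0 "migration helps", 1 "migration hurts") and only falls back to cache hotness when it does not care; a condensed decision sketch with hypothetical stub helpers standing in for migrate_degrades_locality() and task_hot(), schedstat bookkeeping omitted:

#include <stdbool.h>
#include <stdio.h>

/* -1: locality does not apply, 0: migration improves it, 1: it degrades it */
static int migrate_degrades_locality_stub(int verdict) { return verdict; }
static bool task_hot_stub(bool hot)                    { return hot; }

static bool may_migrate(int locality_verdict, bool cache_hot,
			unsigned int failed, unsigned int nice_tries)
{
	int tsk_cache_hot = migrate_degrades_locality_stub(locality_verdict);

	if (tsk_cache_hot == -1)
		tsk_cache_hot = task_hot_stub(cache_hot);

	/* migrate if it helps or is neutral, or if balancing kept failing */
	return tsk_cache_hot <= 0 || failed > nice_tries;
}

int main(void)
{
	printf("%d\n", may_migrate(-1, true, 0, 3));	/* 0: hot, few failures   */
	printf("%d\n", may_migrate(0, true, 0, 3));	/* 1: locality wins       */
	printf("%d\n", may_migrate(1, false, 5, 3));	/* 1: forced after fails  */
	return 0;
}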
@@ -5906,6 +5711,13 @@ static int detach_tasks(struct lb_env *env)
return 0;
while (!list_empty(tasks)) {
+ /*
+ * We don't want to steal all the tasks, otherwise we may be treated
+ * likewise, which could at worst lead to a livelock.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ break;
+
p = list_first_entry(tasks, struct task_struct, se.group_node);
env->loop++;
@@ -6015,39 +5827,6 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * update tg->load_weight by folding this cpu's load_avg
- */
-static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
-{
- struct sched_entity *se = tg->se[cpu];
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
-
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- return;
-
- update_cfs_rq_blocked_load(cfs_rq, 1);
-
- if (se) {
- update_entity_load_avg(se, 1);
- /*
- * We pivot on our runnable average having decayed to zero for
- * list removal. This generally implies that all our children
- * have also been removed (modulo rounding error or bandwidth
- * control); however, such cases are rare and we can fix these
- * at enqueue.
- *
- * TODO: fix up out-of-order children on enqueue.
- */
- if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
- list_del_leaf_cfs_rq(cfs_rq);
- } else {
- struct rq *rq = rq_of(cfs_rq);
- update_rq_runnable_avg(rq, rq->nr_running);
- }
-}
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6056,19 +5835,19 @@ static void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
+
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /*
- * Note: We may want to consider periodically releasing
- * rq->lock about these updates so that creating many task
- * groups does not result in continually extending hold time.
- */
- __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
- }
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ continue;
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ update_tg_load_avg(cfs_rq, 0);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6096,14 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
}
if (!se) {
- cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
cfs_rq->last_h_load_update = now;
}
while ((se = cfs_rq->h_load_next) != NULL) {
load = cfs_rq->h_load;
- load = div64_ul(load * se->avg.load_avg_contrib,
- cfs_rq->runnable_load_avg + 1);
+ load = div64_ul(load * se->avg.load_avg,
+ cfs_rq_load_avg(cfs_rq) + 1);
cfs_rq = group_cfs_rq(se);
cfs_rq->h_load = load;
cfs_rq->last_h_load_update = now;
@@ -6115,17 +5894,25 @@ static unsigned long task_h_load(struct task_struct *p)
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_cfs_rq_h_load(cfs_rq);
- return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
- cfs_rq->runnable_load_avg + 1);
+ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+ cfs_rq_load_avg(cfs_rq) + 1);
}
#else
static inline void update_blocked_averages(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
static unsigned long task_h_load(struct task_struct *p)
{
- return p->se.avg.load_avg_contrib;
+ return p->se.avg.load_avg;
}
#endif
@@ -8025,8 +7812,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (numabalancing_enabled)
task_tick_numa(rq, curr);
-
- update_rq_runnable_avg(rq, 1);
}
/*
@@ -8125,15 +7910,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
}
#ifdef CONFIG_SMP
- /*
- * Remove our load from contribution when we leave sched_fair
- * and ensure we don't carry in an old decay_count if we
- * switch back.
- */
- if (se->avg.decay_count) {
- __synchronize_entity_decay(se);
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- }
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
+ se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
+
+ cfs_rq->avg.load_avg =
+ max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+ cfs_rq->avg.load_sum =
+ max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+ cfs_rq->avg.util_avg =
+ max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+ cfs_rq->avg.util_sum =
+ max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
#endif
}
@@ -8142,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Since the real-depth could have been changed (only FAIR
* class maintain depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!task_on_rq_queued(p))
+
+ if (!task_on_rq_queued(p)) {
+
+ /*
+ * Ensure the task has a non-normalized vruntime when it is switched
+ * back to the fair class with !queued, so that enqueue_entity() at
+ * wake-up time will do the right thing.
+ *
+ * If it's queued, then enqueue_entity(.flags=0) gives the task a
+ * non-normalized vruntime; if it's !queued, then it still has a
+ * normalized vruntime.
+ */
+ if (p->state != TASK_RUNNING)
+ se->vruntime += cfs_rq_of(se)->min_vruntime;
return;
+ }
/*
* We were most likely switched from sched_rt, so
@@ -8190,8 +7993,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
- atomic64_set(&cfs_rq->decay_counter, 1);
- atomic_long_set(&cfs_rq->removed_load, 0);
+ atomic_long_set(&cfs_rq->removed_load_avg, 0);
+ atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
}
@@ -8236,14 +8039,14 @@ static void task_move_group_fair(struct task_struct *p, int queued)
if (!queued) {
cfs_rq = cfs_rq_of(se);
se->vruntime += cfs_rq->min_vruntime;
+
#ifdef CONFIG_SMP
- /*
- * migrate_task_rq_fair() will have removed our previous
- * contribution, but we must synchronize for ongoing future
- * decay.
- */
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ /* Virtually synchronize task with its new cfs_rq */
+ p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
+ cfs_rq->avg.load_avg += p->se.avg.load_avg;
+ cfs_rq->avg.load_sum += p->se.avg.load_sum;
+ cfs_rq->avg.util_avg += p->se.avg.util_avg;
+ cfs_rq->avg.util_sum += p->se.avg.util_sum;
#endif
}
}
@@ -8257,8 +8060,11 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se)
+ if (tg->se) {
+ if (tg->se[i])
+ remove_entity_load_avg(tg->se[i]);
kfree(tg->se[i]);
+ }
}
kfree(tg->cfs_rq);
@@ -8295,6 +8101,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ init_entity_runnable_average(se);
}
return 1;
@@ -8444,6 +8251,8 @@ const struct sched_class fair_sched_class = {
.rq_offline = rq_offline_fair,
.task_waking = task_waking_fair,
+ .task_dead = task_dead_fair,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 91e33cd485f6..83a50e7ca533 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -79,20 +79,12 @@ SCHED_FEAT(LB_MIN, false)
* numa_balancing=
*/
#ifdef CONFIG_NUMA_BALANCING
-SCHED_FEAT(NUMA, false)
/*
- * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
- * higher number of hinting faults are recorded during active load
- * balancing.
+ * NUMA will favor moving tasks towards nodes where a higher number of
+ * hinting faults are recorded during active load balancing. It will
+ * resist moving tasks towards nodes where a lower number of hinting
+ * faults have been recorded.
*/
-SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
-
-/*
- * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
- * lower number of hinting faults have been recorded. As this has
- * the potential to prevent a task ever migrating to a new node
- * due to CPU overload it is disabled by default.
- */
-SCHED_FEAT(NUMA_RESIST_LOWER, false)
+SCHED_FEAT(NUMA, true)
#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 594275ed2620..8f177c73ae19 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -83,10 +83,13 @@ void __weak arch_cpu_idle(void)
*/
void default_idle_call(void)
{
- if (current_clr_polling_and_test())
+ if (current_clr_polling_and_test()) {
local_irq_enable();
- else
+ } else {
+ stop_critical_timings();
arch_cpu_idle();
+ start_critical_timings();
+ }
}
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
@@ -141,12 +144,6 @@ static void cpuidle_idle_call(void)
}
/*
- * During the idle period, stop measuring the disabled irqs
- * critical sections latencies
- */
- stop_critical_timings();
-
- /*
* Tell the RCU framework we are entering an idle section,
* so no more rcu read side critical sections and one more
* step to the grace period
@@ -198,7 +195,6 @@ exit_idle:
local_irq_enable();
rcu_idle_exit();
- start_critical_timings();
}
DEFINE_PER_CPU(bool, cpu_dead_idle);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c65dac8c97cd..c4ae0f1fdf9b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0d193a243e96..d2ea59364a1c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2069,7 +2069,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- has_pushable_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
(rq->curr->nr_cpus_allowed < 2 ||
@@ -2077,45 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
push_rt_tasks(rq);
}
-static void set_cpus_allowed_rt(struct task_struct *p,
- const struct cpumask *new_mask)
-{
- struct rq *rq;
- int weight;
-
- BUG_ON(!rt_task(p));
-
- if (!task_on_rq_queued(p))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- rq = task_rq(p);
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_task(rq, p);
- BUG_ON(!rq->rt.rt_nr_migratory);
- rq->rt.rt_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_task(rq, p);
- rq->rt.rt_nr_migratory++;
- }
-
- update_rt_migration(&rq->rt);
-}
-
/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
@@ -2324,7 +2284,7 @@ const struct sched_class rt_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
- .set_cpus_allowed = set_cpus_allowed_rt,
+ .set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 84d48790bb6d..68cda117574c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -245,7 +245,6 @@ struct task_group {
#ifdef CONFIG_SMP
atomic_long_t load_avg;
- atomic_t runnable_avg;
#endif
#endif
@@ -366,27 +365,20 @@ struct cfs_rq {
#ifdef CONFIG_SMP
/*
- * CFS Load tracking
- * Under CFS, load is tracked on a per-entity basis and aggregated up.
- * This allows for the description of both thread and group usage (in
- * the FAIR_GROUP_SCHED case).
- * runnable_load_avg is the sum of the load_avg_contrib of the
- * sched_entities on the rq.
- * blocked_load_avg is similar to runnable_load_avg except that its
- * the blocked sched_entities on the rq.
- * utilization_load_avg is the sum of the average running time of the
- * sched_entities on the rq.
+ * CFS load tracking
*/
- unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
- atomic64_t decay_counter;
- u64 last_decay;
- atomic_long_t removed_load;
-
+ struct sched_avg avg;
+ u64 runnable_load_sum;
+ unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* Required to track per-cpu representation of a task_group */
- u32 tg_runnable_contrib;
- unsigned long tg_load_contrib;
+ unsigned long tg_load_avg_contrib;
+#endif
+ atomic_long_t removed_load_avg, removed_util_avg;
+#ifndef CONFIG_64BIT
+ u64 load_last_update_time_copy;
+#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* h_load = weight * f(tg)
*
@@ -595,8 +587,6 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-
- struct sched_avg avg;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -1065,9 +1055,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
-#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
@@ -1268,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+
#else
static inline void idle_enter_fair(struct rq *rq) { }
@@ -1319,7 +1308,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
-extern void init_task_runnable_average(struct task_struct *p);
+extern void init_entity_runnable_average(struct sched_entity *se);
static inline void add_nr_running(struct rq *rq, unsigned count)
{
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 79ffec45a6ac..cbc67da10954 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_stop,
diff --git a/kernel/signal.c b/kernel/signal.c
index 836df8dac6cc..0f6bbbe77b46 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2748,12 +2748,15 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
- if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
+ if (from->si_signo == SIGBUS &&
+ (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO))
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
#ifdef SEGV_BNDERR
- err |= __put_user(from->si_lower, &to->si_lower);
- err |= __put_user(from->si_upper, &to->si_upper);
+ if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
+ err |= __put_user(from->si_lower, &to->si_lower);
+ err |= __put_user(from->si_upper, &to->si_upper);
+ }
#endif
break;
case __SI_CHLD:
@@ -3017,7 +3020,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info;
+ siginfo_t info = {};
int ret = copy_siginfo_from_user32(&info, uinfo);
if (unlikely(ret))
return ret;
@@ -3061,7 +3064,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
int, sig,
struct compat_siginfo __user *, uinfo)
{
- siginfo_t info;
+ siginfo_t info = {};
if (copy_siginfo_from_user32(&info, uinfo))
return -EFAULT;
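Both compat paths now start from a zeroed siginfo_t, so fields that copy_siginfo_from_user32() does not populate cannot leak stale kernel stack contents back out through the siginfo plumbing. A stand-alone illustration of the difference (plain userspace C with a hypothetical struct, not kernel code):

#include <stdio.h>

struct fake_siginfo {
	int si_signo;
	int si_code;
	long si_value;	/* imagine the copy-in routine never writes this */
};

static void fill_partial(struct fake_siginfo *si)
{
	si->si_signo = 10;
	si->si_code  = 1;
	/* si_value deliberately left untouched */
}

int main(void)
{
	struct fake_siginfo zeroed = {0};	/* like "siginfo_t info = {};" */
	struct fake_siginfo stale;		/* like the old, uninitialized code */

	fill_partial(&zeroed);
	fill_partial(&stale);
	printf("zeroed: si_value = %ld\n", zeroed.si_value);	/* always 0 */
	/* stale.si_value is indeterminate -- whatever was on the stack */
	printf("stale:  si_value = %ld\n", stale.si_value);
	return 0;
}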
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fd643d8c4b42..12484e5d5c88 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,13 +35,16 @@ struct cpu_stop_done {
/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
+ struct task_struct *thread;
+
spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
+
+ struct cpu_stop_work stop_work; /* for stop_cpus */
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
-static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
/*
@@ -74,7 +77,6 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
unsigned long flags;
@@ -82,7 +84,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
if (stopper->enabled) {
list_add_tail(&work->list, &stopper->works);
- wake_up_process(p);
+ wake_up_process(stopper->thread);
} else
cpu_stop_signal_done(work->done, false);
@@ -139,7 +141,7 @@ enum multi_stop_state {
};
struct multi_stop_data {
- int (*fn)(void *);
+ cpu_stop_fn_t fn;
void *data;
/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
unsigned int num_threads;
@@ -293,7 +295,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
-static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
static void queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
@@ -302,22 +303,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
struct cpu_stop_work *work;
unsigned int cpu;
- /* initialize works and done */
- for_each_cpu(cpu, cpumask) {
- work = &per_cpu(stop_cpus_work, cpu);
- work->fn = fn;
- work->arg = arg;
- work->done = done;
- }
-
/*
* Disable preemption while queueing to avoid getting
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
lg_global_lock(&stop_cpus_lock);
- for_each_cpu(cpu, cpumask)
- cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
+ for_each_cpu(cpu, cpumask) {
+ work = &per_cpu(cpu_stopper.stop_work, cpu);
+ work->fn = fn;
+ work->arg = arg;
+ work->done = done;
+ cpu_stop_queue_work(cpu, work);
+ }
lg_global_unlock(&stop_cpus_lock);
}
@@ -458,19 +456,21 @@ extern void sched_set_stop_task(int cpu, struct task_struct *stop);
static void cpu_stop_create(unsigned int cpu)
{
- sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
+ sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
}
static void cpu_stop_park(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct cpu_stop_work *work;
+ struct cpu_stop_work *work, *tmp;
unsigned long flags;
/* drain remaining works */
spin_lock_irqsave(&stopper->lock, flags);
- list_for_each_entry(work, &stopper->works, list)
+ list_for_each_entry_safe(work, tmp, &stopper->works, list) {
+ list_del_init(&work->list);
cpu_stop_signal_done(work->done, false);
+ }
stopper->enabled = false;
spin_unlock_irqrestore(&stopper->lock, flags);
}
@@ -485,7 +485,7 @@ static void cpu_stop_unpark(unsigned int cpu)
}
static struct smp_hotplug_thread cpu_stop_threads = {
- .store = &cpu_stopper_task,
+ .store = &cpu_stopper.thread,
.thread_should_run = cpu_stop_should_run,
.thread_fn = cpu_stopper_thread,
.thread_comm = "migration/%u",
@@ -515,7 +515,7 @@ early_initcall(cpu_stop_init);
#ifdef CONFIG_STOP_MACHINE
-int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
struct multi_stop_data msdata = {
.fn = fn,
@@ -548,7 +548,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}
-int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
int ret;
@@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(stop_machine);
* 0 if all executions of @fn returned 0, any non zero return value if any
* returned non zero.
*/
-int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
struct multi_stop_data msdata = { .fn = fn, .data = data,
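With stop_machine(), __stop_machine() and stop_machine_from_inactive_cpu() all taking cpu_stop_fn_t, callers and the stopper core now agree on one callback signature. A usage sketch under the assumption that the typedef matches the declarations above (the callback name is illustrative):

/* Mirrors the typedef in <linux/stop_machine.h>: int (*)(void *). */
static int apply_text_patch(void *arg)
{
	/* every other online CPU is spinning in its stopper thread here,
	 * so code/data can be patched without concurrent execution */
	return 0;
}

/* run on any one online CPU while the rest are held (NULL cpumask): */
/*	err = stop_machine(apply_text_patch, NULL, NULL);		 */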
diff --git a/kernel/sys.c b/kernel/sys.c
index 259fda25eb6b..fa2f2f671a5c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
* overall picture.
*/
err = -EACCES;
- if (!S_ISREG(inode->i_mode) ||
- exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
goto exit;
err = inode_permission(inode, MAY_EXEC);
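prctl_set_mm_exe_file() now defers to the VFS helper path_noexec() instead of open-coding the MNT_NOEXEC test. Presumably the helper also honours a superblock-wide no-exec marking, roughly along these lines (a sketch, not the authoritative definition):

static inline bool path_noexec(const struct path *path)
{
	/* reject exec for noexec mounts and for filesystems that are
	 * flagged no-exec as a whole */
	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}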
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 579ce1b929af..4008d9f95dd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -92,12 +92,10 @@ config NO_HZ_FULL
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
# We need at least one periodic CPU for timekeeping
depends on SMP
- # RCU_USER_QS dependency
depends on HAVE_CONTEXT_TRACKING
# VIRT_CPU_ACCOUNTING_GEN dependency
depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
select NO_HZ_COMMON
- select RCU_USER_QS
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
select IRQ_WORK
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c7ae4b641c4..457a373e2181 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -183,7 +183,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
if (pinned || !base->migration_enabled)
- return this_cpu_ptr(&hrtimer_bases);
+ return base;
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
}
#else
@@ -191,23 +191,32 @@ static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
- return this_cpu_ptr(&hrtimer_bases);
+ return base;
}
#endif
/*
- * Switch the timer base to the current CPU when possible.
+ * We switch the timer base to a power-optimized target CPU,
+ * if:
+ * - NO_HZ_COMMON is enabled
+ * - timer migration is enabled
+ * - the timer callback is not running
+ * - the timer is not the first expiring timer on the new target
+ *
+ * If one of the above requirements is not fulfilled we move the timer
+ * to the current CPU or leave it on the previously assigned CPU if
+ * the timer callback is currently running.
*/
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
int pinned)
{
- struct hrtimer_cpu_base *new_cpu_base, *this_base;
+ struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
struct hrtimer_clock_base *new_base;
int basenum = base->index;
- this_base = this_cpu_ptr(&hrtimer_bases);
- new_cpu_base = get_target_base(this_base, pinned);
+ this_cpu_base = this_cpu_ptr(&hrtimer_bases);
+ new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
new_base = &new_cpu_base->clock_base[basenum];
@@ -229,19 +238,19 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_base &&
+ if (new_cpu_base != this_cpu_base &&
hrtimer_check_target(timer, new_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
- new_cpu_base = this_base;
+ new_cpu_base = this_cpu_base;
timer->base = base;
goto again;
}
timer->base = new_base;
} else {
- if (new_cpu_base != this_base &&
+ if (new_cpu_base != this_cpu_base &&
hrtimer_check_target(timer, new_base)) {
- new_cpu_base = this_base;
+ new_cpu_base = this_cpu_base;
goto again;
}
}
@@ -679,14 +688,14 @@ static void retrigger_next_event(void *arg)
/*
* Switch to high resolution mode
*/
-static int hrtimer_switch_to_hres(void)
+static void hrtimer_switch_to_hres(void)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
printk(KERN_WARNING "Could not switch to high resolution "
"mode on CPU %d\n", base->cpu);
- return 0;
+ return;
}
base->hres_active = 1;
hrtimer_resolution = HIGH_RES_NSEC;
@@ -694,7 +703,6 @@ static int hrtimer_switch_to_hres(void)
tick_setup_sched_timer();
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
- return 1;
}
static void clock_was_set_work(struct work_struct *work)
@@ -718,7 +726,7 @@ void clock_was_set_delayed(void)
static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
-static inline int hrtimer_switch_to_hres(void) { return 0; }
+static inline void hrtimer_switch_to_hres(void) { }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_reprogram(struct hrtimer *timer,
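From an hrtimer user's point of view, the reworked base-switching logic above only changes where a non-pinned timer may fire: if migration is enabled and the callback is not running, a relative timer can be moved to the power-optimized target, while a pinned one stays put. A small usage sketch (callback and period are illustrative):

static struct hrtimer sample_timer;

static enum hrtimer_restart sample_timer_fn(struct hrtimer *t)
{
	hrtimer_forward_now(t, ms_to_ktime(10));
	return HRTIMER_RESTART;
}

static void start_sample_timer(void)
{
	hrtimer_init(&sample_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	sample_timer.function = sample_timer_fn;

	/* may be migrated away from an idle CPU by switch_hrtimer_base();
	 * use HRTIMER_MODE_REL_PINNED instead to keep it on this CPU */
	hrtimer_start(&sample_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}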
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index fb4d98c7fd43..df68cb875248 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -487,6 +487,11 @@ out:
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock(struct timespec now)
+{
+ return -ENODEV;
+}
+
int __weak update_persistent_clock64(struct timespec64 now64)
{
struct timespec now;
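The new __weak update_persistent_clock() stub gives architectures that only implement the 64-bit variant an -ENODEV fallback instead of forcing each of them to carry a dummy. The mechanism is the usual weak/strong symbol override; a generic illustration with hypothetical names (do_rtc_write(), my_rtc_set_mmss()):

/* generic code: weak default, used only when nothing overrides it */
int __weak do_rtc_write(struct timespec now)
{
	return -ENODEV;		/* no persistent clock on this platform */
}

/* arch code (a separate object file): a non-weak definition of the same
 * symbol takes precedence at link time */
int do_rtc_write(struct timespec now)
{
	return my_rtc_set_mmss(now.tv_sec);	/* hypothetical RTC hook */
}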
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 3e7db49a2381..53d7184da0be 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -18,30 +18,23 @@
static struct hrtimer bctimer;
-static void bc_set_mode(enum clock_event_mode mode,
- struct clock_event_device *bc)
+static int bc_shutdown(struct clock_event_device *evt)
{
- switch (mode) {
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- /*
- * Note, we cannot cancel the timer here as we might
- * run into the following live lock scenario:
- *
- * cpu 0 cpu1
- * lock(broadcast_lock);
- * hrtimer_interrupt()
- * bc_handler()
- * tick_handle_oneshot_broadcast();
- * lock(broadcast_lock);
- * hrtimer_cancel()
- * wait_for_callback()
- */
- hrtimer_try_to_cancel(&bctimer);
- break;
- default:
- break;
- }
+ /*
+ * Note, we cannot cancel the timer here as we might
+ * run into the following live lock scenario:
+ *
+ * cpu 0 cpu1
+ * lock(broadcast_lock);
+ * hrtimer_interrupt()
+ * bc_handler()
+ * tick_handle_oneshot_broadcast();
+ * lock(broadcast_lock);
+ * hrtimer_cancel()
+ * wait_for_callback()
+ */
+ hrtimer_try_to_cancel(&bctimer);
+ return 0;
}
/*
@@ -82,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
}
static struct clock_event_device ce_broadcast_hrtimer = {
- .set_mode = bc_set_mode,
+ .set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
CLOCK_EVT_FEAT_KTIME |
@@ -102,13 +95,11 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- switch (ce_broadcast_hrtimer.mode) {
- case CLOCK_EVT_MODE_ONESHOT:
+ if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
return HRTIMER_RESTART;
- default:
- return HRTIMER_NORESTART;
- }
+
+ return HRTIMER_NORESTART;
}
void tick_setup_hrtimer_broadcast(void)
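The broadcast pseudo clockevent is converted from the legacy set_mode() multiplexer to per-state callbacks; only the shutdown transition needs any work here, so only .set_state_shutdown is provided. A sketch of how a device is declared in the new style (names and the second callback are illustrative):

static int my_ce_shutdown(struct clock_event_device *evt)
{
	/* quiesce the hardware or software timer backing this device */
	return 0;
}

static int my_ce_set_next_event(unsigned long delta,
				struct clock_event_device *evt)
{
	/* program an event @delta clock cycles from now */
	return 0;
}

static struct clock_event_device my_clockevent = {
	.name			= "my-ce",
	.features		= CLOCK_EVT_FEAT_ONESHOT,
	.set_state_shutdown	= my_ce_shutdown,
	.set_next_event		= my_ce_set_next_event,
};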
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f8bf47571dda..d11c55b6ab7d 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -304,9 +304,6 @@ void tick_check_new_device(struct clock_event_device *newdev)
int cpu;
cpu = smp_processor_id();
- if (!cpumask_test_cpu(cpu, newdev->cpumask))
- goto out_bc;
-
td = &per_cpu(tick_cpu_device, cpu);
curdev = td->evtdev;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c792429e98c6..3319e16f31e5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -197,27 +197,9 @@ static bool can_stop_full_tick(void)
return true;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
-
-/*
- * Re-evaluate the need for the tick on the current CPU
- * and restart it if necessary.
- */
-void __tick_nohz_full_check(void)
-{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
-
- if (tick_nohz_full_cpu(smp_processor_id())) {
- if (ts->tick_stopped && !is_idle_task(current)) {
- if (!can_stop_full_tick())
- tick_nohz_restart_sched_tick(ts, ktime_get());
- }
- }
-}
-
static void nohz_full_kick_work_func(struct irq_work *work)
{
- __tick_nohz_full_check();
+ /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -252,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu)
static void nohz_full_kick_ipi(void *info)
{
- __tick_nohz_full_check();
+ /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
/*
@@ -276,7 +258,7 @@ void tick_nohz_full_kick_all(void)
* It might need the tick due to per task/process properties:
* perf events, posix cpu timers, ...
*/
-void __tick_nohz_task_switch(struct task_struct *tsk)
+void __tick_nohz_task_switch(void)
{
unsigned long flags;
@@ -705,21 +687,38 @@ out:
return tick;
}
-static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+{
+ /* Update jiffies first */
+ tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
+
+ calc_load_exit_idle();
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
+
+ tick_nohz_restart(ts, now);
+}
+
+static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
#ifdef CONFIG_NO_HZ_FULL
int cpu = smp_processor_id();
- if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+ if (!tick_nohz_full_cpu(cpu))
return;
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (!can_stop_full_tick())
- return;
-
- tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ if (can_stop_full_tick())
+ tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ else if (ts->tick_stopped)
+ tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
}
@@ -849,7 +848,7 @@ void tick_nohz_irq_exit(void)
if (ts->inidle)
__tick_nohz_idle_enter(ts);
else
- tick_nohz_full_stop_tick(ts);
+ tick_nohz_full_update_tick(ts);
}
/**
@@ -864,23 +863,6 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
-{
- /* Update jiffies first */
- tick_do_update_jiffies64(now);
- update_cpu_load_nohz();
-
- calc_load_exit_idle();
- touch_softlockup_watchdog();
- /*
- * Cancel the scheduled timer and restore the tick
- */
- ts->tick_stopped = 0;
- ts->idle_exittime = now;
-
- tick_nohz_restart(ts, now);
-}
-
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 85d5bb1d67eb..86751c68e08d 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -268,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs);
unsigned int jiffies_to_usecs(const unsigned long j)
{
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ /*
+	 * HZ usually doesn't go much beyond MSEC_PER_SEC.
+ * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
+ */
+ BUILD_BUG_ON(HZ > USEC_PER_SEC);
+
+#if !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
@@ -287,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs);
* @t: Timespec
* @gran: Granularity in ns.
*
- * Truncate a timespec to a granularity. gran must be smaller than a second.
- * Always rounds down.
- *
- * This function should be only used for timestamps returned by
- * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
- * it doesn't handle the better resolution of the latter.
+ * Truncate a timespec to a granularity. Always rounds down. gran must
+ * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
*/
struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
- /*
- * Division is pretty slow so avoid it for common cases.
- * Currently current_kernel_time() never returns better than
- * jiffies resolution. Exploit that.
- */
- if (gran <= jiffies_to_usecs(1) * 1000) {
+ /* Avoid division in the common cases 1 ns and 1 s. */
+ if (gran == 1) {
/* nothing */
- } else if (gran == 1000000000) {
+ } else if (gran == NSEC_PER_SEC) {
t.tv_nsec = 0;
- } else {
+ } else if (gran > 1 && gran < NSEC_PER_SEC) {
t.tv_nsec -= t.tv_nsec % gran;
+ } else {
+ WARN(1, "illegal file time granularity: %u", gran);
}
return t;
}
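The rewritten timespec_trunc() now distinguishes exactly three legal cases: gran == 1 is a no-op, gran == NSEC_PER_SEC clears the nanoseconds, and anything strictly in between rounds tv_nsec down to a multiple of gran; everything else triggers the WARN. A stand-alone userspace demonstration of the rounding rule (not kernel code):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { long tv_sec; long tv_nsec; };

/* same rounding rule as the 1 < gran < NSEC_PER_SEC branch above */
static struct ts trunc_ts(struct ts t, long gran)
{
	t.tv_nsec -= t.tv_nsec % gran;
	return t;
}

int main(void)
{
	struct ts t = { 5, 123456789 };

	t = trunc_ts(t, 1000000);	/* 1 ms granularity */
	printf("%ld.%09ld\n", t.tv_sec, t.tv_nsec);	/* 5.123000000 */
	return 0;
}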
@@ -546,7 +544,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
* value to a scaled second value.
*/
static unsigned long
-__timespec_to_jiffies(unsigned long sec, long nsec)
+__timespec64_to_jiffies(u64 sec, long nsec)
{
nsec = nsec + TICK_NSEC - 1;
@@ -554,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec)
sec = MAX_SEC_IN_JIFFIES;
nsec = 0;
}
- return (((u64)sec * SEC_CONVERSION) +
+ return ((sec * SEC_CONVERSION) +
(((u64)nsec * NSEC_CONVERSION) >>
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
-unsigned long
-timespec_to_jiffies(const struct timespec *value)
+static unsigned long
+__timespec_to_jiffies(unsigned long sec, long nsec)
{
- return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
+ return __timespec64_to_jiffies((u64)sec, nsec);
}
-EXPORT_SYMBOL(timespec_to_jiffies);
+unsigned long
+timespec64_to_jiffies(const struct timespec64 *value)
+{
+ return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
+}
+EXPORT_SYMBOL(timespec64_to_jiffies);
void
-jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
/*
* Convert jiffies to nanoseconds and separate with
@@ -580,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
NSEC_PER_SEC, &rem);
value->tv_nsec = rem;
}
-EXPORT_SYMBOL(jiffies_to_timespec);
+EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* We could use a similar algorithm to timespec_to_jiffies (with a
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index bca3667a2de1..f6ee2e6b6f5d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -911,6 +911,7 @@ int do_settimeofday64(const struct timespec64 *ts)
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts_delta, xt;
unsigned long flags;
+ int ret = 0;
if (!timespec64_valid_strict(ts))
return -EINVAL;
@@ -924,10 +925,15 @@ int do_settimeofday64(const struct timespec64 *ts)
ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+ if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
tk_set_xtime(tk, ts);
-
+out:
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
@@ -936,7 +942,7 @@ int do_settimeofday64(const struct timespec64 *ts)
/* signal hrtimers about time change */
clock_was_set();
- return 0;
+ return ret;
}
EXPORT_SYMBOL(do_settimeofday64);
@@ -965,7 +971,8 @@ int timekeeping_inject_offset(struct timespec *ts)
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tk), ts64);
- if (!timespec64_valid_strict(&tmp)) {
+ if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
+ !timespec64_valid_strict(&tmp)) {
ret = -EINVAL;
goto error;
}
@@ -1874,7 +1881,7 @@ struct timespec __current_kernel_time(void)
return timespec64_to_timespec(tk_xtime(tk));
}
-struct timespec current_kernel_time(void)
+struct timespec64 current_kernel_time64(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now;
@@ -1886,9 +1893,9 @@ struct timespec current_kernel_time(void)
now = tk_xtime(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
- return timespec64_to_timespec(now);
+ return now;
}
-EXPORT_SYMBOL(current_kernel_time);
+EXPORT_SYMBOL(current_kernel_time64);
struct timespec64 get_monotonic_coarse64(void)
{
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 5e097fa9faf7..84190f02b521 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -807,8 +807,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
spin_unlock(&base->lock);
base = new_base;
spin_lock(&base->lock);
- timer->flags &= ~TIMER_BASEMASK;
- timer->flags |= base->cpu;
+ WRITE_ONCE(timer->flags,
+ (timer->flags & ~TIMER_BASEMASK) | base->cpu);
}
}
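Publishing the recomputed flags with WRITE_ONCE() matters because timer->flags is read locklessly (to find the timer's base) while __mod_timer() rewrites it; two separate stores would let such a reader observe the state between clearing the base bits and setting the new ones. A generic illustration of the pattern (the struct and mask names are stand-ins, not the timer code):

struct obj { unsigned int flags; };
#define BASE_MASK	0x003fu

static void set_base_racy(struct obj *o, unsigned int base)
{
	o->flags &= ~BASE_MASK;	/* a lockless reader can see this state... */
	o->flags |= base;	/* ...before the new base has been filled in */
}

static void set_base_safe(struct obj *o, unsigned int base)
{
	/* compute the final value first, then publish it with one store;
	 * WRITE_ONCE() also keeps the compiler from tearing it */
	WRITE_ONCE(o->flags, (o->flags & ~BASE_MASK) | base);
}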
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a4536e1e3e2a..129c96033e46 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -137,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
(unsigned long long) ktime_to_ns(base->offset));
#endif
SEQ_printf(m, "active timers:\n");
- print_active_timers(m, base, now);
+ print_active_timers(m, base, now + ktime_to_ns(base->offset));
}
static void print_cpu(struct seq_file *m, int cpu, u64 now)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3b9a48ae153a..1153c43428f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@ config UPROBE_EVENT
config BPF_EVENTS
depends on BPF_SYSCALL
- depends on KPROBE_EVENT
+ depends on KPROBE_EVENT || UPROBE_EVENT
bool
default y
help
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3e6b39b6cf9..90e72a0c3047 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
if (likely(!bt))
return;
- if (!error && !bio_flagged(bio, BIO_UPTODATE))
- error = EIO;
-
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, what, error, 0, NULL);
}
@@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- !bio_flagged(bio, BIO_UPTODATE),
- sizeof(rpdu), &rpdu);
+ bio->bi_error, sizeof(rpdu), &rpdu);
}
}
@@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, BLK_TA_REMAP,
- !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+ bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ sizeof(r), &r);
}
/**
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..0fe96c7c8803 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
/*
* limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
*/
static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
{
char *fmt = (char *) (long) r1;
+ bool str_seen = false;
int mod[3] = {};
int fmt_cnt = 0;
+ u64 unsafe_addr;
+ char buf[64];
int i;
/*
@@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
if (fmt[i] == 'l') {
mod[fmt_cnt]++;
i++;
- } else if (fmt[i] == 'p') {
+ } else if (fmt[i] == 'p' || fmt[i] == 's') {
mod[fmt_cnt]++;
i++;
if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
return -EINVAL;
fmt_cnt++;
+ if (fmt[i - 1] == 's') {
+ if (str_seen)
+ /* allow only one '%s' per fmt string */
+ return -EINVAL;
+ str_seen = true;
+
+ switch (fmt_cnt) {
+ case 1:
+ unsafe_addr = r3;
+ r3 = (long) buf;
+ break;
+ case 2:
+ unsafe_addr = r4;
+ r4 = (long) buf;
+ break;
+ case 3:
+ unsafe_addr = r5;
+ r5 = (long) buf;
+ break;
+ }
+ buf[0] = 0;
+ strncpy_from_unsafe(buf,
+ (void *) (long) unsafe_addr,
+ sizeof(buf));
+ }
continue;
}
@@ -158,6 +186,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+
+ if (unlikely(index >= array->map.max_entries))
+ return -E2BIG;
+
+ event = (struct perf_event *)array->ptrs[index];
+ if (!event)
+ return -ENOENT;
+
+ /*
+	 * The return value alone does not tell whether the read succeeded;
+	 * callers such as the eBPF program have to judge that from context.
+ */
+ return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+ .func = bpf_perf_event_read,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -183,6 +240,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
default:
return NULL;
}
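With BPF_FUNC_perf_event_read wired into the kprobe program type, an eBPF program can read a counter that user space has stored, per CPU, in a BPF_MAP_TYPE_PERF_EVENT_ARRAY slot. A rough sketch in the style of the samples/bpf programs, which supply SEC(), struct bpf_map_def and the helper prototypes via their bpf_helpers.h; the map, section and program names here are illustrative:

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") cycle_counters = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(u32),
	.max_entries	= 64,		/* >= number of possible CPUs */
};

SEC("kprobe/sys_write")
int read_counter(struct pt_regs *ctx)
{
	char fmt[] = "cpu %d cycles %llu\n";
	u32 cpu = bpf_get_smp_processor_id();
	u64 count = bpf_perf_event_read(&cycle_counters, cpu);

	if ((s64)count < 0)
		return 0;	/* -E2BIG, -ENOENT, ... from the helper */
	bpf_trace_printk(fmt, sizeof(fmt), cpu, count);
	return 0;
}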
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
void *addr, void *dest)
{
- long ret;
int maxlen = get_rloc_len(*(u32 *)dest);
u8 *dst = get_rloc_data(dest);
- u8 *src = addr;
- mm_segment_t old_fs = get_fs();
+ long ret;
if (!maxlen)
return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
* Try to get string again, since the string can be changed while
* probing.
*/
- set_fs(KERNEL_DS);
- pagefault_disable();
-
- do
- ret = __copy_from_user_inatomic(dst++, src++, 1);
- while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
-
- dst[-1] = '\0';
- pagefault_enable();
- set_fs(old_fs);
+ ret = strncpy_from_unsafe(dst, addr, maxlen);
if (ret < 0) { /* Failed to fetch string */
- ((u8 *)get_rloc_data(dest))[0] = '\0';
+ dst[0] = '\0';
*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
} else {
- *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
- get_rloc_offs(*(u32 *)dest));
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
}
}
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
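The kprobe string fetcher (and the %s support added to bpf_trace_printk() earlier in this series) both lean on strncpy_from_unsafe(), which copies a NUL-terminated string from a possibly faulting kernel address with page faults disabled. Its assumed contract, summarized as a prototype sketch rather than the exact implementation:

/*
 * Copy at most @count bytes of the NUL-terminated string at
 * @unsafe_addr into @dst, with page faults disabled.
 * Returns the number of bytes copied including the trailing NUL on
 * success, or -EFAULT if the source could not be read.
 */
long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);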
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 419ca37e72c9..f270088e9929 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
}
static void
-probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
+probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
if (unlikely(!sched_ref))
return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9b33dd117f3f..12cbe77b4136 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr)
}
static void
-probe_wakeup(void *ignore, struct task_struct *p, int success)
+probe_wakeup(void *ignore, struct task_struct *p)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index aa1ea7b36fa8..d2f6d0be3503 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v)
seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
trace_event_name(&tu->tp.call));
- seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
+ seq_printf(m, " %s:", tu->filename);
+
+ /* Don't print "0x (null)" when offset is 0 */
+ if (tu->offset) {
+ seq_printf(m, "0x%p", (void *)tu->offset);
+ } else {
+ switch (sizeof(void *)) {
+ case 4:
+ seq_printf(m, "0x00000000");
+ break;
+ case 8:
+ default:
+ seq_printf(m, "0x0000000000000000");
+ break;
+ }
+ }
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
@@ -1095,11 +1110,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
{
struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
+ struct bpf_prog *prog = call->prog;
struct hlist_head *head;
void *data;
int size, esize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
@@ -1289,6 +1308,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
return -ENODEV;
}
+ call->flags = TRACE_EVENT_FL_UPROBE;
call->class->reg = trace_uprobe_register;
call->data = tu;
ret = trace_add_event_call(call);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4109f8320684..f65a0a06a8c0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -976,8 +976,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
if (user_ns == current_user_ns())
return -EINVAL;
- /* Threaded processes may not enter a different user namespace */
- if (atomic_read(&current->mm->mm_users) > 1)
+ /* Tasks that share a thread group must share a user namespace */
+ if (!thread_group_empty(current))
return -EINVAL;
if (current->fs->users != 1)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c4f06176f74..ca71582fcfab 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
#include <trace/events/workqueue.h>
#define assert_rcu_or_pool_mutex() \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&wq_pool_mutex), \
- "sched RCU or wq_pool_mutex should be held")
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU or wq_pool_mutex should be held")
#define assert_rcu_or_wq_mutex(wq) \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&wq->mutex), \
- "sched RCU or wq->mutex should be held")
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq->mutex), \
+ "sched RCU or wq->mutex should be held")
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&wq->mutex) || \
- lockdep_is_held(&wq_pool_mutex), \
- "sched RCU, wq->mutex or wq_pool_mutex should be held")
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq->mutex) && \
+ !lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU, wq->mutex or wq_pool_mutex should be held")
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool)
goto fail;
set_user_nice(worker->task, pool->attrs->nice);
-
- /* prevent userland from meddling with cpumask of workqueue workers */
- worker->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(worker->task, pool->attrs->cpumask);
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
@@ -2614,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq)
out_unlock:
mutex_unlock(&wq->mutex);
}
-EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue);
/**
* drain_workqueue - drain a workqueue
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
}
wq->rescuer = rescuer;
- rescuer->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
}
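Switching the worker and rescuer creation paths to kthread_bind_mask() pins each thread to its cpumask before it ever runs, and lets the kthread code, rather than the workqueue code, set the flag that blocks userspace sched_setaffinity(). A hedged usage sketch (the thread function and naming are illustrative):

static int pinned_worker_fn(void *arg)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static struct task_struct *start_pinned_worker(const struct cpumask *mask)
{
	struct task_struct *t;

	t = kthread_create(pinned_worker_fn, NULL, "pinned_worker");
	if (IS_ERR(t))
		return t;

	/* must happen before the first wakeup; also sets PF_NO_SETAFFINITY */
	kthread_bind_mask(t, mask);
	wake_up_process(t);
	return t;
}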