summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c14
-rw-r--r--kernel/cgroup_freezer.c116
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/debug/debug_core.c4
-rw-r--r--kernel/events/core.c287
-rw-r--r--kernel/events/uprobes.c31
-rw-r--r--kernel/futex.c56
-rw-r--r--kernel/hrtimer.c9
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/kmod.c2
-rw-r--r--kernel/ksysfs.c5
-rw-r--r--kernel/locking/lockdep_internals.h6
-rw-r--r--kernel/locking/locktorture.c12
-rw-r--r--kernel/locking/rtmutex.c32
-rw-r--r--kernel/locking/rwsem-xadd.c49
-rw-r--r--kernel/power/Kconfig3
-rw-r--r--kernel/power/hibernate.c27
-rw-r--r--kernel/power/main.c33
-rw-r--r--kernel/power/power.h9
-rw-r--r--kernel/power/suspend.c111
-rw-r--r--kernel/power/suspend_test.c24
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/printk/printk.c1
-rw-r--r--kernel/rcu/rcutorture.c217
-rw-r--r--kernel/rcu/tiny_plugin.h8
-rw-r--r--kernel/rcu/tree.c331
-rw-r--r--kernel/rcu/tree.h11
-rw-r--r--kernel/rcu/tree_plugin.h144
-rw-r--r--kernel/rcu/update.c30
-rw-r--r--kernel/resource.c7
-rw-r--r--kernel/sched/core.c415
-rw-r--r--kernel/sched/cpudeadline.c37
-rw-r--r--kernel/sched/cpudeadline.h6
-rw-r--r--kernel/sched/cpupri.c16
-rw-r--r--kernel/sched/cpupri.h2
-rw-r--r--kernel/sched/cputime.c32
-rw-r--r--kernel/sched/deadline.c11
-rw-r--r--kernel/sched/fair.c242
-rw-r--r--kernel/sched/idle.c140
-rw-r--r--kernel/sched/rt.c119
-rw-r--r--kernel/sched/sched.h26
-rw-r--r--kernel/sched/stop_task.c4
-rw-r--r--kernel/sched/wait.c2
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/sysctl.c4
-rw-r--r--kernel/time/ntp.c17
-rw-r--r--kernel/time/sched_clock.c13
-rw-r--r--kernel/torture.c40
-rw-r--r--kernel/workqueue.c42
50 files changed, 1724 insertions, 1049 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..ceee0c54c6a4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -33,6 +33,7 @@
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
@@ -348,7 +349,7 @@ struct cgrp_cset_link {
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
-static struct css_set init_css_set = {
+struct css_set init_css_set = {
.refcount = ATOMIC_INIT(1),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
@@ -1495,7 +1496,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
-retry:
+
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
@@ -1503,7 +1504,7 @@ retry:
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
-
+retry:
/* look for a matching existing root */
if (!opts.subsys_mask && !opts.none && !opts.name) {
cgrp_dfl_root_visible = true;
@@ -1562,9 +1563,9 @@ retry:
if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
- kfree(opts.release_agent);
- kfree(opts.name);
msleep(10);
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
goto retry;
}
@@ -1604,7 +1605,8 @@ out_unlock:
if (ret)
return ERR_PTR(ret);
- dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
+ dentry = kernfs_mount(fs_type, flags, root->kf_root,
+ CGROUP_SUPER_MAGIC, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
return dentry;
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a2256444..345628c78b5b 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
+#include <linux/mutex.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
struct freezer {
struct cgroup_subsys_state css;
unsigned int state;
- spinlock_t lock;
};
+static DEFINE_MUTEX(freezer_mutex);
+
static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct freezer, css) : NULL;
@@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
if (!freezer)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&freezer->lock);
return &freezer->css;
}
@@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
- /*
- * The following double locking and freezing state inheritance
- * guarantee that @cgroup can never escape ancestors' freezing
- * states. See css_for_each_descendant_pre() for details.
- */
- if (parent)
- spin_lock_irq(&parent->lock);
- spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
+ mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
@@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
atomic_inc(&system_freezing_cnt);
}
- spin_unlock(&freezer->lock);
- if (parent)
- spin_unlock_irq(&parent->lock);
-
+ mutex_unlock(&freezer_mutex);
return 0;
}
@@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
freezer->state = 0;
- spin_unlock_irq(&freezer->lock);
+ mutex_unlock(&freezer_mutex);
}
static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
struct task_struct *task;
bool clear_frozen = false;
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
/*
* Make the new tasks conform to the current state of @new_css.
@@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
}
}
- spin_unlock_irq(&freezer->lock);
-
- /*
- * Propagate FROZEN clearing upwards. We may race with
- * update_if_frozen(), but as long as both work bottom-up, either
- * update_if_frozen() sees child's FROZEN cleared or we clear the
- * parent's FROZEN later. No parent w/ !FROZEN children can be
- * left FROZEN.
- */
+ /* propagate FROZEN clearing upwards */
while (clear_frozen && (freezer = parent_freezer(freezer))) {
- spin_lock_irq(&freezer->lock);
freezer->state &= ~CGROUP_FROZEN;
clear_frozen = freezer->state & CGROUP_FREEZING;
- spin_unlock_irq(&freezer->lock);
}
+
+ mutex_unlock(&freezer_mutex);
}
/**
@@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
- rcu_read_lock();
- freezer = task_freezer(task);
-
/*
* The root cgroup is non-freezable, so we can skip locking the
* freezer. This is safe regardless of race with task migration.
@@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task)
* to do. If we lost and root is the new cgroup, noop is still the
* right thing to do.
*/
- if (!parent_freezer(freezer))
- goto out;
+ if (task_css_is_root(task, freezer_cgrp_id))
+ return;
- /*
- * Grab @freezer->lock and freeze @task after verifying @task still
- * belongs to @freezer and it's freezing. The former is for the
- * case where we have raced against task migration and lost and
- * @task is already in a different cgroup which may not be frozen.
- * This isn't strictly necessary as freeze_task() is allowed to be
- * called spuriously but let's do it anyway for, if nothing else,
- * documentation.
- */
- spin_lock_irq(&freezer->lock);
- if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
- spin_unlock_irq(&freezer->lock);
-out:
+
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
/**
@@ -281,22 +255,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
struct css_task_iter it;
struct task_struct *task;
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- spin_lock_irq(&freezer->lock);
+ lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZING) ||
(freezer->state & CGROUP_FROZEN))
- goto out_unlock;
+ return;
/* are all (live) children frozen? */
+ rcu_read_lock();
css_for_each_child(pos, css) {
struct freezer *child = css_freezer(pos);
if ((child->state & CGROUP_FREEZER_ONLINE) &&
- !(child->state & CGROUP_FROZEN))
- goto out_unlock;
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
}
+ rcu_read_unlock();
/* are all tasks frozen? */
css_task_iter_start(css, &it);
@@ -317,21 +293,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
freezer->state |= CGROUP_FROZEN;
out_iter_end:
css_task_iter_end(&it);
-out_unlock:
- spin_unlock_irq(&freezer->lock);
}
static int freezer_read(struct seq_file *m, void *v)
{
struct cgroup_subsys_state *css = seq_css(m), *pos;
+ mutex_lock(&freezer_mutex);
rcu_read_lock();
/* update states bottom-up */
- css_for_each_descendant_post(pos, css)
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget(pos))
+ continue;
+ rcu_read_unlock();
+
update_if_frozen(pos);
+ rcu_read_lock();
+ css_put(pos);
+ }
+
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
@@ -373,7 +357,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
unsigned int state)
{
/* also synchronizes against task migration, see freezer_attach() */
- lockdep_assert_held(&freezer->lock);
+ lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZER_ONLINE))
return;
@@ -414,31 +398,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
* descendant will try to inherit its parent's FREEZING state as
* CGROUP_FREEZING_PARENT.
*/
+ mutex_lock(&freezer_mutex);
rcu_read_lock();
css_for_each_descendant_pre(pos, &freezer->css) {
struct freezer *pos_f = css_freezer(pos);
struct freezer *parent = parent_freezer(pos_f);
- spin_lock_irq(&pos_f->lock);
+ if (!css_tryget(pos))
+ continue;
+ rcu_read_unlock();
- if (pos_f == freezer) {
+ if (pos_f == freezer)
freezer_apply_state(pos_f, freeze,
CGROUP_FREEZING_SELF);
- } else {
- /*
- * Our update to @parent->state is already visible
- * which is all we need. No need to lock @parent.
- * For more info on synchronization, see
- * freezer_post_create().
- */
+ else
freezer_apply_state(pos_f,
parent->state & CGROUP_FREEZING,
CGROUP_FREEZING_PARENT);
- }
- spin_unlock_irq(&pos_f->lock);
+ rcu_read_lock();
+ css_put(pos);
}
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710eef0e2..247979a1b815 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -726,10 +726,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 2956c8da1605..1adf62b39b96 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -534,7 +534,7 @@ return_normal:
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
@@ -662,7 +662,7 @@ kgdb_restore:
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_dec(&masters_in_kgdb);
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f83a71a3e46d..689237a0c5e8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -39,6 +39,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/cgroup.h>
+#include <linux/module.h>
#include "internal.h"
@@ -1443,6 +1444,11 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
+struct remove_event {
+ struct perf_event *event;
+ bool detach_group;
+};
+
/*
* Cross CPU call to remove a performance event
*
@@ -1451,12 +1457,15 @@ group_sched_out(struct perf_event *group_event,
*/
static int __perf_remove_from_context(void *info)
{
- struct perf_event *event = info;
+ struct remove_event *re = info;
+ struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
+ if (re->detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
ctx->is_active = 0;
@@ -1481,10 +1490,14 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ struct remove_event re = {
+ .event = event,
+ .detach_group = detach_group,
+ };
lockdep_assert_held(&ctx->mutex);
@@ -1493,12 +1506,12 @@ static void perf_remove_from_context(struct perf_event *event)
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, event);
+ cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
}
retry:
- if (!task_function_call(task, __perf_remove_from_context, event))
+ if (!task_function_call(task, __perf_remove_from_context, &re))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -1515,6 +1528,8 @@ retry:
* Since the task isn't running, its safe to remove the event, us
* holding the ctx->lock ensures the task won't get scheduled in.
*/
+ if (detach_group)
+ perf_group_detach(event);
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
@@ -1663,6 +1678,8 @@ event_sched_in(struct perf_event *event,
u64 tstamp = perf_event_time(event);
int ret = 0;
+ lockdep_assert_held(&ctx->lock);
+
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -3178,7 +3195,8 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
-static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
@@ -3229,17 +3247,19 @@ static void __free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
+ if (event->pmu)
+ module_put(event->pmu->module);
+
call_rcu(&event->rcu_head, free_event_rcu);
}
-static void free_event(struct perf_event *event)
+
+static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
unaccount_event(event);
if (event->rb) {
- struct ring_buffer *rb;
-
/*
* Can happen when we close an event with re-directed output.
*
@@ -3247,57 +3267,38 @@ static void free_event(struct perf_event *event)
* over us; possibly making our ring_buffer_put() the last.
*/
mutex_lock(&event->mmap_mutex);
- rb = event->rb;
- if (rb) {
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
- ring_buffer_put(rb); /* could be last */
- }
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
}
if (is_cgroup_event(event))
perf_detach_cgroup(event);
-
__free_event(event);
}
-int perf_event_release_kernel(struct perf_event *event)
+/*
+ * Used to free events which have a known refcount of 1, such as in error paths
+ * where the event isn't exposed yet and inherited events.
+ */
+static void free_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
-
- WARN_ON_ONCE(ctx->parent_ctx);
- /*
- * There are two ways this annotation is useful:
- *
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
- *
- * 2) there is a lock-inversion with mmap_sem through
- * perf_event_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
- */
- mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
- raw_spin_lock_irq(&ctx->lock);
- perf_group_detach(event);
- raw_spin_unlock_irq(&ctx->lock);
- perf_remove_from_context(event);
- mutex_unlock(&ctx->mutex);
-
- free_event(event);
+ if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
+ "unexpected event refcount: %ld; ptr=%p\n",
+ atomic_long_read(&event->refcount), event)) {
+ /* leak to avoid use-after-free */
+ return;
+ }
- return 0;
+ _free_event(event);
}
-EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
* Called when the last reference to the file is gone.
*/
static void put_event(struct perf_event *event)
{
+ struct perf_event_context *ctx = event->ctx;
struct task_struct *owner;
if (!atomic_long_dec_and_test(&event->refcount))
@@ -3336,9 +3337,33 @@ static void put_event(struct perf_event *event)
put_task_struct(owner);
}
- perf_event_release_kernel(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * There are two ways this annotation is useful:
+ *
+ * 1) there is a lock recursion from perf_event_exit_task
+ * see the comment there.
+ *
+ * 2) there is a lock-inversion with mmap_sem through
+ * perf_event_read_group(), which takes faults while
+ * holding ctx->mutex, however this is called after
+ * the last filedesc died, so there is no possibility
+ * to trigger the AB-BA case.
+ */
+ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+ perf_remove_from_context(event, true);
+ mutex_unlock(&ctx->mutex);
+
+ _free_event(event);
}
+int perf_event_release_kernel(struct perf_event *event)
+{
+ put_event(event);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
static int perf_release(struct inode *inode, struct file *file)
{
put_event(file->private_data);
@@ -3839,28 +3864,47 @@ unlock:
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb)
{
+ struct ring_buffer *old_rb = NULL;
unsigned long flags;
- if (!list_empty(&event->rb_entry))
- return;
+ if (event->rb) {
+ /*
+ * Should be impossible, we set this when removing
+ * event->rb_entry and wait/clear when adding event->rb_entry.
+ */
+ WARN_ON_ONCE(event->rcu_pending);
- spin_lock_irqsave(&rb->event_lock, flags);
- if (list_empty(&event->rb_entry))
- list_add(&event->rb_entry, &rb->event_list);
- spin_unlock_irqrestore(&rb->event_lock, flags);
-}
+ old_rb = event->rb;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
-static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
-{
- unsigned long flags;
+ spin_lock_irqsave(&old_rb->event_lock, flags);
+ list_del_rcu(&event->rb_entry);
+ spin_unlock_irqrestore(&old_rb->event_lock, flags);
+ }
- if (list_empty(&event->rb_entry))
- return;
+ if (event->rcu_pending && rb) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
+
+ if (rb) {
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_add_rcu(&event->rb_entry, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+ }
+
+ rcu_assign_pointer(event->rb, rb);
- spin_lock_irqsave(&rb->event_lock, flags);
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
}
static void ring_buffer_wakeup(struct perf_event *event)
@@ -3929,7 +3973,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
@@ -3937,18 +3981,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
atomic_dec(&rb->mmap_count);
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
- return;
+ goto out_put;
- /* Detach current event from the buffer. */
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count)) {
- ring_buffer_put(rb); /* can't be last */
- return;
- }
+ if (atomic_read(&rb->mmap_count))
+ goto out_put;
/*
* No other mmap()s, detach from all other events that might redirect
@@ -3978,11 +4018,9 @@ again:
* still restart the iteration to make sure we're not now
* iterating the wrong list.
*/
- if (event->rb == rb) {
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
- ring_buffer_put(rb); /* can't be last, we still have one */
- }
+ if (event->rb == rb)
+ ring_buffer_attach(event, NULL);
+
mutex_unlock(&event->mmap_mutex);
put_event(event);
@@ -4007,6 +4045,7 @@ again:
vma->vm_mm->pinned_vm -= mmap_locked;
free_uid(mmap_user);
+out_put:
ring_buffer_put(rb); /* could be last */
}
@@ -4124,7 +4163,6 @@ again:
vma->vm_mm->pinned_vm += extra;
ring_buffer_attach(event, rb);
- rcu_assign_pointer(event->rb, rb);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
@@ -5408,6 +5446,9 @@ struct swevent_htable {
/* Recursion avoidance in each contexts */
int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5654,8 +5695,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
- if (WARN_ON_ONCE(!head))
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
return -EINVAL;
+ }
hlist_add_head_rcu(&event->hlist_entry, head);
@@ -6551,6 +6598,7 @@ free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
}
+EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
@@ -6572,6 +6620,7 @@ void perf_pmu_unregister(struct pmu *pmu)
put_device(pmu->dev);
free_pmu_context(pmu);
}
+EXPORT_SYMBOL_GPL(perf_pmu_unregister);
struct pmu *perf_init_event(struct perf_event *event)
{
@@ -6585,6 +6634,10 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (ret)
@@ -6593,6 +6646,10 @@ struct pmu *perf_init_event(struct perf_event *event)
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (!try_module_get(pmu->module)) {
+ pmu = ERR_PTR(-ENODEV);
+ goto unlock;
+ }
event->pmu = pmu;
ret = pmu->event_init(event);
if (!ret)
@@ -6771,6 +6828,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
err_pmu:
if (event->destroy)
event->destroy(event);
+ module_put(pmu->module);
err_ns:
if (event->ns)
put_pid_ns(event->ns);
@@ -6914,7 +6972,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL, *old_rb = NULL;
+ struct ring_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -6942,8 +7000,6 @@ set:
if (atomic_read(&event->mmap_count))
goto unlock;
- old_rb = event->rb;
-
if (output_event) {
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
@@ -6951,23 +7007,7 @@ set:
goto unlock;
}
- if (old_rb)
- ring_buffer_detach(event, old_rb);
-
- if (rb)
- ring_buffer_attach(event, rb);
-
- rcu_assign_pointer(event->rb, rb);
-
- if (old_rb) {
- ring_buffer_put(old_rb);
- /*
- * Since we detached before setting the new rb, so that we
- * could attach the new rb, we could have missed a wakeup.
- * Provide it now.
- */
- wake_up_all(&event->waitq);
- }
+ ring_buffer_attach(event, rb);
ret = 0;
unlock:
@@ -7018,6 +7058,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
}
/*
@@ -7055,20 +7098,26 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (task && group_leader &&
+ group_leader->attr.inherit != attr.inherit) {
+ err = -EINVAL;
+ goto err_task;
+ }
+
get_online_cpus();
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_task;
+ goto err_cpus;
}
if (flags & PERF_FLAG_PID_CGROUP) {
err = perf_cgroup_connect(pid, event, &attr, group_leader);
if (err) {
__free_event(event);
- goto err_task;
+ goto err_cpus;
}
}
@@ -7165,7 +7214,7 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
- perf_remove_from_context(group_leader);
+ perf_remove_from_context(group_leader, false);
/*
* Removing from the context ends up with disabled
@@ -7175,7 +7224,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling);
+ perf_remove_from_context(sibling, false);
perf_event__state_init(sibling);
put_ctx(gctx);
}
@@ -7230,8 +7279,9 @@ err_context:
put_ctx(ctx);
err_alloc:
free_event(event);
-err_task:
+err_cpus:
put_online_cpus();
+err_task:
if (task)
put_task_struct(task);
err_group_fd:
@@ -7305,7 +7355,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
mutex_lock(&src_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event);
+ perf_remove_from_context(event, false);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
@@ -7367,13 +7417,7 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- if (child_event->parent) {
- raw_spin_lock_irq(&child_ctx->lock);
- perf_group_detach(child_event);
- raw_spin_unlock_irq(&child_ctx->lock);
- }
-
- perf_remove_from_context(child_event);
+ perf_remove_from_context(child_event, true);
/*
* It can happen that the parent exits first, and has events
@@ -7388,7 +7432,7 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event, *tmp;
+ struct perf_event *child_event;
struct perf_event_context *child_ctx;
unsigned long flags;
@@ -7442,24 +7486,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
mutex_lock(&child_ctx->mutex);
-again:
- list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
- group_entry)
+ list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry)
__perf_event_exit_task(child_event, child_ctx, child);
- list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
- group_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
-
- /*
- * If the last event was a group event, it will have appended all
- * its siblings to the list, but we obtained 'tmp' before that which
- * will still point to the list head terminating the iteration.
- */
- if (!list_empty(&child_ctx->pinned_groups) ||
- !list_empty(&child_ctx->flexible_groups))
- goto again;
-
mutex_unlock(&child_ctx->mutex);
put_ctx(child_ctx);
@@ -7724,6 +7753,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
* swapped under us.
*/
parent_ctx = perf_pin_task_context(parent, ctxn);
+ if (!parent_ctx)
+ return 0;
/*
* No need to check if parent_ctx != NULL here; since we saw
@@ -7835,6 +7866,7 @@ static void perf_event_init_cpu(int cpu)
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
if (swhash->hlist_refcount > 0) {
struct swevent_hlist *hlist;
@@ -7857,14 +7889,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
+ struct remove_event re = { .detach_group = false };
struct perf_event_context *ctx = __info;
- struct perf_event *event;
perf_pmu_rotate_stop(ctx->pmu);
rcu_read_lock();
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
- __perf_remove_from_context(event);
+ list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(&re);
rcu_read_unlock();
}
@@ -7892,6 +7924,7 @@ static void perf_event_exit_cpu(int cpu)
perf_event_exit_cpu_context(cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 04709b66369d..d1edc5e6fd03 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -60,8 +60,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
-/* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP 1
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
@@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->offset = offset;
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
- /* For now assume that the instruction need not be single-stepped */
- __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
-
/* a uprobe exists for this inode:offset combination */
if (cur_uprobe) {
kfree(uprobe);
@@ -1628,20 +1623,6 @@ bool uprobe_deny_signal(void)
return true;
}
-/*
- * Avoid singlestepping the original instruction if the original instruction
- * is a NOP or can be emulated.
- */
-static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
-{
- if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
- if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
- return true;
- clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
- }
- return false;
-}
-
static void mmf_recalc_uprobes(struct mm_struct *mm)
{
struct vm_area_struct *vma;
@@ -1868,13 +1849,13 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
- if (can_skip_sstep(uprobe, regs))
+ if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
- /* can_skip_sstep() succeeded, or restart if can't singlestep */
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
@@ -1886,10 +1867,11 @@ out:
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
struct uprobe *uprobe;
+ int err = 0;
uprobe = utask->active_uprobe;
if (utask->state == UTASK_SSTEP_ACK)
- arch_uprobe_post_xol(&uprobe->arch, regs);
+ err = arch_uprobe_post_xol(&uprobe->arch, regs);
else if (utask->state == UTASK_SSTEP_TRAPPED)
arch_uprobe_abort_xol(&uprobe->arch, regs);
else
@@ -1903,6 +1885,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
spin_lock_irq(&current->sighand->siglock);
recalc_sigpending(); /* see uprobe_deny_signal() */
spin_unlock_irq(&current->sighand->siglock);
+
+ if (unlikely(err)) {
+ uprobe_warn(current, "execute the probed insn, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
+ }
}
/*
diff --git a/kernel/futex.c b/kernel/futex.c
index 5f589279e462..89bc9d59ac65 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key)
* get_futex_key() implies a full barrier. This is relied upon
* as full barrier (B), see the ordering comment above.
*/
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
}
/*
@@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
/*
* Full barrier (A), see the ordering comment above.
*/
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
#endif
}
@@ -745,7 +745,8 @@ void exit_pi_state_list(struct task_struct *curr)
static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+ union futex_key *key, struct futex_pi_state **ps,
+ struct task_struct *task)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
@@ -786,6 +787,16 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return -EINVAL;
}
+ /*
+ * Protect against a corrupted uval. If uval
+ * is 0x80000000 then pid is 0 and the waiter
+ * bit is set. So the deadlock check in the
+ * calling code has failed and we did not fall
+ * into the check above due to !pid.
+ */
+ if (task && pi_state->owner == task)
+ return -EDEADLK;
+
atomic_inc(&pi_state->refcount);
*ps = pi_state;
@@ -803,6 +814,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!p)
return -ESRCH;
+ if (!p->mm) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
/*
* We need to look at the task state flags to figure out,
* whether the task is exiting. To protect against the do_exit
@@ -935,7 +951,7 @@ retry:
* We dont have the lock. Look up the PI state (or create it if
* we are the first waiter):
*/
- ret = lookup_pi_state(uval, hb, key, ps);
+ ret = lookup_pi_state(uval, hb, key, ps, task);
if (unlikely(ret)) {
switch (ret) {
@@ -1347,7 +1363,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
*
* Return:
* 0 - failed to acquire the lock atomically;
- * 1 - acquired the lock;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1358,7 +1374,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret;
+ int ret, vpid;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -1386,11 +1402,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* the contended case or if set_waiters is 1. The pi_state is returned
* in ps in contended cases.
*/
+ vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
- if (ret == 1)
+ if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+ return vpid;
+ }
return ret;
}
@@ -1421,7 +1439,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
- u32 curval2;
if (requeue_pi) {
/*
@@ -1509,16 +1526,25 @@ retry_private:
* At this point the top_waiter has either taken uaddr2 or is
* waiting on it. If the former, then the pi_state will not
* exist yet, look it up one more time to ensure we have a
- * reference to it.
+ * reference to it. If the lock was taken, ret contains the
+ * vpid of the top waiter task.
*/
- if (ret == 1) {
+ if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
- ret = get_futex_value_locked(&curval2, uaddr2);
- if (!ret)
- ret = lookup_pi_state(curval2, hb2, &key2,
- &pi_state);
+ /*
+ * If we acquired the lock, then the user
+ * space value of uaddr2 should be vpid. It
+ * cannot be changed by the top waiter as it
+ * is blocked on hb2 lock if it tries to do
+ * so. If something fiddled with it behind our
+ * back the pi state lookup might unearth
+ * it. So we rather use the known value than
+ * rereading and handing potential crap to
+ * lookup_pi_state.
+ */
+ ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL);
}
switch (ret) {
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6b715c0af1b1..3ab28993f6e0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -990,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Remove an active timer from the queue: */
ret = remove_hrtimer(timer, base);
- /* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
-
if (mode & HRTIMER_MODE_REL) {
- tim = ktime_add_safe(tim, new_base->get_time());
+ tim = ktime_add_safe(tim, base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
* to signal that they simply return xtime in
@@ -1009,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+ /* Switch the timer base, if necessary: */
+ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+
timer_stats_hrtimer_set_start_info(timer);
leftmost = enqueue_hrtimer(timer, new_base);
@@ -1039,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
return ret;
}
+EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad203bc..28c57069ef68 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1683,6 +1683,14 @@ int kernel_kexec(void)
kexec_in_progress = true;
kernel_restart_prepare(NULL);
migrate_to_reboot_cpu();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
printk(KERN_EMERG "Starting new kernel\n");
machine_shutdown();
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6b375af4958d..0ac67a5861c5 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -498,7 +498,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth)
static void helper_lock(void)
{
atomic_inc(&running_helpers);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
}
static void helper_unlock(void)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 2495a9b14ac8..6683ccef9fff 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(uevent_seqnum);
+#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
return count;
}
KERNEL_ATTR_RW(uevent_helper);
-
+#endif
#ifdef CONFIG_PROFILING
static ssize_t profiling_show(struct kobject *kobj,
@@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
+#endif
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8f..51c4b24b6328 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
* table (if it's not there yet), and we check it for lock order
* conflicts and deadlocks.
*/
-#define MAX_LOCKDEP_ENTRIES 16384UL
+#define MAX_LOCKDEP_ENTRIES 32768UL
-#define MAX_LOCKDEP_CHAINS_BITS 15
+#define MAX_LOCKDEP_CHAINS_BITS 16
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
@@ -65,7 +65,7 @@ enum {
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the hash_lock.
*/
-#define MAX_STACK_TRACE_ENTRIES 262144UL
+#define MAX_STACK_TRACE_ENTRIES 524288UL
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f26b1a18e34e..0955b885d0dc 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -82,14 +82,14 @@ struct lock_writer_stress_stats {
};
static struct lock_writer_stress_stats *lwsa;
-#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE)
+#if defined(MODULE)
#define LOCKTORTURE_RUNNABLE_INIT 1
#else
#define LOCKTORTURE_RUNNABLE_INIT 0
#endif
int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
module_param(locktorture_runnable, int, 0444);
-MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot");
+MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
/* Forward reference. */
static void lock_torture_cleanup(void);
@@ -216,10 +216,11 @@ static int lock_torture_writer(void *arg)
static DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
- set_user_nice(current, 19);
+ set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_uninterruptible(1);
+ if ((torture_random(&rand) & 0xfffff) == 0)
+ schedule_timeout_uninterruptible(1);
cur_ops->writelock();
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_write_lock_fail++;
@@ -354,7 +355,8 @@ static int __init lock_torture_init(void)
&lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
};
- torture_init_begin(torture_type, verbose, &locktorture_runnable);
+ if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
+ return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index aa4dff04b594..a620d4d08ca6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* top_waiter can be NULL, when we are in the deboosting
* mode!
*/
- if (top_waiter && (!task_has_pi_waiters(task) ||
- top_waiter != task_top_pi_waiter(task)))
- goto out_unlock_pi;
+ if (top_waiter) {
+ if (!task_has_pi_waiters(task))
+ goto out_unlock_pi;
+ /*
+ * If deadlock detection is off, we stop here if we
+ * are not the top pi waiter of the task.
+ */
+ if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
+ goto out_unlock_pi;
+ }
/*
* When deadlock detection is off then we check, if further
@@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto retry;
}
- /* Deadlock detection */
+ /*
+ * Deadlock detection. If the lock is the same as the original
+ * lock which caused us to walk the lock chain or if the
+ * current lock is owned by the task which initiated the chain
+ * walk, we detected a deadlock.
+ */
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
@@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
unsigned long flags;
int chain_walk = 0, res;
+ /*
+ * Early deadlock detection. We really don't want the task to
+ * enqueue on itself just to untangle the mess later. It's not
+ * only an optimization. We drop the locks, so another waiter
+ * can come in before the chain walk detects the deadlock. So
+ * the other will detect the deadlock and return -EDEADLOCK,
+ * which is wrong, as the other waiter is not in a deadlock
+ * situation.
+ */
+ if (detect_deadlock && owner == task)
+ return -EDEADLK;
+
raw_spin_lock_irqsave(&task->pi_lock, flags);
__rt_mutex_adjust_prio(task);
waiter->task = task;
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 1d66e08e897d..b4219ff87b8c 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -12,6 +12,55 @@
#include <linux/export.h>
/*
+ * Guide to the rw_semaphore's count field for common values.
+ * (32-bit case illustrated, similar for 64-bit)
+ *
+ * 0x0000000X (1) X readers active or attempting lock, no writer waiting
+ * X = #active_readers + #readers attempting to lock
+ * (X*ACTIVE_BIAS)
+ *
+ * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
+ * attempting to read lock or write lock.
+ *
+ * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
+ * X = #active readers + # readers attempting lock
+ * (X*ACTIVE_BIAS + WAITING_BIAS)
+ * (2) 1 writer attempting lock, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ * (3) 1 writer active, no waiters for lock
+ * X-1 = #active readers + #readers attempting lock
+ * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
+ * (WAITING_BIAS + ACTIVE_BIAS)
+ * (2) 1 writer active or attempting lock, no waiters for lock
+ * (ACTIVE_WRITE_BIAS)
+ *
+ * 0xffff0000 (1) There are writers or readers queued but none active
+ * or in the process of attempting lock.
+ * (WAITING_BIAS)
+ * Note: writer can attempt to steal lock for this count by adding
+ * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
+ *
+ * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
+ * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
+ *
+ * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
+ * the count becomes more than 0 for successful lock acquisition,
+ * i.e. the case where there are only readers or nobody has lock.
+ * (1st and 2nd case above).
+ *
+ * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
+ * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
+ * acquisition (i.e. nobody else has lock or attempts lock). If
+ * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
+ * are only waiters but none active (5th case above), and attempt to
+ * steal the lock.
+ *
+ */
+
+/*
* Initialize an rwsem:
*/
void __init_rwsem(struct rw_semaphore *sem, const char *name,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2fac9cc79b3d..9a83d780facd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -257,8 +257,7 @@ config ARCH_HAS_OPP
bool
config PM_OPP
- bool "Operating Performance Point (OPP) Layer library"
- depends on ARCH_HAS_OPP
+ bool
---help---
SOCs have a standard set of tuples consisting of frequency and
voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f4f2073711d3..df88d55dc436 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -35,7 +35,7 @@
static int nocompress;
static int noresume;
static int resume_wait;
-static int resume_delay;
+static unsigned int resume_delay;
static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
@@ -228,19 +228,23 @@ static void platform_recover(int platform_mode)
void swsusp_show_speed(struct timeval *start, struct timeval *stop,
unsigned nr_pages, char *msg)
{
- s64 elapsed_centisecs64;
- int centisecs;
- int k;
- int kps;
+ u64 elapsed_centisecs64;
+ unsigned int centisecs;
+ unsigned int k;
+ unsigned int kps;
elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+ /*
+ * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
+ * it is obvious enough for what went wrong.
+ */
do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
k = nr_pages * (PAGE_SIZE / 1024);
kps = (k * 100) / centisecs;
- printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
+ printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
msg, k,
centisecs / 100, centisecs % 100,
kps / 1000, (kps % 1000) / 10);
@@ -595,7 +599,8 @@ static void power_down(void)
case HIBERNATION_PLATFORM:
hibernation_platform_enter();
case HIBERNATION_SHUTDOWN:
- kernel_power_off();
+ if (pm_power_off)
+ kernel_power_off();
break;
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
@@ -623,7 +628,8 @@ static void power_down(void)
* corruption after resume.
*/
printk(KERN_CRIT "PM: Please power down manually\n");
- while(1);
+ while (1)
+ cpu_relax();
}
/**
@@ -1109,7 +1115,10 @@ static int __init resumewait_setup(char *str)
static int __init resumedelay_setup(char *str)
{
- resume_delay = simple_strtoul(str, NULL, 0);
+ int rc = kstrtouint(str, 0, &resume_delay);
+
+ if (rc)
+ return rc;
return 1;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6271bc4073ef..573410d6647e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -279,26 +279,26 @@ static inline void pm_print_times_init(void) {}
struct kobject *power_kobj;
/**
- * state - control system power state.
+ * state - control system sleep states.
*
- * show() returns what states are supported, which is hard-coded to
- * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend),
- * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk).
+ * show() returns available sleep state labels, which may be "mem", "standby",
+ * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
+ * description of what they mean.
*
- * store() accepts one of those strings, translates it into the
- * proper enumerated value, and initiates a suspend transition.
+ * store() accepts one of those strings, translates it into the proper
+ * enumerated value, and initiates a suspend transition.
*/
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
#ifdef CONFIG_SUSPEND
- int i;
+ suspend_state_t i;
+
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (pm_states[i].state)
+ s += sprintf(s,"%s ", pm_states[i].label);
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (pm_states[i] && valid_state(i))
- s += sprintf(s,"%s ", pm_states[i]);
- }
#endif
#ifdef CONFIG_HIBERNATION
s += sprintf(s, "%s\n", "disk");
@@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
suspend_state_t state = PM_SUSPEND_MIN;
- const char * const *s;
+ struct pm_sleep_state *s;
#endif
char *p;
int len;
@@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n)
#ifdef CONFIG_SUSPEND
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
- if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
- return state;
+ if (s->state && len == strlen(s->label)
+ && !strncmp(buf, s->label, len))
+ return s->state;
#endif
return PM_SUSPEND_ON;
@@ -447,8 +448,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", valid_state(state) ?
- pm_states[state] : "error");
+ return sprintf(buf, "%s\n", pm_states[state].state ?
+ pm_states[state].label : "error");
#endif
#ifdef CONFIG_HIBERNATION
return sprintf(buf, "disk\n");
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 15f37ea08719..c60f13b5270a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
unsigned int, char *);
#ifdef CONFIG_SUSPEND
+struct pm_sleep_state {
+ const char *label;
+ suspend_state_t state;
+};
+
/* kernel/power/suspend.c */
-extern const char *const pm_states[];
+extern struct pm_sleep_state pm_states[];
-extern bool valid_state(suspend_state_t state);
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
}
-static inline bool valid_state(suspend_state_t state) { return false; }
#endif /* !CONFIG_SUSPEND */
#ifdef CONFIG_PM_TEST_SUSPEND
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d7..963e6d0f050b 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,13 +31,14 @@
#include "power.h"
-const char *const pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_FREEZE] = "freeze",
- [PM_SUSPEND_STANDBY] = "standby",
- [PM_SUSPEND_MEM] = "mem",
+struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
+ [PM_SUSPEND_STANDBY] = { .label = "standby", },
+ [PM_SUSPEND_MEM] = { .label = "mem", },
};
static const struct platform_suspend_ops *suspend_ops;
+static const struct platform_freeze_ops *freeze_ops;
static bool need_suspend_ops(suspend_state_t state)
{
@@ -47,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state)
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
static bool suspend_freeze_wake;
+void freeze_set_ops(const struct platform_freeze_ops *ops)
+{
+ lock_system_sleep();
+ freeze_ops = ops;
+ unlock_system_sleep();
+}
+
static void freeze_begin(void)
{
suspend_freeze_wake = false;
@@ -54,9 +62,11 @@ static void freeze_begin(void)
static void freeze_enter(void)
{
+ cpuidle_use_deepest_state(true);
cpuidle_resume();
wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
cpuidle_pause();
+ cpuidle_use_deepest_state(false);
}
void freeze_wake(void)
@@ -66,42 +76,62 @@ void freeze_wake(void)
}
EXPORT_SYMBOL_GPL(freeze_wake);
+static bool valid_state(suspend_state_t state)
+{
+ /*
+ * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
+ * support and need to be valid to the low level
+ * implementation, no valid callback implies that none are valid.
+ */
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+}
+
+/*
+ * If this is set, the "mem" label always corresponds to the deepest sleep state
+ * available, the "standby" label corresponds to the second deepest sleep state
+ * available (if any), and the "freeze" label corresponds to the remaining
+ * available sleep state (if there is one).
+ */
+static bool relative_states;
+
+static int __init sleep_states_setup(char *str)
+{
+ relative_states = !strncmp(str, "1", 1);
+ if (relative_states) {
+ pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
+ pm_states[PM_SUSPEND_FREEZE].state = 0;
+ }
+ return 1;
+}
+
+__setup("relative_sleep_states=", sleep_states_setup);
+
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Suspend operations to use.
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
+ suspend_state_t i;
+ int j = PM_SUSPEND_MAX - 1;
+
lock_system_sleep();
+
suspend_ops = ops;
+ for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
+ if (valid_state(i))
+ pm_states[j--].state = i;
+ else if (!relative_states)
+ pm_states[j--].state = 0;
+
+ pm_states[j--].state = PM_SUSPEND_FREEZE;
+ while (j >= PM_SUSPEND_MIN)
+ pm_states[j--].state = 0;
+
unlock_system_sleep();
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
-bool valid_state(suspend_state_t state)
-{
- if (state == PM_SUSPEND_FREEZE) {
-#ifdef CONFIG_PM_DEBUG
- if (pm_test_level != TEST_NONE &&
- pm_test_level != TEST_FREEZER &&
- pm_test_level != TEST_DEVICES &&
- pm_test_level != TEST_PLATFORM) {
- printk(KERN_WARNING "Unsupported pm_test mode for "
- "freeze state, please choose "
- "none/freezer/devices/platform.\n");
- return false;
- }
-#endif
- return true;
- }
- /*
- * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
- * support and need to be valid to the lowlevel
- * implementation, no valid callback implies that none are valid.
- */
- return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
-}
-
/**
* suspend_valid_only_mem - Generic memory-only valid callback.
*
@@ -269,6 +299,10 @@ int suspend_devices_and_enter(suspend_state_t state)
error = suspend_ops->begin(state);
if (error)
goto Close;
+ } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) {
+ error = freeze_ops->begin();
+ if (error)
+ goto Close;
}
suspend_console();
suspend_test_start();
@@ -294,6 +328,9 @@ int suspend_devices_and_enter(suspend_state_t state)
Close:
if (need_suspend_ops(state) && suspend_ops->end)
suspend_ops->end();
+ else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
+ freeze_ops->end();
+
trace_machine_suspend(PWR_EVENT_EXIT);
return error;
@@ -328,9 +365,17 @@ static int enter_state(suspend_state_t state)
{
int error;
- if (!valid_state(state))
- return -ENODEV;
-
+ if (state == PM_SUSPEND_FREEZE) {
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
+ pr_warning("PM: Unsupported test mode for freeze state,"
+ "please choose none/freezer/devices/platform.\n");
+ return -EAGAIN;
+ }
+#endif
+ } else if (!valid_state(state)) {
+ return -EINVAL;
+ }
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
@@ -341,7 +386,7 @@ static int enter_state(suspend_state_t state)
sys_sync();
printk("done.\n");
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
error = suspend_prepare(state);
if (error)
goto Unlock;
@@ -349,7 +394,7 @@ static int enter_state(suspend_state_t state)
if (suspend_test(TEST_FREEZER))
goto Finish;
- pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 9b2a1d58558d..269b097e78ea 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
}
if (state == PM_SUSPEND_MEM) {
- printk(info_test, pm_states[state]);
+ printk(info_test, pm_states[state].label);
status = pm_suspend(state);
if (status == -ENODEV)
state = PM_SUSPEND_STANDBY;
}
if (state == PM_SUSPEND_STANDBY) {
- printk(info_test, pm_states[state]);
+ printk(info_test, pm_states[state].label);
status = pm_suspend(state);
}
if (status < 0)
@@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata =
static int __init setup_test_suspend(char *value)
{
- unsigned i;
+ suspend_state_t i;
/* "=mem" ==> "mem" */
value++;
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (!pm_states[i])
- continue;
- if (strcmp(pm_states[i], value) != 0)
- continue;
- test_state = (__force suspend_state_t) i;
- return 0;
- }
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (!strcmp(pm_states[i].label, value)) {
+ test_state = pm_states[i].state;
+ return 0;
+ }
+
printk(warn_bad_state, value);
return 0;
}
@@ -164,8 +162,8 @@ static int __init test_suspend(void)
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
goto done;
- if (!valid_state(test_state)) {
- printk(warn_bad_state, pm_states[test_state]);
+ if (!pm_states[test_state].state) {
+ printk(warn_bad_state, pm_states[test_state].label);
goto done;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c9a4819f798..aaa3261dea5d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data)
/**
* save_image_lzo - Save the suspend image data compressed with LZO.
- * @handle: Swap mam handle to use for saving the image.
+ * @handle: Swap map handle to use for saving the image.
* @snapshot: Image to read data from.
* @nr_to_write: Number of pages to save.
*/
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7228258b85ec..221229cf0190 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2413,6 +2413,7 @@ int unregister_console(struct console *console)
if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
+ console->flags &= ~CON_ENABLED;
console_unlock();
console_sysfs_notify();
return res;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bd30bc61bc05..7fa34f86e5ba 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -58,9 +58,11 @@ torture_param(int, fqs_duration, 0,
"Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
+torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(bool, gp_normal, false,
"Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, n_barrier_cbs, 0,
"# of callbacks/kthreads for barrier testing");
@@ -138,6 +140,18 @@ static long n_barrier_attempts;
static long n_barrier_successes;
static struct list_head rcu_torture_removed;
+static int rcu_torture_writer_state;
+#define RTWS_FIXED_DELAY 0
+#define RTWS_DELAY 1
+#define RTWS_REPLACE 2
+#define RTWS_DEF_FREE 3
+#define RTWS_EXP_SYNC 4
+#define RTWS_COND_GET 5
+#define RTWS_COND_SYNC 6
+#define RTWS_SYNC 7
+#define RTWS_STUTTER 8
+#define RTWS_STOPPING 9
+
#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
#define RCUTORTURE_RUNNABLE_INIT 1
#else
@@ -214,6 +228,7 @@ rcu_torture_free(struct rcu_torture *p)
*/
struct rcu_torture_ops {
+ int ttype;
void (*init)(void);
int (*readlock)(void);
void (*read_delay)(struct torture_random_state *rrsp);
@@ -222,6 +237,8 @@ struct rcu_torture_ops {
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
+ unsigned long (*get_state)(void);
+ void (*cond_sync)(unsigned long oldstate);
void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
void (*cb_barrier)(void);
void (*fqs)(void);
@@ -273,10 +290,48 @@ static int rcu_torture_completed(void)
return rcu_batches_completed();
}
+/*
+ * Update callback in the pipe. This should be invoked after a grace period.
+ */
+static bool
+rcu_torture_pipe_update_one(struct rcu_torture *rp)
+{
+ int i;
+
+ i = rp->rtort_pipe_count;
+ if (i > RCU_TORTURE_PIPE_LEN)
+ i = RCU_TORTURE_PIPE_LEN;
+ atomic_inc(&rcu_torture_wcount[i]);
+ if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ rp->rtort_mbtest = 0;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Update all callbacks in the pipe. Suitable for synchronous grace-period
+ * primitives.
+ */
+static void
+rcu_torture_pipe_update(struct rcu_torture *old_rp)
+{
+ struct rcu_torture *rp;
+ struct rcu_torture *rp1;
+
+ if (old_rp)
+ list_add(&old_rp->rtort_free, &rcu_torture_removed);
+ list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
+ if (rcu_torture_pipe_update_one(rp)) {
+ list_del(&rp->rtort_free);
+ rcu_torture_free(rp);
+ }
+ }
+}
+
static void
rcu_torture_cb(struct rcu_head *p)
{
- int i;
struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
if (torture_must_stop_irq()) {
@@ -284,16 +339,10 @@ rcu_torture_cb(struct rcu_head *p)
/* The next initialization will pick up the pieces. */
return;
}
- i = rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
- rp->rtort_mbtest = 0;
+ if (rcu_torture_pipe_update_one(rp))
rcu_torture_free(rp);
- } else {
+ else
cur_ops->deferred_free(rp);
- }
}
static int rcu_no_completed(void)
@@ -312,6 +361,7 @@ static void rcu_sync_torture_init(void)
}
static struct rcu_torture_ops rcu_ops = {
+ .ttype = RCU_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
@@ -320,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = {
.deferred_free = rcu_torture_deferred_free,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
+ .get_state = get_state_synchronize_rcu,
+ .cond_sync = cond_synchronize_rcu,
.call = call_rcu,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
@@ -355,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
}
static struct rcu_torture_ops rcu_bh_ops = {
+ .ttype = RCU_BH_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -397,6 +450,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
static struct rcu_torture_ops rcu_busted_ops = {
+ .ttype = INVALID_RCU_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -479,9 +533,11 @@ static void srcu_torture_stats(char *page)
page += sprintf(page, "%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
- page += sprintf(page, " %d(%lu,%lu)", cpu,
- per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
- per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
+ long c0, c1;
+
+ c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
+ c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
+ page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
}
sprintf(page, "\n");
}
@@ -492,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void)
}
static struct rcu_torture_ops srcu_ops = {
+ .ttype = SRCU_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
@@ -527,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
}
static struct rcu_torture_ops sched_ops = {
+ .ttype = RCU_SCHED_FLAVOR,
.init = rcu_sync_torture_init,
.readlock = sched_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -688,23 +746,59 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
- bool exp;
+ unsigned long gp_snap;
+ bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
+ bool gp_sync1 = gp_sync;
int i;
struct rcu_torture *rp;
- struct rcu_torture *rp1;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
+ int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
+ RTWS_COND_GET, RTWS_SYNC };
+ int nsynctypes = 0;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- set_user_nice(current, MAX_NICE);
+
+ /* Initialize synctype[] array. If none set, take default. */
+ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
+ gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
+ if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
+ synctype[nsynctypes++] = RTWS_COND_GET;
+ else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
+ pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
+ if (gp_exp1 && cur_ops->exp_sync)
+ synctype[nsynctypes++] = RTWS_EXP_SYNC;
+ else if (gp_exp && !cur_ops->exp_sync)
+ pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
+ if (gp_normal1 && cur_ops->deferred_free)
+ synctype[nsynctypes++] = RTWS_DEF_FREE;
+ else if (gp_normal && !cur_ops->deferred_free)
+ pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
+ if (gp_sync1 && cur_ops->sync)
+ synctype[nsynctypes++] = RTWS_SYNC;
+ else if (gp_sync && !cur_ops->sync)
+ pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
+ if (WARN_ONCE(nsynctypes == 0,
+ "rcu_torture_writer: No update-side primitives.\n")) {
+ /*
+ * No updates primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ rcu_torture_writer_state = RTWS_STOPPING;
+ torture_kthread_stopping("rcu_torture_writer");
+ }
do {
+ rcu_torture_writer_state = RTWS_FIXED_DELAY;
schedule_timeout_uninterruptible(1);
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
+ rcu_torture_writer_state = RTWS_DELAY;
udelay(torture_random(&rand) & 0x3ff);
+ rcu_torture_writer_state = RTWS_REPLACE;
old_rp = rcu_dereference_check(rcu_torture_current,
current == writer_task);
rp->rtort_mbtest = 1;
@@ -716,35 +810,42 @@ rcu_torture_writer(void *arg)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
old_rp->rtort_pipe_count++;
- if (gp_normal == gp_exp)
- exp = !!(torture_random(&rand) & 0x80);
- else
- exp = gp_exp;
- if (!exp) {
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ rcu_torture_writer_state = RTWS_DEF_FREE;
cur_ops->deferred_free(old_rp);
- } else {
+ break;
+ case RTWS_EXP_SYNC:
+ rcu_torture_writer_state = RTWS_EXP_SYNC;
cur_ops->exp_sync();
- list_add(&old_rp->rtort_free,
- &rcu_torture_removed);
- list_for_each_entry_safe(rp, rp1,
- &rcu_torture_removed,
- rtort_free) {
- i = rp->rtort_pipe_count;
- if (i > RCU_TORTURE_PIPE_LEN)
- i = RCU_TORTURE_PIPE_LEN;
- atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >=
- RCU_TORTURE_PIPE_LEN) {
- rp->rtort_mbtest = 0;
- list_del(&rp->rtort_free);
- rcu_torture_free(rp);
- }
- }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET:
+ rcu_torture_writer_state = RTWS_COND_GET;
+ gp_snap = cur_ops->get_state();
+ i = torture_random(&rand) % 16;
+ if (i != 0)
+ schedule_timeout_interruptible(i);
+ udelay(torture_random(&rand) % 1000);
+ rcu_torture_writer_state = RTWS_COND_SYNC;
+ cur_ops->cond_sync(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_SYNC:
+ rcu_torture_writer_state = RTWS_SYNC;
+ cur_ops->sync();
+ rcu_torture_pipe_update(old_rp);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
}
rcutorture_record_progress(++rcu_torture_current_version);
+ rcu_torture_writer_state = RTWS_STUTTER;
stutter_wait("rcu_torture_writer");
} while (!torture_must_stop());
+ rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
}
@@ -784,7 +885,7 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
-void rcutorture_trace_dump(void)
+static void rcutorture_trace_dump(void)
{
static atomic_t beenhere = ATOMIC_INIT(0);
@@ -918,11 +1019,13 @@ rcu_torture_reader(void *arg)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
- schedule();
+ cond_resched();
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
- if (irqreader && cur_ops->irq_capable)
+ if (irqreader && cur_ops->irq_capable) {
del_timer_sync(&t);
+ destroy_timer_on_stack(&t);
+ }
torture_kthread_stopping("rcu_torture_reader");
return 0;
}
@@ -937,6 +1040,7 @@ rcu_torture_printk(char *page)
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ static unsigned long rtcv_snap = ULONG_MAX;
for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -997,6 +1101,22 @@ rcu_torture_printk(char *page)
page += sprintf(page, "\n");
if (cur_ops->stats)
cur_ops->stats(page);
+ if (rtcv_snap == rcu_torture_current_version &&
+ rcu_torture_current != NULL) {
+ int __maybe_unused flags;
+ unsigned long __maybe_unused gpnum;
+ unsigned long __maybe_unused completed;
+
+ rcutorture_get_gp_data(cur_ops->ttype,
+ &flags, &gpnum, &completed);
+ page += sprintf(page,
+ "??? Writer stall state %d g%lu c%lu f%#x\n",
+ rcu_torture_writer_state,
+ gpnum, completed, flags);
+ show_rcu_gp_kthreads();
+ rcutorture_trace_dump();
+ }
+ rtcv_snap = rcu_torture_current_version;
}
/*
@@ -1146,7 +1266,7 @@ static int __init rcu_torture_stall_init(void)
}
/* Callback function for RCU barrier testing. */
-void rcu_torture_barrier_cbf(struct rcu_head *rcu)
+static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
{
atomic_inc(&barrier_cbs_invoked);
}
@@ -1416,7 +1536,8 @@ rcu_torture_init(void)
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
};
- torture_init_begin(torture_type, verbose, &rcutorture_runnable);
+ if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
+ return -EBUSY;
/* Process args and tell the world that the torturer is on the job. */
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1441,10 +1562,13 @@ rcu_torture_init(void)
if (cur_ops->init)
cur_ops->init(); /* no "goto unwind" prior to this point!!! */
- if (nreaders >= 0)
+ if (nreaders >= 0) {
nrealreaders = nreaders;
- else
- nrealreaders = 2 * num_online_cpus();
+ } else {
+ nrealreaders = num_online_cpus() - 1;
+ if (nrealreaders <= 0)
+ nrealreaders = 1;
+ }
rcu_torture_print_module_parms(cur_ops, "Start of test");
/* Set up the freelist. */
@@ -1533,7 +1657,8 @@ rcu_torture_init(void)
fqs_duration = 0;
if (fqs_duration) {
/* Create the fqs thread */
- torture_create_kthread(rcu_torture_fqs, NULL, fqs_task);
+ firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
+ fqs_task);
if (firsterr)
goto unwind;
}
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 431528520562..858c56569127 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
return;
rcp->ticks_this_gp++;
j = jiffies;
- js = rcp->jiffies_stall;
+ js = ACCESS_ONCE(rcp->jiffies_stall);
if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
@@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
dump_stack();
}
if (*rcp->curtail && ULONG_CMP_GE(j, js))
- rcp->jiffies_stall = jiffies +
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
else if (ULONG_CMP_GE(j, js))
- rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
}
static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
{
rcp->ticks_this_gp = 0;
rcp->gp_start = jiffies;
- rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
}
static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0c47e300210a..f1ba77363fbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data)
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
-static struct rcu_state *rcu_state;
+static struct rcu_state *rcu_state_p;
LIST_HEAD(rcu_struct_flavors);
/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
@@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
-static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp,
int (*f)(struct rcu_data *rsp, bool *isidle,
@@ -271,6 +271,15 @@ long rcu_batches_completed_bh(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
/*
+ * Force a quiescent state.
+ */
+void rcu_force_quiescent_state(void)
+{
+ force_quiescent_state(rcu_state_p);
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+
+/*
* Force a quiescent state for RCU BH.
*/
void rcu_bh_force_quiescent_state(void)
@@ -280,6 +289,21 @@ void rcu_bh_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ pr_info("%s: wait state: %d ->state: %#lx\n",
+ rsp->name, rsp->gp_state, rsp->gp_kthread->state);
+ /* sched_show_task(rsp->gp_kthread); */
+ }
+}
+EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+
+/*
* Record the number of times rcutorture tests have been initiated and
* terminated. This information allows the debugfs tracing stats to be
* correlated to the rcutorture messages, even when the rcutorture module
@@ -294,6 +318,39 @@ void rcutorture_record_test_transition(void)
EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
/*
+ * Send along grace-period-related data for rcutorture diagnostics.
+ */
+void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
+ unsigned long *gpnum, unsigned long *completed)
+{
+ struct rcu_state *rsp = NULL;
+
+ switch (test_type) {
+ case RCU_FLAVOR:
+ rsp = rcu_state_p;
+ break;
+ case RCU_BH_FLAVOR:
+ rsp = &rcu_bh_state;
+ break;
+ case RCU_SCHED_FLAVOR:
+ rsp = &rcu_sched_state;
+ break;
+ default:
+ break;
+ }
+ if (rsp != NULL) {
+ *flags = ACCESS_ONCE(rsp->gp_flags);
+ *gpnum = ACCESS_ONCE(rsp->gpnum);
+ *completed = ACCESS_ONCE(rsp->completed);
+ return;
+ }
+ *flags = 0;
+ *gpnum = 0;
+ *completed = 0;
+}
+EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+
+/*
* Record the number of writer passes through the current rcutorture test.
* This is also used to correlate debugfs tracing stats with the rcutorture
* messages.
@@ -324,6 +381,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
}
/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+ return &rsp->node[0];
+}
+
+/*
+ * Is there any need for future grace periods?
+ * Interrupts must be disabled. If the caller does not hold the root
+ * rnp_node structure's ->lock, the results are advisory only.
+ */
+static int rcu_future_needs_gp(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+ int *fp = &rnp->need_future_gp[idx];
+
+ return ACCESS_ONCE(*fp);
+}
+
+/*
* Does the current CPU require a not-yet-started grace period?
* The caller must have disabled interrupts to prevent races with
* normal callback registry.
@@ -335,7 +414,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
if (rcu_gp_in_progress(rsp))
return 0; /* No, a grace period is already in progress. */
- if (rcu_nocb_needs_gp(rsp))
+ if (rcu_future_needs_gp(rsp))
return 1; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
return 0; /* No, this is a no-CBs (or offline) CPU. */
@@ -350,14 +429,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Return the root node of the specified rcu_state structure.
- */
-static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
-{
- return &rsp->node[0];
-}
-
-/*
* rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
*
* If the new value of the ->dynticks_nesting counter now is zero,
@@ -387,9 +458,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
}
rcu_prepare_for_idle(smp_processor_id());
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic_inc(); /* See above. */
+ smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
+ smp_mb__after_atomic(); /* Force ordering with next sojourn. */
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
/*
@@ -507,10 +578,10 @@ void rcu_irq_exit(void)
static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
int user)
{
- smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
+ smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic_inc(); /* See above. */
+ smp_mb__after_atomic(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
rcu_cleanup_after_idle(smp_processor_id());
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
@@ -635,10 +706,10 @@ void rcu_nmi_enter(void)
(atomic_read(&rdtp->dynticks) & 0x1))
return;
rdtp->dynticks_nmi_nesting++;
- smp_mb__before_atomic_inc(); /* Force delay from prior write. */
+ smp_mb__before_atomic(); /* Force delay from prior write. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
- smp_mb__after_atomic_inc(); /* See above. */
+ smp_mb__after_atomic(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}
@@ -657,9 +728,9 @@ void rcu_nmi_exit(void)
--rdtp->dynticks_nmi_nesting != 0)
return;
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic_inc(); /* See above. */
+ smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic_inc(); /* Force delay to next write. */
+ smp_mb__after_atomic(); /* Force delay to next write. */
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
@@ -758,7 +829,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
{
rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
rcu_sysidle_check_cpu(rdp, isidle, maxj);
- return (rdp->dynticks_snap & 0x1) == 0;
+ if ((rdp->dynticks_snap & 0x1) == 0) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
+ return 1;
+ } else {
+ return 0;
+ }
}
/*
@@ -834,7 +910,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* we will beat on the first one until it gets unstuck, then move
* to the next. Only do this for the primary flavor of RCU.
*/
- if (rdp->rsp == rcu_state &&
+ if (rdp->rsp == rcu_state_p &&
ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
rdp->rsp->jiffies_resched += 5;
resched_cpu(rdp->cpu);
@@ -851,7 +927,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
rsp->gp_start = j;
smp_wmb(); /* Record start time before stall time. */
j1 = rcu_jiffies_till_stall_check();
- rsp->jiffies_stall = j + j1;
+ ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
rsp->jiffies_resched = j + j1 / 2;
}
@@ -890,12 +966,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
/* Only let one CPU complain about others per time interval. */
raw_spin_lock_irqsave(&rnp->lock, flags);
- delta = jiffies - rsp->jiffies_stall;
+ delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -932,9 +1008,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
- pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
+ pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
- rsp->gpnum, rsp->completed, totqlen);
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected == 0)
pr_err("INFO: Stall ended before state dump start\n");
else if (!trigger_all_cpu_backtrace())
@@ -947,12 +1023,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
force_quiescent_state(rsp); /* Kick them all. */
}
-/*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
@@ -971,14 +1041,15 @@ static void print_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
- pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
- jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
+ pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
+ jiffies - rsp->gp_start,
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
if (!trigger_all_cpu_backtrace())
dump_stack();
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
- rsp->jiffies_stall = jiffies +
+ if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1062,7 +1133,7 @@ void rcu_cpu_stall_reset(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
}
/*
@@ -1123,15 +1194,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
/*
* Start some future grace period, as needed to handle newly arrived
* callbacks. The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field.
+ * rcu_node structure's ->need_future_gp field. Returns true if there
+ * is reason to awaken the grace-period kthread.
*
* The caller must hold the specified rcu_node structure's ->lock.
*/
-static unsigned long __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+static bool __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long *c_out)
{
unsigned long c;
int i;
+ bool ret = false;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
/*
@@ -1142,7 +1216,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
if (rnp->need_future_gp[c & 0x1]) {
trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
- return c;
+ goto out;
}
/*
@@ -1156,7 +1230,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
- return c;
+ goto out;
}
/*
@@ -1197,12 +1271,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
} else {
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
- rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
}
unlock_out:
if (rnp != rnp_root)
raw_spin_unlock(&rnp_root->lock);
- return c;
+out:
+ if (c_out != NULL)
+ *c_out = c;
+ return ret;
}
/*
@@ -1226,25 +1303,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
}
/*
+ * Awaken the grace-period kthread for the specified flavor of RCU.
+ * Don't do a self-awaken, and don't bother awakening when there is
+ * nothing for the grace-period kthread to do (as in several CPUs
+ * raced to awaken, and we lost), and finally don't try to awaken
+ * a kthread that has not yet been created.
+ */
+static void rcu_gp_kthread_wake(struct rcu_state *rsp)
+{
+ if (current == rsp->gp_kthread ||
+ !ACCESS_ONCE(rsp->gp_flags) ||
+ !rsp->gp_kthread)
+ return;
+ wake_up(&rsp->gp_wq);
+}
+
+/*
* If there is room, assign a ->completed number to any callbacks on
* this CPU that have not already been assigned. Also accelerate any
* callbacks that were previously assigned a ->completed number that has
* since proven to be too conservative, which can happen if callbacks get
* assigned a ->completed number while RCU is idle, but with reference to
* a non-root rcu_node structure. This function is idempotent, so it does
- * not hurt to call it repeatedly.
+ * not hurt to call it repeatedly. Returns an flag saying that we should
+ * awaken the RCU grace-period kthread.
*
* The caller must hold rnp->lock with interrupts disabled.
*/
-static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
unsigned long c;
int i;
+ bool ret;
/* If the CPU has no callbacks, nothing to do. */
if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return;
+ return false;
/*
* Starting from the sublist containing the callbacks most
@@ -1273,7 +1368,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* be grouped into.
*/
if (++i >= RCU_NEXT_TAIL)
- return;
+ return false;
/*
* Assign all subsequent callbacks' ->completed number to the next
@@ -1285,13 +1380,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->nxtcompleted[i] = c;
}
/* Record any needed additional grace periods. */
- rcu_start_future_gp(rnp, rdp);
+ ret = rcu_start_future_gp(rnp, rdp, NULL);
/* Trace depending on how much we were able to accelerate. */
if (!*rdp->nxttail[RCU_WAIT_TAIL])
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
else
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
+ return ret;
}
/*
@@ -1300,17 +1396,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
* sublist. This function is idempotent, so it does not hurt to
* invoke it repeatedly. As long as it is not invoked -too- often...
+ * Returns true if the RCU grace-period kthread needs to be awakened.
*
* The caller must hold rnp->lock with interrupts disabled.
*/
-static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
int i, j;
/* If the CPU has no callbacks, nothing to do. */
if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return;
+ return false;
/*
* Find all callbacks whose ->completed numbers indicate that they
@@ -1334,26 +1431,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
}
/* Classify any remaining callbacks. */
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ return rcu_accelerate_cbs(rsp, rnp, rdp);
}
/*
* Update CPU-local rcu_data state to record the beginnings and ends of
* grace periods. The caller must hold the ->lock of the leaf rcu_node
* structure corresponding to the current CPU, and must have irqs disabled.
+ * Returns true if the grace-period kthread needs to be awakened.
*/
-static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
+ bool ret;
+
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed) {
/* No grace period end, so just accelerate recent callbacks. */
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ ret = rcu_accelerate_cbs(rsp, rnp, rdp);
} else {
/* Advance callbacks. */
- rcu_advance_cbs(rsp, rnp, rdp);
+ ret = rcu_advance_cbs(rsp, rnp, rdp);
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
@@ -1372,11 +1473,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
}
+ return ret;
}
static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
+ bool needwake;
struct rcu_node *rnp;
local_irq_save(flags);
@@ -1388,8 +1491,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
return;
}
smp_mb__after_unlock_lock();
- __note_gp_changes(rsp, rnp, rdp);
+ needwake = __note_gp_changes(rsp, rnp, rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -1403,12 +1508,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- if (rsp->gp_flags == 0) {
+ if (!ACCESS_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
return 0;
}
- rsp->gp_flags = 0; /* Clear all flags: New grace period. */
+ ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
/*
@@ -1453,7 +1558,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
WARN_ON_ONCE(rnp->completed != rsp->completed);
ACCESS_ONCE(rnp->completed) = rsp->completed;
if (rnp == rdp->mynode)
- __note_gp_changes(rsp, rnp, rdp);
+ (void)__note_gp_changes(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
rnp->level, rnp->grplo,
@@ -1501,7 +1606,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
raw_spin_unlock_irq(&rnp->lock);
}
return fqs_state;
@@ -1513,6 +1618,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
+ bool needgp = false;
int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1548,7 +1654,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
ACCESS_ONCE(rnp->completed) = rsp->gpnum;
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
- __note_gp_changes(rsp, rnp, rdp);
+ needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
@@ -1564,9 +1670,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
- rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
- if (cpu_needs_another_gp(rsp, rdp)) {
- rsp->gp_flags = RCU_GP_FLAG_INIT;
+ /* Advance CBs to reduce false positives below. */
+ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
+ if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("newreq"));
@@ -1593,6 +1700,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("reqwait"));
+ rsp->gp_state = RCU_GP_WAIT_GPS;
wait_event_interruptible(rsp->gp_wq,
ACCESS_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
@@ -1620,6 +1728,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("fqswait"));
+ rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
((gf = ACCESS_ONCE(rsp->gp_flags)) &
RCU_GP_FLAG_FQS) ||
@@ -1665,14 +1774,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
}
-static void rsp_wakeup(struct irq_work *work)
-{
- struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
-
- /* Wake up rcu_gp_kthread() to start the grace period. */
- wake_up(&rsp->gp_wq);
-}
-
/*
* Start a new RCU grace period if warranted, re-initializing the hierarchy
* in preparation for detecting the next grace period. The caller must hold
@@ -1681,8 +1782,10 @@ static void rsp_wakeup(struct irq_work *work)
* Note that it is legal for a dying CPU (which is marked as offline) to
* invoke this function. This can happen when the dying CPU reports its
* quiescent state.
+ *
+ * Returns true if the grace-period kthread must be awakened.
*/
-static void
+static bool
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
@@ -1693,20 +1796,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
* or a grace period is already in progress.
* Either way, don't start a new grace period.
*/
- return;
+ return false;
}
- rsp->gp_flags = RCU_GP_FLAG_INIT;
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
TPS("newreq"));
/*
* We can't do wakeups while holding the rnp->lock, as that
* could cause possible deadlocks with the rq->lock. Defer
- * the wakeup to interrupt context. And don't bother waking
- * up the running kthread.
+ * the wakeup to our caller.
*/
- if (current != rsp->gp_kthread)
- irq_work_queue(&rsp->wakeup_work);
+ return true;
}
/*
@@ -1715,12 +1816,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
* is invoked indirectly from rcu_advance_cbs(), which would result in
* endless recursion -- or would do so if it wasn't for the self-deadlock
* that is encountered beforehand.
+ *
+ * Returns true if the grace-period kthread needs to be awakened.
*/
-static void
-rcu_start_gp(struct rcu_state *rsp)
+static bool rcu_start_gp(struct rcu_state *rsp)
{
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
+ bool ret = false;
/*
* If there is no grace period in progress right now, any
@@ -1730,8 +1833,9 @@ rcu_start_gp(struct rcu_state *rsp)
* resulting in pointless grace periods. So, advance callbacks
* then start the grace period!
*/
- rcu_advance_cbs(rsp, rnp, rdp);
- rcu_start_gp_advanced(rsp, rnp, rdp);
+ ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
+ ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
+ return ret;
}
/*
@@ -1820,6 +1924,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
+ bool needwake;
struct rcu_node *rnp;
rnp = rdp->mynode;
@@ -1848,9 +1953,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
}
@@ -1951,7 +2058,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
{
int i;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
/* No-CBs CPUs are handled specially. */
if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2320,7 +2427,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
- rsp->gp_flags |= RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
}
@@ -2334,7 +2441,8 @@ static void
__rcu_process_callbacks(struct rcu_state *rsp)
{
unsigned long flags;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ bool needwake;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
WARN_ON_ONCE(rdp->beenonline == 0);
@@ -2345,8 +2453,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)
local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
- rcu_start_gp(rsp);
+ needwake = rcu_start_gp(rsp);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
} else {
local_irq_restore(flags);
}
@@ -2404,6 +2514,8 @@ static void invoke_rcu_core(void)
static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
struct rcu_head *head, unsigned long flags)
{
+ bool needwake;
+
/*
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
@@ -2433,8 +2545,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
raw_spin_lock(&rnp_root->lock);
smp_mb__after_unlock_lock();
- rcu_start_gp(rsp);
+ needwake = rcu_start_gp(rsp);
raw_spin_unlock(&rnp_root->lock);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
} else {
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
@@ -2537,6 +2651,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
EXPORT_SYMBOL_GPL(call_rcu_bh);
/*
+ * Queue an RCU callback for lazy invocation after a grace period.
+ * This will likely be later named something like "call_rcu_lazy()",
+ * but this change will require some way of tagging the lazy RCU
+ * callbacks in the list of pending callbacks. Until then, this
+ * function may only be called from __kfree_rcu().
+ */
+void kfree_call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, rcu_state_p, -1, 1);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu);
+
+/*
* Because a context switch is a grace period for RCU-sched and RCU-bh,
* any blocking grace-period wait automatically implies a grace period
* if there is only one CPU online at any point time during execution
@@ -2659,7 +2787,7 @@ unsigned long get_state_synchronize_rcu(void)
* time-consuming work between get_state_synchronize_rcu()
* and cond_synchronize_rcu().
*/
- return smp_load_acquire(&rcu_state->gpnum);
+ return smp_load_acquire(&rcu_state_p->gpnum);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
@@ -2685,7 +2813,7 @@ void cond_synchronize_rcu(unsigned long oldstate)
* Ensure that this load happens before any RCU-destructive
* actions the caller might carry out after we return.
*/
- newstate = smp_load_acquire(&rcu_state->completed);
+ newstate = smp_load_acquire(&rcu_state_p->completed);
if (ULONG_CMP_GE(oldstate, newstate))
synchronize_rcu();
}
@@ -2790,7 +2918,7 @@ void synchronize_sched_expedited(void)
s = atomic_long_read(&rsp->expedited_done);
if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
/* ensure test happens before caller kfree */
- smp_mb__before_atomic_inc(); /* ^^^ */
+ smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
return;
}
@@ -2808,7 +2936,7 @@ void synchronize_sched_expedited(void)
s = atomic_long_read(&rsp->expedited_done);
if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
/* ensure test happens before caller kfree */
- smp_mb__before_atomic_inc(); /* ^^^ */
+ smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
return;
}
@@ -2837,7 +2965,7 @@ void synchronize_sched_expedited(void)
s = atomic_long_read(&rsp->expedited_done);
if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
/* ensure test happens before caller kfree */
- smp_mb__before_atomic_inc(); /* ^^^ */
+ smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_done_lost);
break;
}
@@ -2988,7 +3116,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
static void rcu_barrier_func(void *type)
{
struct rcu_state *rsp = type;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
_rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
atomic_inc(&rsp->barrier_cpu_count);
@@ -3160,7 +3288,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
* that this CPU cannot possibly have any RCU callbacks in flight yet.
*/
static void
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
unsigned long mask;
@@ -3173,7 +3301,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
- rdp->preemptible = preemptible;
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
@@ -3217,8 +3344,7 @@ static void rcu_prepare_cpu(int cpu)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- rcu_init_percpu_data(cpu, rsp,
- strcmp(rsp->name, "rcu_preempt") == 0);
+ rcu_init_percpu_data(cpu, rsp);
}
/*
@@ -3228,7 +3354,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
@@ -3402,8 +3528,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->qsmaskinit = 0;
rnp->grplo = j * cpustride;
rnp->grphi = (j + 1) * cpustride - 1;
- if (rnp->grphi >= NR_CPUS)
- rnp->grphi = NR_CPUS - 1;
+ if (rnp->grphi >= nr_cpu_ids)
+ rnp->grphi = nr_cpu_ids - 1;
if (i == 0) {
rnp->grpnum = 0;
rnp->grpmask = 0;
@@ -3422,7 +3548,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rsp->rda = rda;
init_waitqueue_head(&rsp->gp_wq);
- init_irq_work(&rsp->wakeup_work, rsp_wakeup);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 75dc3c39a02a..bf2c1e669691 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -252,7 +252,6 @@ struct rcu_data {
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
- bool preemptible; /* Preemptible RCU? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -406,7 +405,8 @@ struct rcu_state {
unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */
wait_queue_head_t gp_wq; /* Where GP task waits. */
- int gp_flags; /* Commands for GP task. */
+ short gp_flags; /* Commands for GP task. */
+ short gp_state; /* GP kthread sleep state. */
/* End of fields guarded by root rcu_node's lock. */
@@ -462,13 +462,17 @@ struct rcu_state {
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
struct list_head flavors; /* List of RCU flavors. */
- struct irq_work wakeup_work; /* Postponed wakeups */
};
/* Values for rcu_state structure's gp_flags field. */
#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
+/* Values for rcu_state structure's gp_flags field. */
+#define RCU_GP_WAIT_INIT 0 /* Initial state. */
+#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
+#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
+
extern struct list_head rcu_struct_flavors;
/* Sequence through rcu_state structures for each RCU flavor. */
@@ -547,7 +551,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
-static int rcu_nocb_needs_gp(struct rcu_state *rsp);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 962d1d589929..cbc2c45265e2 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_TREE_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
-static struct rcu_state *rcu_state = &rcu_preempt_state;
+static struct rcu_state *rcu_state_p = &rcu_preempt_state;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -149,15 +149,6 @@ long rcu_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed);
/*
- * Force a quiescent state for preemptible RCU.
- */
-void rcu_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_preempt_state);
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
@@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu);
-/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
- */
-void kfree_call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_preempt_state, -1, 1);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
-
/**
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -970,7 +947,7 @@ void exit_rcu(void)
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-static struct rcu_state *rcu_state = &rcu_sched_state;
+static struct rcu_state *rcu_state_p = &rcu_sched_state;
/*
* Tell them what RCU they are running.
@@ -991,16 +968,6 @@ long rcu_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed);
/*
- * Force a quiescent state for RCU, which, because there is no preemptible
- * RCU, becomes the same as rcu-sched.
- */
-void rcu_force_quiescent_state(void)
-{
- rcu_sched_force_quiescent_state();
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
@@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu)
}
/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
- *
- * Because there is no preemptible RCU, we use RCU-sched instead.
- */
-void kfree_call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_sched_state, -1, 1);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
-
-/*
* Wait for an rcu-preempt grace period, but make it happen quickly.
* But because preemptible RCU does not exist, map to rcu-sched.
*/
@@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
- rnp = rcu_get_root(rcu_state);
- (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
+ rnp = rcu_get_root(rcu_state_p);
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
if (NUM_RCU_NODES > 1) {
- rcu_for_each_leaf_node(rcu_state, rnp)
- (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
+ rcu_for_each_leaf_node(rcu_state_p, rnp)
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
return 0;
}
@@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads);
static void rcu_prepare_kthreads(int cpu)
{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
if (rcu_scheduler_fully_active)
- (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
+ (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
static void rcu_prepare_for_idle(int cpu)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ bool needwake;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
struct rcu_node *rnp;
@@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu)
rnp = rdp->mynode;
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
smp_mb__after_unlock_lock();
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
@@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused)
struct rcu_data *rdp;
for_each_rcu_flavor(rsp) {
- rdp = __this_cpu_ptr(rsp->rda);
+ rdp = raw_cpu_ptr(rsp->rda);
if (rdp->qlen_lazy != 0) {
atomic_inc(&oom_callback_count);
rsp->call(&rdp->oom_head, rcu_oom_callback);
@@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
+ raw_cpu_inc(rsp->rda->ticks_this_gp);
}
#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg)
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
- * Do any no-CBs CPUs need another grace period?
- *
- * Interrupts must be disabled. If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
- */
-static int rcu_nocb_needs_gp(struct rcu_state *rsp)
-{
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
-}
-
-/*
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
@@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
}
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-/* Is the specified CPU a no-CPUs CPU? */
+/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
if (have_rcu_nocb_mask)
@@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
unsigned long c;
bool d;
unsigned long flags;
+ bool needwake;
struct rcu_node *rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- c = rcu_start_future_gp(rnp, rdp);
+ needwake = rcu_start_future_gp(rnp, rdp, &c);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rdp->rsp);
/*
* Wait for the grace period. Do so interruptibly to avoid messing
@@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static int rcu_nocb_needs_gp(struct rcu_state *rsp)
-{
- return 0;
-}
-
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
}
@@ -2523,9 +2462,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
/* Record start of fully idle period. */
j = jiffies;
ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
}
@@ -2590,9 +2529,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
}
/* Record end of idle period. */
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_inc(&rdtp->dynticks_idle);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
/*
@@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
}
/*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
- */
-static void rcu_bind_gp_kthread(void)
-{
- int cpu = ACCESS_ONCE(tick_do_timer_cpu);
-
- if (cpu < 0 || cpu >= nr_cpu_ids)
- return;
- if (raw_smp_processor_id() != cpu)
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-}
-
-/*
* Return a delay in jiffies based on the number of CPUs, rcu_node
* leaf fanout, and jiffies tick rate. The idea is to allow larger
* systems more time to transition to full-idle state in order to
@@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j)
static void rcu_sysidle_cancel(void)
{
smp_mb();
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+ if (full_sysidle_state > RCU_SYSIDLE_SHORT)
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
}
/*
@@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
return false;
}
-static void rcu_bind_gp_kthread(void)
-{
-}
-
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
@@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
#endif /* #ifdef CONFIG_NO_HZ_FULL */
return 0;
}
+
+/*
+ * Bind the grace-period kthread for the sysidle flavor of RCU to the
+ * timekeeping CPU.
+ */
+static void rcu_bind_gp_kthread(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+
+ if (cpu < 0 || cpu >= nr_cpu_ids)
+ return;
+ if (raw_smp_processor_id() != cpu)
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4c0a9b0af469..a2aeb4df0f60 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void)
return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
}
+void rcu_sysrq_start(void)
+{
+ if (!rcu_cpu_stall_suppress)
+ rcu_cpu_stall_suppress = 2;
+}
+
+void rcu_sysrq_end(void)
+{
+ if (rcu_cpu_stall_suppress == 2)
+ rcu_cpu_stall_suppress = 0;
+}
+
static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
{
rcu_cpu_stall_suppress = 1;
@@ -338,3 +350,21 @@ static int __init check_cpu_stall_init(void)
early_initcall(check_cpu_stall_init);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+/*
+ * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
+ */
+
+DEFINE_PER_CPU(int, rcu_cond_resched_count);
+
+/*
+ * Report a set of RCU quiescent states, for use by cond_resched()
+ * and friends. Out of line due to being called infrequently.
+ */
+void rcu_resched(void)
+{
+ preempt_disable();
+ __this_cpu_write(rcu_cond_resched_count, 0);
+ rcu_note_context_switch(smp_processor_id());
+ preempt_enable();
+}
diff --git a/kernel/resource.c b/kernel/resource.c
index 8957d686e29b..3c2237ac32db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource map sanity check conflict: "
- "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
+ printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
(unsigned long long)addr,
(unsigned long long)(addr + size - 1),
- (unsigned long long)p->start,
- (unsigned long long)p->end,
- p->name);
+ p->name, p);
err = -1;
break;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d9d8ece46a15..913c6d6cc2c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,22 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#ifdef smp_mb__before_atomic
+void __smp_mb__before_atomic(void)
+{
+ smp_mb__before_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__before_atomic);
+#endif
+
+#ifdef smp_mb__after_atomic
+void __smp_mb__after_atomic(void)
+{
+ smp_mb__after_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__after_atomic);
+#endif
+
void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
@@ -506,6 +522,39 @@ static inline void init_hrtick(void)
#endif /* CONFIG_SCHED_HRTICK */
/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val) \
+({ typeof(*(ptr)) __old, __val = *(ptr); \
+ for (;;) { \
+ __old = cmpxchg((ptr), __val, __val | (val)); \
+ if (__old == __val) \
+ break; \
+ __val = __old; \
+ } \
+ __old; \
+})
+
+#ifdef TIF_POLLING_NRFLAG
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+#endif
+
+/*
* resched_task - mark a task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
@@ -521,17 +570,15 @@ void resched_task(struct task_struct *p)
if (test_tsk_need_resched(p))
return;
- set_tsk_need_resched(p);
-
cpu = task_cpu(p);
+
if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
set_preempt_need_resched();
return;
}
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
+ if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
}
@@ -2592,8 +2639,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
- if (likely(p && p != RETRY_TASK))
- return p;
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
again:
@@ -2996,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
- int nice_rlim = 20 - nice;
+ int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
@@ -3020,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
* We don't have to worry. Conceptually one call occurs first
* and we have a single winner.
*/
- if (increment < -40)
- increment = -40;
- if (increment > 40)
- increment = 40;
-
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
nice = task_nice(current) + increment;
- if (nice < MIN_NICE)
- nice = MIN_NICE;
- if (nice > MAX_NICE)
- nice = MAX_NICE;
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
@@ -3124,6 +3170,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
static void __setscheduler_params(struct task_struct *p,
@@ -3188,17 +3235,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
* greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
*/
static bool
__checkparam_dl(const struct sched_attr *attr)
{
- return attr && attr->sched_deadline != 0 &&
- (attr->sched_period == 0 ||
- (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
- (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
- attr->sched_runtime >= (2 << (DL_SCALE - 1));
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
}
/*
@@ -3596,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
*/
attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
-out:
- return ret;
+ return 0;
err_size:
put_user(sizeof(*attr), &uattr->size);
- ret = -E2BIG;
- goto out;
+ return -E2BIG;
}
/**
@@ -3639,6 +3707,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
* sys_sched_setattr - same as above, but with extended sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
@@ -3650,8 +3719,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!uattr || pid < 0 || flags)
return -EINVAL;
- if (sched_copy_attr(uattr, &attr))
- return -EFAULT;
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if (attr.sched_policy < 0)
+ return -EINVAL;
rcu_read_lock();
retval = -ESRCH;
@@ -3701,7 +3774,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
- struct sched_param lp;
+ struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
@@ -3718,11 +3791,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (retval)
goto out_unlock;
- if (task_has_dl_policy(p)) {
- retval = -EINVAL;
- goto out_unlock;
- }
- lp.sched_priority = p->rt_priority;
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
rcu_read_unlock();
/*
@@ -3760,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
for (; addr < end; addr++) {
if (*addr)
- goto err_size;
+ return -EFBIG;
}
attr->size = usize;
@@ -3770,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
if (ret)
return -EFAULT;
-out:
- return ret;
-
-err_size:
- ret = -E2BIG;
- goto out;
+ return 0;
}
/**
@@ -3783,6 +3848,7 @@ err_size:
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, size, unsigned int, flags)
@@ -4051,6 +4117,7 @@ static void __cond_resched(void)
int __sched _cond_resched(void)
{
+ rcu_cond_resched();
if (should_resched()) {
__cond_resched();
return 1;
@@ -4069,15 +4136,18 @@ EXPORT_SYMBOL(_cond_resched);
*/
int __cond_resched_lock(spinlock_t *lock)
{
+ bool need_rcu_resched = rcu_should_resched();
int resched = should_resched();
int ret = 0;
lockdep_assert_held(lock);
- if (spin_needbreak(lock) || resched) {
+ if (spin_needbreak(lock) || resched || need_rcu_resched) {
spin_unlock(lock);
if (resched)
__cond_resched();
+ else if (unlikely(need_rcu_resched))
+ rcu_resched();
else
cpu_relax();
ret = 1;
@@ -4091,6 +4161,7 @@ int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
+ rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
if (should_resched()) {
local_bh_enable();
__cond_resched();
@@ -5039,11 +5110,20 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ rq->age_stamp = sched_clock_cpu(cpu);
+}
+
static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
+ set_cpu_rq_start_time();
+ return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -5252,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
- SD_SHARE_PKG_RESOURCES)) {
+ SD_SHARE_PKG_RESOURCES |
+ SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -5283,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING);
+ SD_PREFER_SIBLING |
+ SD_SHARE_POWERDOMAIN);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -5557,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
- return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
- struct sched_domain **__percpu sd;
- struct sched_group **__percpu sg;
- struct sched_group_power **__percpu sgp;
-};
-
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@@ -5580,21 +5651,6 @@ enum s_alloc {
sa_none,
};
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP 0x01
-
-struct sched_domain_topology_level {
- sched_domain_init_f init;
- sched_domain_mask_f mask;
- int flags;
- int numa_level;
- struct sd_data data;
-};
-
/*
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
@@ -5762,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
continue;
group = get_group(i, sdd, &sg);
- cpumask_clear(sched_group_cpus(sg));
- sg->sgp->power = 0;
cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
@@ -5813,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
}
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
-}
-
/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type) sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type) do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type) \
-static noinline struct sched_domain * \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
-{ \
- struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
- *sd = SD_##type##_INIT; \
- SD_INIT_NAME(sd, type); \
- sd->private = &tl->data; \
- return sd; \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
static int default_relax_domain_level = -1;
int sched_domain_level_max;
@@ -5938,97 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
*per_cpu_ptr(sdd->sgp, cpu) = NULL;
}
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
- return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
- { sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
- { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
- { sd_init_BOOK, cpu_book_mask, },
-#endif
- { sd_init_CPU, cpu_cpu_mask, },
- { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl) \
- for (tl = sched_domain_topology; tl->init; tl++)
-
#ifdef CONFIG_NUMA
-
static int sched_domains_numa_levels;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
+#endif
-static inline int sd_local_flags(int level)
-{
- if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
- return 0;
-
- return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUPOWER - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS \
+ (SD_SHARE_CPUPOWER | \
+ SD_SHARE_PKG_RESOURCES | \
+ SD_NUMA | \
+ SD_ASYM_PACKING | \
+ SD_SHARE_POWERDOMAIN)
static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
- int level = tl->numa_level;
- int sd_weight = cpumask_weight(
- sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+ int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+ /*
+ * Ugly hack to pass state to sd_numa_mask()...
+ */
+ sched_domains_curr_level = tl->numa_level;
+#endif
+
+ sd_weight = cpumask_weight(tl->mask(cpu));
+
+ if (tl->sd_flags)
+ sd_flags = (*tl->sd_flags)();
+ if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+ "wrong sd_flags in topology description\n"))
+ sd_flags &= ~TOPOLOGY_SD_FLAGS;
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
.busy_factor = 32,
.imbalance_pct = 125,
- .cache_nice_tries = 2,
- .busy_idx = 3,
- .idle_idx = 2,
+
+ .cache_nice_tries = 0,
+ .busy_idx = 0,
+ .idle_idx = 0,
.newidle_idx = 0,
.wake_idx = 0,
.forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
- | 0*SD_BALANCE_EXEC
- | 0*SD_BALANCE_FORK
+ | 1*SD_BALANCE_EXEC
+ | 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
- | 0*SD_WAKE_AFFINE
+ | 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_PKG_RESOURCES
- | 1*SD_SERIALIZE
+ | 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
- | 1*SD_NUMA
- | sd_local_flags(level)
+ | 0*SD_NUMA
+ | sd_flags
,
+
.last_balance = jiffies,
.balance_interval = sd_weight,
+ .smt_gain = 0,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+ .name = tl->name,
+#endif
};
- SD_INIT_NAME(sd, NUMA);
- sd->private = &tl->data;
/*
- * Ugly hack to pass state to sd_numa_mask()...
+ * Convert topological properties into behaviour.
*/
- sched_domains_curr_level = tl->numa_level;
+
+ if (sd->flags & SD_SHARE_CPUPOWER) {
+ sd->imbalance_pct = 110;
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+ } else if (sd->flags & SD_NUMA) {
+ sd->cache_nice_tries = 2;
+ sd->busy_idx = 3;
+ sd->idle_idx = 2;
+
+ sd->flags |= SD_SERIALIZE;
+ if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ sd->flags &= ~(SD_BALANCE_EXEC |
+ SD_BALANCE_FORK |
+ SD_WAKE_AFFINE);
+ }
+
+#endif
+ } else {
+ sd->flags |= SD_PREFER_SIBLING;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
+ sd->idle_idx = 1;
+ }
+
+ sd->private = &tl->data;
return sd;
}
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl) \
+ for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+ sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
static const struct cpumask *sd_numa_mask(int cpu)
{
return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6172,7 +6250,10 @@ static void sched_init_numa(void)
}
}
- tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+ /* Compute default topology size */
+ for (i = 0; sched_domain_topology[i].mask; i++);
+
+ tl = kzalloc((i + level + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -6180,18 +6261,19 @@ static void sched_init_numa(void)
/*
* Copy the default topology bits..
*/
- for (i = 0; default_topology[i].init; i++)
- tl[i] = default_topology[i];
+ for (i = 0; sched_domain_topology[i].mask; i++)
+ tl[i] = sched_domain_topology[i];
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
- .init = sd_numa_init,
.mask = sd_numa_mask,
+ .sd_flags = cpu_numa_flags,
.flags = SDTL_OVERLAP,
.numa_level = j,
+ SD_INIT_NAME(NUMA)
};
}
@@ -6349,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = tl->init(tl, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
@@ -6919,6 +7001,7 @@ void __init sched_init(void)
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
+ set_cpu_rq_start_time();
#endif
init_sched_fair_class();
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..bd95963dae80 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include "cpudeadline.h"
static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
{
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
- swap(cp->elements[a], cp->elements[b]);
- swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
+ swap(cp->elements[a].cpu, cp->elements[b].cpu);
+ swap(cp->elements[a].dl , cp->elements[b].dl );
+
+ swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
}
static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
- old_idx = cp->cpu_to_idx[cpu];
+ old_idx = cp->elements[cpu].idx;
if (!is_valid) {
/* remove item */
if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
- cp->cpu_to_idx[new_cpu] = old_idx;
- cp->cpu_to_idx[cpu] = IDX_INVALID;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
while (old_idx > 0 && dl_time_before(
cp->elements[parent(old_idx)].dl,
cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->size++;
cp->elements[cp->size - 1].dl = 0;
cp->elements[cp->size - 1].cpu = cpu;
- cp->cpu_to_idx[cpu] = cp->size - 1;
+ cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
- for (i = 0; i < NR_CPUS; i++)
- cp->cpu_to_idx[i] = IDX_INVALID;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
cpumask_setall(cp->free_cpus);
return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
*/
void cpudl_cleanup(struct cpudl *cp)
{
- /*
- * nothing to do for the moment
- */
+ free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789a412c..538c9796ad4a 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
#define IDX_INVALID -1
-struct array_item {
+struct cpudl_item {
u64 dl;
int cpu;
+ int idx;
};
struct cpudl {
raw_spinlock_t lock;
int size;
- int cpu_to_idx[NR_CPUS];
- struct array_item elements[NR_CPUS];
cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
};
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/slab.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
int idx = 0;
int task_pri = convert_prio(p->prio);
- if (task_pri >= MAX_RT_PRIO)
- return 0;
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* do a write memory barrier, and then update the count, to
* make sure the vector is visible when count is set.
*/
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_inc(&(vec)->count);
do_mb = 1;
}
@@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
* the new priority vec.
*/
if (do_mb)
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
/*
* When removing from the vector, we decrement the counter first
* do a memory barrier and then clear the mask.
*/
atomic_dec(&(vec)->count);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
cpumask_clear_cpu(cpu, vec->mask);
}
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
goto cleanup;
}
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
return 0;
cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
{
int i;
+ kfree(cp->cpu_to_pri);
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..6b033347fdfd 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int cpu_to_pri[NR_CPUS];
+ int *cpu_to_pri;
};
#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq, int ticks)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
+ cputime *= ticks;
+ scaled *= ticks;
+
if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_IRQ] += cputime;
} else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
} else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_guest_time(p, cputime, scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
}
}
static void irqtime_account_idle_ticks(int ticks)
{
- int i;
struct rq *rq = this_rq();
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
+ irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
+ struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
+ irqtime_account_process_tick(p, user_tick, rq, 1);
return;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b08095786cb8..f9ca7d19781a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -520,7 +520,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* We need to take care of a possible races here. In fact, the
* task might have changed its scheduling policy to something
* different from SCHED_DEADLINE or changed its reservation
- * parameters (through sched_setscheduler()).
+ * parameters (through sched_setattr()).
*/
if (!dl_task(p) || dl_se->dl_new)
goto unlock;
@@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
if (p->on_rq) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
@@ -740,7 +741,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
- inc_nr_running(rq_of_dl_rq(dl_rq));
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -754,7 +755,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- dec_nr_running(rq_of_dl_rq(dl_rq));
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq)
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
- * new scheduling parameters (thanks to dl_new=1).
+ * new scheduling parameters (thanks to dl_yielded=1).
*/
if (p->dl.runtime > 0) {
- rq->curr->dl.dl_new = 1;
+ rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
update_curr_dl(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd969c28..c9617b73bcc0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
}
+static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
+ long src_load, long dst_load,
+ struct task_numa_env *env)
+{
+ long imb, old_imb;
+
+ /* We care about the slope of the imbalance, not the direction. */
+ if (dst_load < src_load)
+ swap(dst_load, src_load);
+
+ /* Is the difference below the threshold? */
+ imb = dst_load * 100 - src_load * env->imbalance_pct;
+ if (imb <= 0)
+ return false;
+
+ /*
+ * The imbalance is above the allowed threshold.
+ * Compare it with the old imbalance.
+ */
+ if (orig_dst_load < orig_src_load)
+ swap(orig_dst_load, orig_src_load);
+
+ old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+
+ /* Would this change make things worse? */
+ return (old_imb > imb);
+}
+
/*
* This checks if the overall compute and NUMA accesses of the system would
* be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
- long dst_load, src_load;
+ long orig_src_load, src_load;
+ long orig_dst_load, dst_load;
long load;
long imp = (groupimp > 0) ? groupimp : taskimp;
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
* In the overloaded case, try and keep the load balanced.
*/
balance:
- dst_load = env->dst_stats.load;
- src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
+ orig_src_load = env->src_stats.load;
/* XXX missing power terms */
load = task_h_load(env->p);
- dst_load += load;
- src_load -= load;
+ dst_load = orig_dst_load + load;
+ src_load = orig_src_load - load;
if (cur) {
load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
src_load += load;
}
- /* make src_load the smaller */
- if (dst_load < src_load)
- swap(dst_load, src_load);
-
- if (src_load * env->imbalance_pct < dst_load * 100)
+ if (load_too_imbalanced(orig_src_load, orig_dst_load,
+ src_load, dst_load, env))
goto unlock;
assign:
@@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)
if (env.best_cpu == -1)
return -EAGAIN;
- sched_setnuma(p, env.dst_nid);
+ /*
+ * If the task is part of a workload that spans multiple NUMA nodes,
+ * and is migrating into one of the workload's active nodes, remember
+ * this node as the task's preferred numa node, so the workload can
+ * settle down.
+ * A task that migrated to a second choice node will be better off
+ * trying for a better one later. Do not set the preferred node here.
+ */
+ if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+ sched_setnuma(p, env.dst_nid);
/*
* Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
/* Attempt to migrate a task to a CPU on the preferred node. */
static void numa_migrate_preferred(struct task_struct *p)
{
+ unsigned long interval = HZ;
+
/* This task has no NUMA fault statistics yet */
if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
return;
/* Periodically retry migrating the task to the preferred node */
- p->numa_migrate_retry = jiffies + HZ;
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */
if (task_node(p) == p->numa_preferred_nid)
@@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
+ int local = !!(flags & TNF_FAULT_LOCAL);
int priv;
if (!numabalancing_enabled)
@@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
task_numa_group(p, last_cpupid, flags, &priv);
}
+ /*
+ * If a workload spans multiple NUMA nodes, a shared fault that
+ * occurs wholly within the set of nodes that the workload is
+ * actively using should be counted as local. This allows the
+ * scan rate to slow down when a workload has settled down.
+ */
+ if (!priv && !local && p->numa_group &&
+ node_isset(cpu_node, p->numa_group->active_nodes) &&
+ node_isset(mem_node, p->numa_group->active_nodes))
+ local = 1;
+
task_numa_placement(p);
/*
@@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
- p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
+ p->numa_faults_locality[local] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running -= task_delta;
+ sub_nr_running(rq, task_delta);
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
if (!se)
- rq->nr_running += task_delta;
+ add_nr_running(rq, task_delta);
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se) {
update_rq_runnable_avg(rq, rq->nr_running);
- inc_nr_running(rq);
+ add_nr_running(rq, 1);
}
hrtick_update(rq);
}
@@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
if (!se) {
- dec_nr_running(rq);
+ sub_nr_running(rq, 1);
update_rq_runnable_avg(rq, 1);
}
hrtick_update(rq);
@@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
* about the loss.
*/
if (jiffies > current->wakee_flip_decay_ts + HZ) {
- current->wakee_flips = 0;
+ current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
@@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
sd = tmp;
}
- if (affine_sd) {
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
+ if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ prev_cpu = cpu;
+ if (sd_flag & SD_BALANCE_WAKE) {
new_cpu = select_idle_sibling(p, prev_cpu);
goto unlock;
}
@@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
atomic_long_add(se->avg.load_avg_contrib,
&cfs_rq->removed_load);
}
+
+ /* We have migrated, no longer consider this task hot */
+ se->exec_start = 0;
}
#endif /* CONFIG_SMP */
@@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
/* Returns true if the destination node has incurred more faults */
static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
- /* Always encourage migration to the preferred node. */
- if (dst_nid == p->numa_preferred_nid)
- return true;
+ if (numa_group) {
+ /* Task is already in the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return false;
- /* If both task and group weight improve, this move is a winner. */
- if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
- group_weight(p, dst_nid) > group_weight(p, src_nid))
+ /* Task is moving into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) > group_faults(p, src_nid);
+ }
+
+ /* Encourage migration to the preferred node. */
+ if (dst_nid == p->numa_preferred_nid)
return true;
- return false;
+ return task_faults(p, dst_nid) > task_faults(p, src_nid);
}
static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
+ struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (src_nid == dst_nid)
return false;
+ if (numa_group) {
+ /* Task is moving within/into the group's interleave set. */
+ if (node_isset(dst_nid, numa_group->active_nodes))
+ return false;
+
+ /* Task is moving out of the group's interleave set. */
+ if (node_isset(src_nid, numa_group->active_nodes))
+ return true;
+
+ return group_faults(p, dst_nid) < group_faults(p, src_nid);
+ }
+
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid)
return true;
- /* If either task or group weight get worse, don't do it. */
- if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
- group_weight(p, dst_nid) < group_weight(p, src_nid))
- return true;
-
- return false;
+ return task_faults(p, dst_nid) < task_faults(p, src_nid);
}
#else
@@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available, age_stamp, avg;
+ s64 delta;
/*
* Since we're reading these variables without serialization make sure
@@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)
age_stamp = ACCESS_ONCE(rq->age_stamp);
avg = ACCESS_ONCE(rq->rt_avg);
- total = sched_avg_period() + (rq_clock(rq) - age_stamp);
+ delta = rq_clock(rq) - age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ total = sched_avg_period() + delta;
if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
@@ -6640,27 +6714,62 @@ out:
return ld_moved;
}
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+ unsigned long interval = sd->balance_interval;
+
+ if (cpu_busy)
+ interval *= sd->busy_factor;
+
+ /* scale ms to jiffies */
+ interval = msecs_to_jiffies(interval);
+ interval = clamp(interval, 1UL, max_load_balance_interval);
+
+ return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+ unsigned long interval, next;
+
+ interval = get_sd_balance_interval(sd, cpu_busy);
+ next = sd->last_balance + interval;
+
+ if (time_after(*next_balance, next))
+ *next_balance = next;
+}
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
static int idle_balance(struct rq *this_rq)
{
+ unsigned long next_balance = jiffies + HZ;
+ int this_cpu = this_rq->cpu;
struct sched_domain *sd;
int pulled_task = 0;
- unsigned long next_balance = jiffies + HZ;
u64 curr_cost = 0;
- int this_cpu = this_rq->cpu;
idle_enter_fair(this_rq);
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost)
+ if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+ rcu_read_lock();
+ sd = rcu_dereference_check_sched_domain(this_rq->sd);
+ if (sd)
+ update_next_balance(sd, 0, &next_balance);
+ rcu_read_unlock();
+
goto out;
+ }
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6670,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
- unsigned long interval;
int continue_balancing = 1;
u64 t0, domain_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+ if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+ update_next_balance(sd, 0, &next_balance);
break;
+ }
if (sd->flags & SD_BALANCE_NEWIDLE) {
t0 = sched_clock_cpu(this_cpu);
- /* If we've pulled tasks over stop searching: */
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE,
&continue_balancing);
@@ -6695,42 +6804,37 @@ static int idle_balance(struct rq *this_rq)
curr_cost += domain_cost;
}
- interval = msecs_to_jiffies(sd->balance_interval);
- if (time_after(next_balance, sd->last_balance + interval))
- next_balance = sd->last_balance + interval;
- if (pulled_task)
+ update_next_balance(sd, 0, &next_balance);
+
+ /*
+ * Stop searching for tasks to pull if there are
+ * now runnable tasks on this rq.
+ */
+ if (pulled_task || this_rq->nr_running > 0)
break;
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
/*
- * While browsing the domains, we released the rq lock.
- * A task could have be enqueued in the meantime
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task) {
+ if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
- goto out;
- }
- if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
- /*
- * We are going idle. next_balance may be set based on
- * a busy processor. So reset next_balance.
- */
+out:
+ /* Move the next balance forward */
+ if (time_after(this_rq->next_balance, next_balance))
this_rq->next_balance = next_balance;
- }
-
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-out:
/* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
- ((this_rq->stop && this_rq->stop->on_rq) ||
- this_rq->dl.dl_nr_running ||
- (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
if (pulled_task) {
@@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = sd->balance_interval;
- if (idle != CPU_IDLE)
- interval *= sd->busy_factor;
-
- /* scale ms to jiffies */
- interval = msecs_to_jiffies(interval);
- interval = clamp(interval, 1UL, max_load_balance_interval);
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
need_serialize = sd->flags & SD_SERIALIZE;
-
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
@@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
+ interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
}
if (need_serialize)
spin_unlock(&balancing);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..25b9423abce9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void)
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
- * return non-zero on failure
*/
-static int cpuidle_idle_call(void)
+static void cpuidle_idle_call(void)
{
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
- int next_state, entered_state, ret;
+ int next_state, entered_state;
bool broadcast;
/*
* Check if the idle task must be rescheduled. If it is the
- * case, exit the function after re-enabling the local irq and
- * set again the polling flag
+ * case, exit the function after re-enabling the local irq.
*/
- if (current_clr_polling_and_test()) {
+ if (need_resched()) {
local_irq_enable();
- __current_set_polling();
- return 0;
+ return;
}
/*
@@ -101,96 +98,79 @@ static int cpuidle_idle_call(void)
rcu_idle_enter();
/*
- * Check if the cpuidle framework is ready, otherwise fallback
- * to the default arch specific idle method
+ * Ask the cpuidle framework to choose a convenient idle state.
+ * Fall back to the default arch idle method on errors.
*/
- ret = cpuidle_enabled(drv, dev);
-
- if (!ret) {
+ next_state = cpuidle_select(drv, dev);
+ if (next_state < 0) {
+use_default:
/*
- * Ask the governor to choose an idle state it thinks
- * it is convenient to go to. There is *always* a
- * convenient idle state
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
*/
- next_state = cpuidle_select(drv, dev);
-
- /*
- * The idle task must be scheduled, it is pointless to
- * go to idle, just update no idle residency and get
- * out of this function
- */
- if (current_clr_polling_and_test()) {
- dev->last_residency = 0;
- entered_state = next_state;
+ if (current_clr_polling_and_test())
local_irq_enable();
- } else {
- broadcast = !!(drv->states[next_state].flags &
- CPUIDLE_FLAG_TIMER_STOP);
-
- if (broadcast)
- /*
- * Tell the time framework to switch
- * to a broadcast timer because our
- * local timer will be shutdown. If a
- * local timer is used from another
- * cpu as a broadcast timer, this call
- * may fail if it is not available
- */
- ret = clockevents_notify(
- CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
- &dev->cpu);
-
- if (!ret) {
- trace_cpu_idle_rcuidle(next_state, dev->cpu);
-
- /*
- * Enter the idle state previously
- * returned by the governor
- * decision. This function will block
- * until an interrupt occurs and will
- * take care of re-enabling the local
- * interrupts
- */
- entered_state = cpuidle_enter(drv, dev,
- next_state);
-
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
- dev->cpu);
-
- if (broadcast)
- clockevents_notify(
- CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
- &dev->cpu);
-
- /*
- * Give the governor an opportunity to reflect on the
- * outcome
- */
- cpuidle_reflect(dev, entered_state);
- }
- }
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
}
+
/*
- * We can't use the cpuidle framework, let's use the default
- * idle routine
+ * The idle task must be scheduled, it is pointless to
+ * go to idle, just update no idle residency and get
+ * out of this function
*/
- if (ret)
- arch_cpu_idle();
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ entered_state = next_state;
+ local_irq_enable();
+ goto exit_idle;
+ }
+
+ broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
+ /*
+ * Tell the time framework to switch to a broadcast timer
+ * because our local timer will be shutdown. If a local timer
+ * is used from another cpu as a broadcast timer, this call may
+ * fail if it is not available
+ */
+ if (broadcast &&
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
+ goto use_default;
+
+ trace_cpu_idle_rcuidle(next_state, dev->cpu);
+
+ /*
+ * Enter the idle state previously returned by the governor decision.
+ * This function will block until an interrupt occurs and will take
+ * care of re-enabling the local interrupts
+ */
+ entered_state = cpuidle_enter(drv, dev, next_state);
+
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+
+ if (broadcast)
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+
+ /*
+ * Give the governor an opportunity to reflect on the outcome
+ */
+ cpuidle_reflect(dev, entered_state);
+
+exit_idle:
__current_set_polling();
/*
- * It is up to the idle functions to enable back the local
- * interrupt
+ * It is up to the idle functions to reenable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
rcu_idle_exit();
start_critical_timings();
-
- return 0;
}
/*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd2267ad404f..0ebfd7a29472 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#endif
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return rt_se->rt_rq;
}
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->rt_rq;
+
+ return rt_rq->rq;
+}
+
void free_rt_sched_group(struct task_group *tg)
{
int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
return container_of(rt_rq, struct rq, rt);
}
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
struct task_struct *p = rt_task_of(rt_se);
- struct rq *rq = task_rq(p);
+
+ return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
return &rq->rt;
}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
}
#endif /* CONFIG_SMP */
+static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (rt_rq->rt_nr_running) {
- if (rt_se && !on_rt_rq(rt_se))
+ if (!rt_se)
+ enqueue_top_rt_rq(rt_rq);
+ else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
+
if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
- if (rt_se && on_rt_rq(rt_se))
+ if (!rt_se)
+ dequeue_top_rt_rq(rt_rq);
+ else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- if (rt_rq->rt_nr_running)
- resched_task(rq_of_rt_rq(rt_rq)->curr);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (!rt_rq->rt_nr_running)
+ return;
+
+ enqueue_top_rt_rq(rt_rq);
+ resched_task(rq->curr);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
+ dequeue_top_rt_rq(rt_rq);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
}
static inline const struct cpumask *sched_rt_period_mask(void)
@@ -922,6 +961,38 @@ static void update_curr_rt(struct rq *rq)
}
}
+static void
+dequeue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (!rt_rq->rt_queued)
+ return;
+
+ BUG_ON(!rq->nr_running);
+
+ sub_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 0;
+}
+
+static void
+enqueue_top_rt_rq(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ BUG_ON(&rq->rt != rt_rq);
+
+ if (rt_rq->rt_queued)
+ return;
+ if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+ return;
+
+ add_nr_running(rq, rt_rq->rt_nr_running);
+ rt_rq->rt_queued = 1;
+}
+
#if defined CONFIG_SMP
static void
@@ -1045,12 +1116,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
static inline
+unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+ if (group_rq)
+ return group_rq->rt_nr_running;
+ else
+ return 1;
+}
+
+static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
- rt_rq->rt_nr_running++;
+ rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1144,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
- rt_rq->rt_nr_running--;
+ rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1201,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
back = rt_se;
}
+ dequeue_top_rt_rq(rt_rq_of_se(back));
+
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
@@ -1127,13 +1211,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head);
+ enqueue_top_rt_rq(&rq->rt);
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1231,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);
}
+ enqueue_top_rt_rq(&rq->rt);
}
/*
@@ -1159,8 +1249,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
-
- inc_nr_running(rq);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1259,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
-
- dec_nr_running(rq);
}
/*
@@ -1377,10 +1463,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);
- if (!rt_rq->rt_nr_running)
- return NULL;
-
- if (rt_rq_throttled(rt_rq))
+ if (!rt_rq->rt_queued)
return NULL;
put_prev_task(rq, prev);
@@ -1892,9 +1975,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->rt.overloaded && push_rt_task(rq) &&
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
/* Don't resched if we changed runqueues */
- rq != task_rq(p))
+ push_rt_task(rq) && rq != task_rq(p))
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456e492a3dca..600e2291a75c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,8 @@ struct rt_rq {
int overloaded;
struct plist_head pushable_tasks;
#endif
+ int rt_queued;
+
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
#endif
};
-#ifdef CONFIG_RT_GROUP_SCHED
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
-}
-#else
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled;
-}
-#endif
-
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
@@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
extern void init_task_runnable_average(struct task_struct *p);
-static inline void inc_nr_running(struct rq *rq)
+static inline void add_nr_running(struct rq *rq, unsigned count)
{
- rq->nr_running++;
+ unsigned prev_nr = rq->nr_running;
+
+ rq->nr_running = prev_nr + count;
#ifdef CONFIG_NO_HZ_FULL
- if (rq->nr_running == 2) {
+ if (prev_nr < 2 && rq->nr_running >= 2) {
if (tick_nohz_full_cpu(rq->cpu)) {
/* Order rq->nr_running write against the IPI */
smp_wmb();
@@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq)
#endif
}
-static inline void dec_nr_running(struct rq *rq)
+static inline void sub_nr_running(struct rq *rq, unsigned count)
{
- rq->nr_running--;
+ rq->nr_running -= count;
}
static inline void rq_last_tick_reset(struct rq *rq)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65dde541..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- inc_nr_running(rq);
+ add_nr_running(rq, 1);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
- dec_nr_running(rq);
+ sub_nr_running(rq, 1);
}
static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 7d50f794e248..0ffa20ae657b 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit);
*
* In order for this to function properly, as it uses waitqueue_active()
* internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_clear_bit(), but in some
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
* cases where bitflags are manipulated non-atomically under a lock, one
* may need to use a less regular barrier, such fs/inode.c's smp_mb(),
* because spin_unlock() does not guarantee a memory barrier.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 92f24f5e8d52..5918d227730f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -232,7 +232,6 @@ asmlinkage __visible void __do_softirq(void)
bool in_hardirq;
__u32 pending;
int softirq_bit;
- int cpu;
/*
* Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -247,7 +246,6 @@ asmlinkage __visible void __do_softirq(void)
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
- cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
@@ -276,11 +274,11 @@ restart:
prev_count, preempt_count());
preempt_count_set(prev_count);
}
- rcu_bh_qs(cpu);
h++;
pending >>= softirq_bit;
}
+ rcu_bh_qs(smp_processor_id());
local_irq_disable();
pending = local_softirq_pending();
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29401ea..66a751ebf9d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
else
p = current;
if (p) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
else
pgrp = task_pgrp(current);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
do_each_thread(g, p) {
if (uid_eq(task_uid(p), uid)) {
- niceval = 20 - task_nice(p);
+ niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 74f5b580fe34..bc966a8ffc3e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -643,7 +643,7 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
-
+#ifdef CONFIG_UEVENT_HELPER
{
.procname = "hotplug",
.data = &uevent_helper,
@@ -651,7 +651,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dostring,
},
-
+#endif
#ifdef CONFIG_CHR_DEV_SG
{
.procname = "sg-big-buff",
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 419a52cecd20..c8780cdaf852 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq)
static inline int is_error_status(int status)
{
- return (time_status & (STA_UNSYNC|STA_CLOCKERR))
+ return (status & (STA_UNSYNC|STA_CLOCKERR))
/* PPS signal lost when either PPS time or
* PPS frequency synchronization requested
*/
- || ((time_status & (STA_PPSFREQ|STA_PPSTIME))
- && !(time_status & STA_PPSSIGNAL))
+ || ((status & (STA_PPSFREQ|STA_PPSTIME))
+ && !(status & STA_PPSSIGNAL))
/* PPS jitter exceeded when
* PPS time synchronization requested */
- || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+ || ((status & (STA_PPSTIME|STA_PPSJITTER))
== (STA_PPSTIME|STA_PPSJITTER))
/* PPS wander exceeded or calibration error when
* PPS frequency synchronization requested
*/
- || ((time_status & STA_PPSFREQ)
- && (time_status & (STA_PPSWANDER|STA_PPSERROR)));
+ || ((status & STA_PPSFREQ)
+ && (status & (STA_PPSWANDER|STA_PPSERROR)));
}
static inline void pps_fill_timex(struct timex *txc)
@@ -923,7 +923,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
static int __init ntp_tick_adj_setup(char *str)
{
- ntp_tick_adj = simple_strtol(str, NULL, 0);
+ int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
+
+ if (rc)
+ return rc;
ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 4d23dc4d8139..445106d2c729 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void)
return (u64)(jiffies - INITIAL_JIFFIES);
}
-static u32 __read_mostly (*read_sched_clock_32)(void);
-
-static u64 notrace read_sched_clock_32_wrapper(void)
-{
- return read_sched_clock_32();
-}
-
static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
@@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
pr_debug("Registered %pF as sched_clock source\n", read);
}
-void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
-{
- read_sched_clock_32 = read;
- sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
-}
-
void __init sched_clock_postinit(void)
{
/*
diff --git a/kernel/torture.c b/kernel/torture.c
index acc9afc2f26e..40bb511cca48 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void)
shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
if (shuffle_idle_cpu >= nr_cpu_ids)
shuffle_idle_cpu = -1;
- if (shuffle_idle_cpu != -1) {
+ else
cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
- if (cpumask_empty(shuffle_tmp_mask)) {
- put_online_cpus();
- return;
- }
- }
mutex_lock(&shuffle_task_mutex);
list_for_each_entry(stp, &shuffle_task_list, st_l)
@@ -533,7 +528,11 @@ void stutter_wait(const char *title)
while (ACCESS_ONCE(stutter_pause_test) ||
(torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
if (stutter_pause_test)
- schedule_timeout_interruptible(1);
+ if (ACCESS_ONCE(stutter_pause_test) == 1)
+ schedule_timeout_interruptible(1);
+ else
+ while (ACCESS_ONCE(stutter_pause_test))
+ cond_resched();
else
schedule_timeout_interruptible(round_jiffies_relative(HZ));
torture_shutdown_absorb(title);
@@ -550,7 +549,11 @@ static int torture_stutter(void *arg)
VERBOSE_TOROUT_STRING("torture_stutter task started");
do {
if (!torture_must_stop()) {
- schedule_timeout_interruptible(stutter);
+ if (stutter > 1) {
+ schedule_timeout_interruptible(stutter - 1);
+ ACCESS_ONCE(stutter_pause_test) = 2;
+ }
+ schedule_timeout_interruptible(1);
ACCESS_ONCE(stutter_pause_test) = 1;
}
if (!torture_must_stop())
@@ -596,21 +599,27 @@ static void torture_stutter_cleanup(void)
* The runnable parameter points to a flag that controls whether or not
* the test is currently runnable. If there is no such flag, pass in NULL.
*/
-void __init torture_init_begin(char *ttype, bool v, int *runnable)
+bool torture_init_begin(char *ttype, bool v, int *runnable)
{
mutex_lock(&fullstop_mutex);
+ if (torture_type != NULL) {
+ pr_alert("torture_init_begin: refusing %s init: %s running",
+ ttype, torture_type);
+ mutex_unlock(&fullstop_mutex);
+ return false;
+ }
torture_type = ttype;
verbose = v;
torture_runnable = runnable;
fullstop = FULLSTOP_DONTSTOP;
-
+ return true;
}
EXPORT_SYMBOL_GPL(torture_init_begin);
/*
* Tell the torture module that initialization is complete.
*/
-void __init torture_init_end(void)
+void torture_init_end(void)
{
mutex_unlock(&fullstop_mutex);
register_reboot_notifier(&torture_shutdown_nb);
@@ -642,6 +651,9 @@ bool torture_cleanup(void)
torture_shuffle_cleanup();
torture_stutter_cleanup();
torture_onoff_cleanup();
+ mutex_lock(&fullstop_mutex);
+ torture_type = NULL;
+ mutex_unlock(&fullstop_mutex);
return false;
}
EXPORT_SYMBOL_GPL(torture_cleanup);
@@ -674,8 +686,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq);
*/
void torture_kthread_stopping(char *title)
{
- if (verbose)
- VERBOSE_TOROUT_STRING(title);
+ char buf[128];
+
+ snprintf(buf, sizeof(buf), "Stopping %s", title);
+ VERBOSE_TOROUT_STRING(buf);
while (!kthread_should_stop()) {
torture_shutdown_absorb(title);
schedule_timeout_uninterruptible(1);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af30bd1..a4bab46cd38e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -100,10 +100,10 @@ enum {
/*
* Rescue workers are used only on emergencies and shared by
- * all cpus. Give -20.
+ * all cpus. Give MIN_NICE.
*/
- RESCUER_NICE_LEVEL = -20,
- HIGHPRI_NICE_LEVEL = -20,
+ RESCUER_NICE_LEVEL = MIN_NICE,
+ HIGHPRI_NICE_LEVEL = MIN_NICE,
WQ_NAME_LEN = 24,
};
@@ -1916,6 +1916,12 @@ static void send_mayday(struct work_struct *work)
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+ /*
+ * If @pwq is for an unbound wq, its base ref may be put at
+ * any time due to an attribute change. Pin @pwq until the
+ * rescuer is done with it.
+ */
+ get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
@@ -2398,6 +2404,7 @@ static int rescuer_thread(void *__rescuer)
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
+ bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2409,11 +2416,15 @@ static int rescuer_thread(void *__rescuer)
repeat:
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- rescuer->task->flags &= ~PF_WQ_WORKER;
- return 0;
- }
+ /*
+ * By the time the rescuer is requested to stop, the workqueue
+ * shouldn't have any work pending, but @wq->maydays may still have
+ * pwq(s) queued. This can happen by non-rescuer workers consuming
+ * all the work items before the rescuer got to them. Go through
+ * @wq->maydays processing before acting on should_stop so that the
+ * list is always empty on exit.
+ */
+ should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
spin_lock_irq(&wq_mayday_lock);
@@ -2445,6 +2456,12 @@ repeat:
process_scheduled_works(rescuer);
/*
+ * Put the reference grabbed by send_mayday(). @pool won't
+ * go away while we're holding its lock.
+ */
+ put_pwq(pwq);
+
+ /*
* Leave this pool. If keep_working() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
@@ -2459,6 +2476,12 @@ repeat:
spin_unlock_irq(&wq_mayday_lock);
+ if (should_stop) {
+ __set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
@@ -4100,7 +4123,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
if (!pwq) {
pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
- goto out_unlock;
+ mutex_lock(&wq->mutex);
+ goto use_dfl_pwq;
}
/*