From 3b9d6da67e11ca8f78fde887918983523a36b0fa Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 8 Apr 2016 14:40:15 +0200 Subject: cpu/hotplug: Fix rollback during error-out in __cpu_disable() The recent introduction of the hotplug thread which invokes the callbacks on the plugged cpu, cased the following regression: If takedown_cpu() fails, then we run into several issues: 1) The rollback of the target cpu states is not invoked. That leaves the smp threads and the hotplug thread in disabled state. 2) notify_online() is executed due to a missing skip_onerr flag. That causes that both CPU_DOWN_FAILED and CPU_ONLINE notifications are invoked which confuses quite some notifiers. 3) The CPU_DOWN_FAILED notification is not invoked on the target CPU. That's not an issue per se, but it is inconsistent and in consequence blocks the patches which rely on these states being invoked on the target CPU and not on the controlling cpu. It also does not preserve the strict call order on rollback which is problematic for the ongoing state machine conversion as well. To fix this we add a rollback flag to the remote callback machinery and invoke the rollback including the CPU_DOWN_FAILED notification on the remote cpu. Further mark the notify online state with 'skip_onerr' so we don't get a double invokation. This workaround will go away once we moved the unplug invocation to the target cpu itself. [ tglx: Massaged changelog and moved the CPU_DOWN_FAILED notifiaction to the target cpu ] Fixes: 4cb28ced23c4 ("cpu/hotplug: Create hotplug threads") Reported-by: Heiko Carstens Signed-off-by: Sebastian Andrzej Siewior Cc: linux-s390@vger.kernel.org Cc: rt@linutronix.de Cc: Martin Schwidefsky Cc: Anna-Maria Gleixner Link: http://lkml.kernel.org/r/20160408124015.GA21960@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel/cpu.c') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ea42e8da861..3e3f6e49eabb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -36,6 +36,7 @@ * @target: The target state * @thread: Pointer to the hotplug thread * @should_run: Thread should execute + * @rollback: Perform a rollback * @cb_stat: The state for a single callback (install/uninstall) * @cb: Single callback function (install/uninstall) * @result: Result of the operation @@ -47,6 +48,7 @@ struct cpuhp_cpu_state { #ifdef CONFIG_SMP struct task_struct *thread; bool should_run; + bool rollback; enum cpuhp_state cb_state; int (*cb)(unsigned int cpu); int result; @@ -301,6 +303,11 @@ static int cpu_notify(unsigned long val, unsigned int cpu) return __cpu_notify(val, cpu, -1, NULL); } +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + /* Notifier wrappers for transitioning to state machine */ static int notify_prepare(unsigned int cpu) { @@ -477,6 +484,16 @@ static void cpuhp_thread_fun(unsigned int cpu) } else { ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); } + } else if (st->rollback) { + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + + undo_cpu_down(cpu, st, cpuhp_ap_states); + /* + * This is a momentary workaround to keep the notifier users + * happy. Will go away once we got rid of the notifiers. + */ + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); + st->rollback = false; } else { /* Cannot happen .... */ BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); @@ -636,11 +653,6 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -static void cpu_notify_nofail(unsigned long val, unsigned int cpu) -{ - BUG_ON(cpu_notify(val, cpu)); -} - static int notify_down_prepare(unsigned int cpu) { int err, nr_calls = 0; @@ -721,9 +733,10 @@ static int takedown_cpu(unsigned int cpu) */ err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { - /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED, cpu); + /* CPU refused to die */ irq_unlock_sparse(); + /* Unpark the hotplug thread so we can rollback there */ + kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread); return err; } BUG_ON(cpu_online(cpu)); @@ -832,6 +845,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) { + st->target = prev_state; + st->rollback = true; + cpuhp_kick_ap_work(cpu); + } hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; out: @@ -1249,6 +1267,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .name = "notify:online", .startup = notify_online, .teardown = notify_down_prepare, + .skip_onerr = true, }, #endif /* -- cgit v1.2.3