From 1791f881435fab951939ad700e947b66c062e083 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Wed, 30 Mar 2011 15:24:21 +0200
Subject: posix clocks: Replace mutex with reader/writer semaphore

A dynamic posix clock is protected from asynchronous removal by a mutex.
However, using a mutex has the unwanted effect that a long running clock
operation in one process will unnecessarily block other processes.

For example, one process might call read() to get an external time stamp
coming in at one pulse per second. A second process calling clock_gettime
would have to wait for almost a whole second.

This patch fixes the issue by using a reader/writer semaphore instead of
a mutex.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/%3C20110330132421.GA31771%40riccoc20.at.omicron.at%3E
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/posix-clock.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 25028dd4fa18..c340ca658f37 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -19,7 +19,6 @@
  */
 #include <linux/device.h>
 #include <linux/file.h>
-#include <linux/mutex.h>
 #include <linux/posix-clock.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
@@ -34,19 +33,19 @@ static struct posix_clock *get_posix_clock(struct file *fp)
 {
 	struct posix_clock *clk = fp->private_data;
 
-	mutex_lock(&clk->mutex);
+	down_read(&clk->rwsem);
 
 	if (!clk->zombie)
 		return clk;
 
-	mutex_unlock(&clk->mutex);
+	up_read(&clk->rwsem);
 
 	return NULL;
 }
 
 static void put_posix_clock(struct posix_clock *clk)
 {
-	mutex_unlock(&clk->mutex);
+	up_read(&clk->rwsem);
 }
 
 static ssize_t posix_clock_read(struct file *fp, char __user *buf,
@@ -156,7 +155,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
 	struct posix_clock *clk =
 		container_of(inode->i_cdev, struct posix_clock, cdev);
 
-	mutex_lock(&clk->mutex);
+	down_read(&clk->rwsem);
 
 	if (clk->zombie) {
 		err = -ENODEV;
@@ -172,7 +171,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
 		fp->private_data = clk;
 	}
 out:
-	mutex_unlock(&clk->mutex);
+	up_read(&clk->rwsem);
 	return err;
 }
 
@@ -211,25 +210,20 @@ int posix_clock_register(struct posix_clock *clk, dev_t devid)
 	int err;
 
 	kref_init(&clk->kref);
-	mutex_init(&clk->mutex);
+	init_rwsem(&clk->rwsem);
 
 	cdev_init(&clk->cdev, &posix_clock_file_operations);
 	clk->cdev.owner = clk->ops.owner;
 	err = cdev_add(&clk->cdev, devid, 1);
-	if (err)
-		goto no_cdev;
 
 	return err;
-no_cdev:
-	mutex_destroy(&clk->mutex);
-	return err;
 }
 EXPORT_SYMBOL_GPL(posix_clock_register);
 
 static void delete_clock(struct kref *kref)
 {
 	struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
-	mutex_destroy(&clk->mutex);
+
 	if (clk->release)
 		clk->release(clk);
 }
@@ -238,9 +232,9 @@ void posix_clock_unregister(struct posix_clock *clk)
 {
 	cdev_del(&clk->cdev);
 
-	mutex_lock(&clk->mutex);
+	down_write(&clk->rwsem);
 	clk->zombie = true;
-	mutex_unlock(&clk->mutex);
+	up_write(&clk->rwsem);
 
 	kref_put(&clk->kref, delete_clock);
 }
-- 
cgit v1.2.3


From 2ca6f62f595c01f689b269db6736de5544da7667 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 18 Apr 2011 23:58:59 +0200
Subject: PM: Fix error code paths executed after failing syscore_suspend()

If syscore_suspend() fails in suspend_enter(), create_image() or
resume_target_kernel(), it is necessary to call sysdev_resume(),
because sysdev_suspend() has been called already and succeeded
and we are going to abort the transition.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/power/hibernate.c | 10 ++++++++--
 kernel/power/suspend.c   |  5 ++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aeabd26e3342..50aae660174d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -273,8 +273,11 @@ static int create_image(int platform_mode)
 	local_irq_disable();
 
 	error = sysdev_suspend(PMSG_FREEZE);
-	if (!error)
+	if (!error) {
 		error = syscore_suspend();
+		if (error)
+			sysdev_resume();
+	}
 	if (error) {
 		printk(KERN_ERR "PM: Some system devices failed to power down, "
 			"aborting hibernation\n");
@@ -407,8 +410,11 @@ static int resume_target_kernel(bool platform_mode)
 	local_irq_disable();
 
 	error = sysdev_suspend(PMSG_QUIESCE);
-	if (!error)
+	if (!error) {
 		error = syscore_suspend();
+		if (error)
+			sysdev_resume();
+	}
 	if (error)
 		goto Enable_irqs;
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 2814c32aed51..8935369d503a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -164,8 +164,11 @@ static int suspend_enter(suspend_state_t state)
 	BUG_ON(!irqs_disabled());
 
 	error = sysdev_suspend(PMSG_SUSPEND);
-	if (!error)
+	if (!error) {
 		error = syscore_suspend();
+		if (error)
+			sysdev_resume();
+	}
 	if (!error) {
 		if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
 			error = suspend_ops->enter(state);
-- 
cgit v1.2.3


From 19234c0819da0e043a02710488dfd9b242b42eba Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 20 Apr 2011 00:36:11 +0200
Subject: PM: Add missing syscore_suspend() and syscore_resume() calls

Device suspend/resume infrastructure is used not only by the suspend
and hibernate code in kernel/power, but also by APM, Xen and the
kexec jump feature.  However, commit 40dc166cb5dddbd36aa4ad11c03915ea
(PM / Core: Introduce struct syscore_ops for core subsystems PM)
failed to add syscore_suspend() and syscore_resume() calls to that
code, which generally leads to breakage when the features in question
are used.

To fix this problem, add the missing syscore_suspend() and
syscore_resume() calls to arch/x86/kernel/apm_32.c, kernel/kexec.c
and drivers/xen/manage.c.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
---
 kernel/kexec.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 55936f9cb251..87b77de03dd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
 #include <linux/vmalloc.h>
 #include <linux/swap.h>
 #include <linux/kmsg_dump.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1532,6 +1533,11 @@ int kernel_kexec(void)
 		local_irq_disable();
 		/* Suspend system devices */
 		error = sysdev_suspend(PMSG_FREEZE);
+		if (!error) {
+			error = syscore_suspend();
+			if (error)
+				sysdev_resume();
+		}
 		if (error)
 			goto Enable_irqs;
 	} else
@@ -1546,6 +1552,7 @@ int kernel_kexec(void)
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
+		syscore_resume();
 		sysdev_resume();
  Enable_irqs:
 		local_irq_enable();
-- 
cgit v1.2.3


From d20ac252821ab9780ddf00b95629547d3cebc857 Mon Sep 17 00:00:00 2001
From: Michal Simek <monstr@monstr.eu>
Date: Mon, 4 Apr 2011 11:20:12 +0200
Subject: ftrace: Build without frame pointers on Microblaze

Microblaze doesn't need/support FRAME_POINTERS in order to have a working
function tracer.

The patch remove Kconfig warning.

Warning log:
warning: (LOCKDEP && FAULT_INJECTION_STACKTRACE_FILTER && LATENCYTOP &&
FUNCTION_TRACER && KMEMCHECK) selects FRAME_POINTER which has unmet direct
dependencies (DEBUG_KERNEL && (CRIS || M68K || FRV || UML || AVR32 ||
SUPERH || BLACKFIN || MN10300) || ARCH_WANT_FRAME_POINTERS)

Signed-off-by: Michal Simek <monstr@monstr.eu>
Link: http://lkml.kernel.org/r/1301908812-8119-2-git-send-email-monstr@monstr.eu
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61d7d59f4a1a..2ad39e556cb4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
 config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
 	depends on HAVE_FUNCTION_TRACER
-	select FRAME_POINTER if !ARM_UNWIND && !S390
+	select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
 	select KALLSYMS
 	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
-- 
cgit v1.2.3


From bf26c018490c2fce7fe9b629083b96ce0e6ad019 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 7 Apr 2011 16:53:20 +0200
Subject: ptrace: Prepare to fix racy accesses on task breakpoints

When a task is traced and is in a stopped state, the tracer
may execute a ptrace request to examine the tracee state and
get its task struct. Right after, the tracee can be killed
and thus its breakpoints released.
This can happen concurrently when the tracer is in the middle
of reading or modifying these breakpoints, leading to dereferencing
a freed pointer.

Hence, to prepare the fix, create a generic breakpoint reference
holding API. When a reference on the breakpoints of a task is
held, the breakpoints won't be released until the last reference
is dropped. After that, no more ptrace request on the task's
breakpoints can be serviced for the tracer.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: v2.6.33.. <stable@kernel.org>
Link: http://lkml.kernel.org/r/1302284067-7860-2-git-send-email-fweisbec@gmail.com
---
 kernel/exit.c   |  2 +-
 kernel/ptrace.c | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index f5d2f63bae0b..8dd874181542 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1016,7 +1016,7 @@ NORET_TYPE void do_exit(long code)
 	/*
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
-	flush_ptrace_hw_breakpoint(tsk);
+	ptrace_put_breakpoints(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0fc1eed28d27..dc7ab65f3b36 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
 #include <linux/regset.h>
+#include <linux/hw_breakpoint.h>
 
 
 /*
@@ -879,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 	return ret;
 }
 #endif	/* CONFIG_COMPAT */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+int ptrace_get_breakpoints(struct task_struct *tsk)
+{
+	if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
+		return 0;
+
+	return -1;
+}
+
+void ptrace_put_breakpoints(struct task_struct *tsk)
+{
+	if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
+		flush_ptrace_hw_breakpoint(tsk);
+}
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
-- 
cgit v1.2.3


From 1409f141ac719b994d2832911b1e9ec928943fc2 Mon Sep 17 00:00:00 2001
From: Hillf Danton <dhillf@gmail.com>
Date: Wed, 27 Apr 2011 15:26:55 -0700
Subject: kernel/watchdog.c: disable nmi perf event in the error path of
 enabling watchdog

In corner cases where softlockup watchdog is not setup successfully, the
relevant nmi perf event for hardlockup watchdog could be disabled, then
the status of the underlying hardware remains unchanged.

Also, if the kthread doesn't start then the hrtimer won't run and the
hardlockup detector will falsely fire.

Signed-off-by: Hillf Danton <dhillf@gmail.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 140dce750450..14733d4d156b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -430,9 +430,12 @@ static int watchdog_enable(int cpu)
 		p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
 		if (IS_ERR(p)) {
 			printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
-			if (!err)
+			if (!err) {
 				/* if hardlockup hasn't already set this */
 				err = PTR_ERR(p);
+				/* and disable the perf event */
+				watchdog_nmi_disable(cpu);
+			}
 			goto out;
 		}
 		kthread_bind(p, cpu);
-- 
cgit v1.2.3


From ce31332d3c77532d6ea97ddcb475a2b02dd358b4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Apr 2011 00:02:00 +0200
Subject: hrtimer: Initialize CLOCK_ID to HRTIMER_BASE table statically
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sedat and Bruno reported RCU stalls which turned out to be caused by
the following;

sched_init() calls init_rt_bandwidth() which calls hrtimer_init()
_BEFORE_ hrtimers_init() is called. While not entirely correct this
worked because hrtimer_init() only accessed statically initialized
data (hrtimer_bases.clock_base[CLOCK_MONOTONIC])

Commit e06383db9 (hrtimers: extend hrtimer base code to handle more
then 2 clockids) added an indirection to the hrtimer_bases.clock_base
lookup to avoid gap handling in the hot path. The table which is used
for the translataion from CLOCK_ID to HRTIMER_BASE index is
initialized at runtime in hrtimers_init(). So the early call of the
scheduler code translates CLOCK_MONOTONIC to HRTIMER_BASE_REALTIME.

Thus the rt_bandwith timer ends up on CLOCK_REALTIME. If the timer is
armed and the wall clock time is set (e.g. ntpdate in the early boot
process - which also gives the problem deterministic behaviour
i.e. magic recovery after N hours), then the timer ends up with an
expiry time far into the future. That breaks the RT throttler
mechanism as rt runtime is accumulated and never cleared, so the rt
throttler detects a false cpu hog condition and blocks all RT tasks
until the timer finally expires. That in turn stalls the RCU thread of
TINYRCU which leads to an huge amount of RCU callbacks piling up.

Make the translation table statically initialized, so we are back to
the status of <= 2.6.39.

Reported-and-tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Reported-by: Bruno Prémont <bonbons@linux-vserver.org>
Cc: John stultz <johnstul@us.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/%3Calpine.LFD.2.02.1104282353140.3005%40ionos%3E
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9017478c5d4c..87fdb3f8db14 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -81,7 +81,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 	}
 };
 
-static int hrtimer_clock_to_base_table[MAX_CLOCKS];
+static int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
+	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
+	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
+};
 
 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 {
@@ -1722,10 +1726,6 @@ static struct notifier_block __cpuinitdata hrtimers_nb = {
 
 void __init hrtimers_init(void)
 {
-	hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME;
-	hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
-	hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME;
-
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
-- 
cgit v1.2.3


From 5035b20fa5cd146b66f5f89619c20a4177fb736d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Apr 2011 18:08:37 +0200
Subject: workqueue: fix deadlock in worker_maybe_bind_and_lock()

If a rescuer and stop_machine() bringing down a CPU race with each
other, they may deadlock on non-preemptive kernel.  The CPU won't
accept a new task, so the rescuer can't migrate to the target CPU,
while stop_machine() can't proceed because the rescuer is holding one
of the CPU retrying migration.  GCWQ_DISASSOCIATED is never cleared
and worker_maybe_bind_and_lock() retries indefinitely.

This problem can be reproduced semi reliably while the system is
entering suspend.

 http://thread.gmane.org/gmane.linux.kernel/1122051

A lot of kudos to Thilo-Alexander for reporting this tricky issue and
painstaking testing.

stable: This affects all kernels with cmwq, so all kernels since and
        including v2.6.36 need this fix.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thilo-Alexander Ginkel <thilo@ginkel.com>
Tested-by: Thilo-Alexander Ginkel <thilo@ginkel.com>
Cc: stable@kernel.org
---
 kernel/workqueue.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 04ef830690ec..e3378e8d3a5c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1291,8 +1291,14 @@ __acquires(&gcwq->lock)
 			return true;
 		spin_unlock_irq(&gcwq->lock);
 
-		/* CPU has come up inbetween, retry migration */
+		/*
+		 * We've raced with CPU hot[un]plug.  Give it a breather
+		 * and retry migration.  cond_resched() is required here;
+		 * otherwise, we might deadlock against cpu_stop trying to
+		 * bring down the CPU on non-preemptive kernel.
+		 */
 		cpu_relax();
+		cond_resched();
 	}
 }
 
-- 
cgit v1.2.3


From 94b2c363dcf732a4edab4ed66041cb36e7f28fbf Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sat, 30 Apr 2011 22:56:20 +0200
Subject: genirq: Fix typo CONFIG_GENIRC_IRQ_SHOW_LEVEL

commit ab7798ffcf98b11a9525cf65bacdae3fd58d357f ("genirq: Expand generic
show_interrupts()") added the Kconfig option GENERIC_IRQ_SHOW_LEVEL to
accomodate PowerPC, but this doesn't actually enable the functionality due
to a typo in the #ifdef check.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Linux/PPC Development <linuxppc-dev@lists.ozlabs.org>
Link: http://lkml.kernel.org/r/%3Calpine.DEB.2.00.1104302251370.19068%40ayla.of.borg%3E
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index dd201bd35103..834899f2500f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -419,7 +419,7 @@ int show_interrupts(struct seq_file *p, void *v)
 	} else {
 		seq_printf(p, " %8s", "None");
 	}
-#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL
+#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
 	seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
 #endif
 	if (desc->name)
-- 
cgit v1.2.3


From e05b2efb82596905ebfe88e8612ee81dec9b6592 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Wed, 4 May 2011 18:16:50 -0700
Subject: clocksource: Install completely before selecting

Christian Hoffmann reported that the command line clocksource override
with acpi_pm timer fails:

 Kernel command line: <SNIP> clocksource=acpi_pm
 hpet clockevent registered
 Switching to clocksource hpet
 Override clocksource acpi_pm is not HRT compatible.
 Cannot switch while in HRT/NOHZ mode.

The watchdog code is what enables CLOCK_SOURCE_VALID_FOR_HRES, but we
actually end up selecting the clocksource before we enqueue it into
the watchdog list, so that's why we see the warning and fail to switch
to acpi_pm timer as requested. That's particularly bad when we want to
debug timekeeping related problems in early boot.

Put the selection call last.

Reported-by: Christian Hoffmann <email@christianhoffmann.info>
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: stable@kernel.org # 32...
Link: http://lkml.kernel.org/r/%3C1304558210.2943.24.camel%40work-vm%3E
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6519cf62d9cd..0e17c10f8a9d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 	/* Add clocksource to the clcoksource list */
 	mutex_lock(&clocksource_mutex);
 	clocksource_enqueue(cs);
-	clocksource_select();
 	clocksource_enqueue_watchdog(cs);
+	clocksource_select();
 	mutex_unlock(&clocksource_mutex);
 	return 0;
 }
@@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs)
 
 	mutex_lock(&clocksource_mutex);
 	clocksource_enqueue(cs);
-	clocksource_select();
 	clocksource_enqueue_watchdog(cs);
+	clocksource_select();
 	mutex_unlock(&clocksource_mutex);
 	return 0;
 }
-- 
cgit v1.2.3


From a3a4a5acd3bd2f6f1e102e1f1b9d2e2bb320a7fd Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 5 May 2011 23:55:18 -0400
Subject: Regression: partial revert "tracing: Remove lock_depth from event
 entry"

This partially reverts commit e6e1e2593592a8f6f6380496655d8c6f67431266.

That commit changed the structure layout of the trace structure, which
in turn broke PowerTOP (1.9x generation) quite badly.

I appreciate not wanting to expose the variable in question, and
PowerTOP was not using it, so I've replaced the variable with just a
padding field - that way if in the future a new field is needed it can
just use this padding field.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/trace.c        | 1 +
 kernel/trace/trace_events.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d38c16a06a6f..1cb49be7c7fb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1110,6 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->padding			= 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e88f74fe1d4c..2fe110341359 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,6 +116,7 @@ static int trace_define_common_fields(void)
 	__common_field(unsigned char, flags);
 	__common_field(unsigned char, preempt_count);
 	__common_field(int, pid);
+	__common_field(int, padding);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 87186475a402391a1ca7d42a675c9b35a18dc348 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 10 May 2011 21:09:53 +0200
Subject: PM: Fix warning in pm_restrict_gfp_mask() during SNAPSHOT_S2RAM ioctl

A warning is printed by pm_restrict_gfp_mask() while the
SNAPSHOT_S2RAM ioctl is being executed after creating a hibernation
image, because pm_restrict_gfp_mask() has been called once already
before the image creation and suspend_devices_and_enter() calls it
once again.  This happens after commit 452aa6999e6703ffbddd7f6ea124d3
(mm/pm: force GFP_NOIO during suspend/hibernation and resume).

To avoid this issue, move pm_restrict_gfp_mask() and
pm_restore_gfp_mask() from suspend_devices_and_enter() to its caller
in kernel/power/suspend.c.

Reported-by: Alexandre Felipe Muller de Souza <alexandrefm@mandriva.com.br>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: stable@kernel.org
---
 kernel/power/suspend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8935369d503a..6275970b2189 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -216,7 +216,6 @@ int suspend_devices_and_enter(suspend_state_t state)
 			goto Close;
 	}
 	suspend_console();
-	pm_restrict_gfp_mask();
 	suspend_test_start();
 	error = dpm_suspend_start(PMSG_SUSPEND);
 	if (error) {
@@ -233,7 +232,6 @@ int suspend_devices_and_enter(suspend_state_t state)
 	suspend_test_start();
 	dpm_resume_end(PMSG_RESUME);
 	suspend_test_finish("resume devices");
-	pm_restore_gfp_mask();
 	resume_console();
  Close:
 	if (suspend_ops->end)
@@ -294,7 +292,9 @@ int enter_state(suspend_state_t state)
 		goto Finish;
 
 	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+	pm_restrict_gfp_mask();
 	error = suspend_devices_and_enter(state);
+	pm_restore_gfp_mask();
 
  Finish:
 	pr_debug("PM: Finishing wakeup.\n");
-- 
cgit v1.2.3


From 9744997a8a2280e67984d4bffd87221d24f3b6b1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 10 May 2011 21:10:01 +0200
Subject: PM / Hibernate: Make snapshot_release() restore GFP mask

If the process using the hibernate user space interface closes
/dev/snapshot after creating a hibernation image without thawing
tasks, snapshot_release() should call pm_restore_gfp_mask() to
restore the GFP mask used before the creation of the image.  Make
that happen.

Tested-by: Alexandre Felipe Muller de Souza <alexandrefm@mandriva.com.br>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: stable@kernel.org
---
 kernel/power/user.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index c36c3b9e8a84..6522be913ac1 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
 	free_basic_memory_bitmaps();
 	data = filp->private_data;
 	free_all_swap_pages(data->swap);
-	if (data->frozen)
+	if (data->frozen) {
+		pm_restore_gfp_mask();
 		thaw_processes();
+	}
 	pm_notifier_call_chain(data->mode == O_RDONLY ?
 			PM_POST_HIBERNATION : PM_POST_RESTORE);
 	atomic_inc(&snapshot_device_available);
-- 
cgit v1.2.3


From 36cb7035ea0c11ef2c7fa2bbe0cd181b23569b29 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 10 May 2011 21:10:13 +0200
Subject: PM / Hibernate: Fix ioctl SNAPSHOT_S2RAM

The SNAPSHOT_S2RAM ioctl used for implementing the feature allowing
one to suspend to RAM after creating a hibernation image is currently
broken, because it doesn't clear the "ready" flag in the struct
snapshot_data object handled by it.  As a result, the
SNAPSHOT_UNFREEZE doesn't work correctly after SNAPSHOT_S2RAM has
returned and the user space hibernate task cannot thaw the other
processes as appropriate.  Make SNAPSHOT_S2RAM clear data->ready
to fix this problem.

Tested-by: Alexandre Felipe Muller de Souza <alexandrefm@mandriva.com.br>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: stable@kernel.org
---
 kernel/power/user.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6522be913ac1..7d02d33be699 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -381,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		 * PM_HIBERNATION_PREPARE
 		 */
 		error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+		data->ready = 0;
 		break;
 
 	case SNAPSHOT_PLATFORM_SUPPORT:
-- 
cgit v1.2.3


From 47a150edc2ae734c0f4bf50aa19499e23b9a46f8 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
Date: Fri, 13 May 2011 04:27:54 +0100
Subject: Cache user_ns in struct cred

If !CONFIG_USERNS, have current_user_ns() defined to (&init_user_ns).

Get rid of _current_user_ns.  This requires nsown_capable() to be
defined in capability.c rather than as static inline in capability.h,
so do that.

Request_key needs init_user_ns defined at current_user_ns if
!CONFIG_USERNS, so forward-declare that in cred.h if !CONFIG_USERNS
at current_user_ns() define.

Compile-tested with and without CONFIG_USERNS.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
[ This makes a huge performance difference for acl_permission_check(),
  up to 30%.  And that is one of the hottest kernel functions for loads
  that are pathname-lookup heavy.  ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/capability.c | 12 ++++++++++++
 kernel/cred.c       | 12 ++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index bf0c734d0c12..32a80e08ff4b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -399,3 +399,15 @@ bool task_ns_capable(struct task_struct *t, int cap)
 	return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
 }
 EXPORT_SYMBOL(task_ns_capable);
+
+/**
+ * nsown_capable - Check superior capability to one's own user_ns
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given superior capability
+ * targeted at its own user namespace.
+ */
+bool nsown_capable(int cap)
+{
+	return ns_capable(current_user_ns(), cap);
+}
diff --git a/kernel/cred.c b/kernel/cred.c
index 5557b55048df..8093c16b84b1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -54,6 +54,7 @@ struct cred init_cred = {
 	.cap_effective		= CAP_INIT_EFF_SET,
 	.cap_bset		= CAP_INIT_BSET,
 	.user			= INIT_USER,
+	.user_ns		= &init_user_ns,
 	.group_info		= &init_groups,
 #ifdef CONFIG_KEYS
 	.tgcred			= &init_tgcred,
@@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 			goto error_put;
 	}
 
+	/* cache user_ns in cred.  Doesn't need a refcount because it will
+	 * stay pinned by cred->user
+	 */
+	new->user_ns = new->user->user_ns;
+
 #ifdef CONFIG_KEYS
 	/* new threads get their own thread keyrings if their parent already
 	 * had one */
@@ -741,12 +747,6 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 }
 EXPORT_SYMBOL(set_create_files_as);
 
-struct user_namespace *current_user_ns(void)
-{
-	return _current_user_ns();
-}
-EXPORT_SYMBOL(current_user_ns);
-
 #ifdef CONFIG_DEBUG_CREDENTIALS
 
 bool creds_are_invalid(const struct cred *cred)
-- 
cgit v1.2.3


From 07f4beb0b5bbfaf36a64aa00d59e670ec578a95a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 16 May 2011 11:07:48 +0200
Subject: tick: Clear broadcast active bit when switching to oneshot

The first cpu which switches from periodic to oneshot mode switches
also the broadcast device into oneshot mode. The broadcast device
serves as a backup for per cpu timers which stop in deeper
C-states. To avoid starvation of the cpus which might be in idle and
depend on broadcast mode it marks the other cpus as broadcast active
and sets the brodcast expiry value of those cpus to the next tick.

The oneshot mode broadcast bit for the other cpus is sticky and gets
only cleared when those cpus exit idle. If a cpu was not idle while
the bit got set in consequence the bit prevents that the broadcast
device is armed on behalf of that cpu when it enters idle for the
first time after it switched to oneshot mode.

In most cases that goes unnoticed as one of the other cpus has usually
a timer pending which keeps the broadcast device armed with a short
timeout. Now if the only cpu which has a short timer active has the
bit set then the broadcast device will not be armed on behalf of that
cpu and will fire way after the expected timer expiry. In the case of
Christians bug report it took ~145 seconds which is about half of the
wrap around time of HPET (the limit for that device) due to the fact
that all other cpus had no timers armed which expired before the 145
seconds timeframe.

The solution is simply to clear the broadcast active bit
unconditionally when a cpu switches to oneshot mode after the first
cpu switched the broadcast device over. It's not idle at that point
otherwise it would not be executing that code.

[ I fundamentally hate that broadcast crap. Why the heck thought some
  folks that when going into deep idle it's a brilliant concept to
  switch off the last device which brings the cpu back from that
  state? ]

Thanks to Christian for providing all the valuable debug information!

Reported-and-tested-by: Christian Hoffmann <email@christianhoffmann.info>
Cc: John Stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/%3Calpine.LFD.2.02.1105161105170.3078%40ionos%3E
Cc: stable@kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index da800ffa810c..723c7637e55a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -522,10 +522,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
  */
 void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
+	int cpu = smp_processor_id();
+
 	/* Set it up only once ! */
 	if (bc->event_handler != tick_handle_oneshot_broadcast) {
 		int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
-		int cpu = smp_processor_id();
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -551,6 +552,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 			tick_broadcast_set_event(tick_next_period, 1);
 		} else
 			bc->next_event.tv64 = KTIME_MAX;
+	} else {
+		/*
+		 * The first cpu which switches to oneshot mode sets
+		 * the bit for all other cpus which are in the general
+		 * (periodic) broadcast mask. So the bit is set and
+		 * would prevent the first broadcast enter after this
+		 * to program the bc device.
+		 */
+		tick_broadcast_clear_oneshot(cpu);
 	}
 }
 
-- 
cgit v1.2.3