From ee9c578527a93c66becb526c4a122c5358a959c5 Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sun, 20 Apr 2008 13:59:33 +0100
Subject: dyntick: Remove last reminants of dyntick support

Remove the last reminants of dyntick support from the generic kernel.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 kernel/hrtimer.c |  2 +-
 kernel/sysctl.c  | 12 ------------
 kernel/timer.c   | 10 +---------
 3 files changed, 2 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..543d9ca9b4f4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1078,7 +1078,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 
-#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
+#ifdef CONFIG_NO_HZ
 /**
  * hrtimer_get_next_event - get the time until next expiry event
  *
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d7ffdc59816a..76426d930077 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -131,8 +131,6 @@ extern int sysctl_userprocess_debug;
 extern int spin_retry;
 #endif
 
-extern int sysctl_hz_timer;
-
 #ifdef CONFIG_BSD_PROCESS_ACCT
 extern int acct_parm[];
 #endif
@@ -561,16 +559,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#endif
-#ifdef CONFIG_NO_IDLE_HZ
-	{
-		.ctl_name       = KERN_HZ_TIMER,
-		.procname       = "hz_timer",
-		.data           = &sysctl_hz_timer,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
-	},
 #endif
 	{
 		.ctl_name	= KERN_S390_USER_DEBUG_LOGGING,
diff --git a/kernel/timer.c b/kernel/timer.c
index ceacc6626572..ef3fa6906e8f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -812,7 +812,7 @@ static inline void __run_timers(struct tvec_base *base)
 	spin_unlock_irq(&base->lock);
 }
 
-#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ)
+#ifdef CONFIG_NO_HZ
 /*
  * Find out when the next timer event is due to happen. This
  * is used on S/390 to stop all activity when a cpus is idle.
@@ -947,14 +947,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 
 	return cmp_next_hrtimer_event(now, expires);
 }
-
-#ifdef CONFIG_NO_IDLE_HZ
-unsigned long next_timer_interrupt(void)
-{
-	return get_next_timer_interrupt(jiffies);
-}
-#endif
-
 #endif
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-- 
cgit v1.2.3


From 493d35863dbb692c38c1415fd83d88dfb902ae37 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Wed, 14 May 2008 16:22:58 -0700
Subject: mutex-debug: check mutex magic before owner

Currently, the mutex debug code checks the lock->owner before lock->magic, so
a corrupt mutex will most likely result in failing the owner check, rather
than the magic check.

This change to debug_mutex_unlock does the magic check first, so
we have a better idea of what breaks.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex-debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 3aaa06c561de..1d94160eb532 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -79,8 +79,8 @@ void debug_mutex_unlock(struct mutex *lock)
 	if (unlikely(!debug_locks))
 		return;
 
-	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
 	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 }
-- 
cgit v1.2.3


From 8b09dee67f484e9b42114b1a1f068e080fd7aa56 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcupreempt: remove duplicate prototypes

rcu_batches_completed and rcu_patches_completed_bh are both declared
in rcuclassic.h and rcupreempt.h. This patch removes the extra
prototypes for them from rcupdate.h.

rcu_batches_completed_bh is defined as a static inline in the rcupreempt.h
header file. Trying to export this as EXPORT_SYMBOL_GPL causes linking problems
with the powerpc linker. There's no need to export a static inlined function.

Modules must be compiled with the same type of RCU implementation as the
kernel they are for.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..5e02b7740702 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -217,8 +217,6 @@ long rcu_batches_completed(void)
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
 void __rcu_read_lock(void)
 {
 	int idx;
-- 
cgit v1.2.3


From 4446a36ff8c74ac3b32feb009b651048e129c6af Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcu: add call_rcu_sched()

Fourth cut of patch to provide the call_rcu_sched().  This is again to
synchronize_sched() as call_rcu() is to synchronize_rcu().

Should be fine for experimental and -rt use, but not ready for inclusion.
With some luck, I will be able to tell Andrew to come out of hiding on
the next round.

Passes multi-day rcutorture sessions with concurrent CPU hotplugging.

Fixes since the first version include a bug that could result in
indefinite blocking (spotted by Gautham Shenoy), better resiliency
against CPU-hotplug operations, and other minor fixes.

Fixes since the second version include reworking grace-period detection
to avoid deadlocks that could happen when running concurrently with
CPU hotplug, adding Mathieu's fix to avoid the softlockup messages,
as well as Mathieu's fix to allow use earlier in boot.

Fixes since the third version include a wrong-CPU bug spotted by
Andrew, getting rid of the obsolete synchronize_kernel API that somehow
snuck back in, merging spin_unlock() and local_irq_restore() in a
few places, commenting the code that checks for quiescent states based
on interrupting from user-mode execution or the idle loop, removing
some inline attributes, and some code-style changes.

Known/suspected shortcomings:

o	I still do not entirely trust the sleep/wakeup logic.  Next step
	will be to use a private snapshot of the CPU online mask in
	rcu_sched_grace_period() -- if the CPU wasn't there at the start
	of the grace period, we don't need to hear from it.  And the
	bit about accounting for changes in online CPUs inside of
	rcu_sched_grace_period() is ugly anyway.

o	It might be good for rcu_sched_grace_period() to invoke
	resched_cpu() when a given CPU wasn't responding quickly,
	but resched_cpu() is declared static...

This patch also fixes a long-standing bug in the earlier preemptable-RCU
implementation of synchronize_rcu() that could result in loss of
concurrent external changes to a task's CPU affinity mask.  I still cannot
remember who reported this...

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rcupdate.c   |  20 +--
 kernel/rcupreempt.c | 414 ++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 372 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c09605f8d16c..a4e329d92883 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -39,18 +39,12 @@
 #include <linux/sched.h>
 #include <asm/atomic.h>
 #include <linux/bitops.h>
-#include <linux/completion.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
 
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -60,7 +54,7 @@ static struct completion rcu_barrier_completion;
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
  */
-static void wakeme_after_rcu(struct rcu_head  *head)
+void wakeme_after_rcu(struct rcu_head  *head)
 {
 	struct rcu_synchronize *rcu;
 
@@ -77,17 +71,7 @@ static void wakeme_after_rcu(struct rcu_head  *head)
  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
  * and may be nested.
  */
-void synchronize_rcu(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-
-	/* Wait for it */
-	wait_for_completion(&rcu.completion);
-}
+synchronize_rcu_xxx(synchronize_rcu, call_rcu)
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
 static void rcu_barrier_callback(struct rcu_head *notused)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5e02b7740702..aaa7976bd85f 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -46,6 +46,7 @@
 #include <asm/atomic.h>
 #include <linux/bitops.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <linux/completion.h>
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
@@ -87,9 +88,14 @@ struct rcu_data {
 	struct rcu_head **nexttail;
 	struct rcu_head *waitlist[GP_STAGES];
 	struct rcu_head **waittail[GP_STAGES];
-	struct rcu_head *donelist;
+	struct rcu_head *donelist;	/* from waitlist & waitschedlist */
 	struct rcu_head **donetail;
 	long rcu_flipctr[2];
+	struct rcu_head *nextschedlist;
+	struct rcu_head **nextschedtail;
+	struct rcu_head *waitschedlist;
+	struct rcu_head **waitschedtail;
+	int rcu_sched_sleeping;
 #ifdef CONFIG_RCU_TRACE
 	struct rcupreempt_trace trace;
 #endif /* #ifdef CONFIG_RCU_TRACE */
@@ -131,11 +137,24 @@ enum rcu_try_flip_states {
 	rcu_try_flip_waitmb_state,
 };
 
+/*
+ * States for rcu_ctrlblk.rcu_sched_sleep.
+ */
+
+enum rcu_sched_sleep_states {
+	rcu_sched_not_sleeping,	/* Not sleeping, callbacks need GP.  */
+	rcu_sched_sleep_prep,	/* Thinking of sleeping, rechecking. */
+	rcu_sched_sleeping,	/* Sleeping, awaken if GP needed. */
+};
+
 struct rcu_ctrlblk {
 	spinlock_t	fliplock;	/* Protect state-machine transitions. */
 	long		completed;	/* Number of last completed batch. */
 	enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
 							the rcu state machine */
+	spinlock_t	schedlock;	/* Protect rcu_sched sleep state. */
+	enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
+	wait_queue_head_t sched_wq;	/* Place for rcu_sched to sleep. */
 };
 
 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@ -143,8 +162,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
 	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
 	.completed = 0,
 	.rcu_try_flip_state = rcu_try_flip_idle_state,
+	.schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
+	.sched_sleep = rcu_sched_not_sleeping,
+	.sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
 };
 
+static struct task_struct *rcu_sched_grace_period_task;
 
 #ifdef CONFIG_RCU_TRACE
 static char *rcu_try_flip_state_names[] =
@@ -207,6 +230,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
  */
 #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
 
+#define RCU_SCHED_BATCH_TIME (HZ / 50)
+
 /*
  * Return the number of RCU batches processed thus far.  Useful
  * for debug and statistics.
@@ -411,32 +436,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
 	}
 }
 
-#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+	.dynticks = 1,
+};
 
-DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
-static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+#ifdef CONFIG_NO_HZ
 static DEFINE_PER_CPU(int, rcu_update_flag);
 
 /**
  * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
  *
  * If the CPU was idle with dynamic ticks active, this updates the
- * dynticks_progress_counter to let the RCU handling know that the
+ * rcu_dyntick_sched.dynticks to let the RCU handling know that the
  * CPU is active.
  */
 void rcu_irq_enter(void)
 {
 	int cpu = smp_processor_id();
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
 	if (per_cpu(rcu_update_flag, cpu))
 		per_cpu(rcu_update_flag, cpu)++;
 
 	/*
 	 * Only update if we are coming from a stopped ticks mode
-	 * (dynticks_progress_counter is even).
+	 * (rcu_dyntick_sched.dynticks is even).
 	 */
 	if (!in_interrupt() &&
-	    (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+	    (rdssp->dynticks & 0x1) == 0) {
 		/*
 		 * The following might seem like we could have a race
 		 * with NMI/SMIs. But this really isn't a problem.
@@ -459,12 +486,12 @@ void rcu_irq_enter(void)
 		 * RCU read-side critical sections on this CPU would
 		 * have already completed.
 		 */
-		per_cpu(dynticks_progress_counter, cpu)++;
+		rdssp->dynticks++;
 		/*
 		 * The following memory barrier ensures that any
 		 * rcu_read_lock() primitives in the irq handler
 		 * are seen by other CPUs to follow the above
-		 * increment to dynticks_progress_counter. This is
+		 * increment to rcu_dyntick_sched.dynticks. This is
 		 * required in order for other CPUs to correctly
 		 * determine when it is safe to advance the RCU
 		 * grace-period state machine.
@@ -472,7 +499,7 @@ void rcu_irq_enter(void)
 		smp_mb(); /* see above block comment. */
 		/*
 		 * Since we can't determine the dynamic tick mode from
-		 * the dynticks_progress_counter after this routine,
+		 * the rcu_dyntick_sched.dynticks after this routine,
 		 * we use a second flag to acknowledge that we came
 		 * from an idle state with ticks stopped.
 		 */
@@ -480,7 +507,7 @@ void rcu_irq_enter(void)
 		/*
 		 * If we take an NMI/SMI now, they will also increment
 		 * the rcu_update_flag, and will not update the
-		 * dynticks_progress_counter on exit. That is for
+		 * rcu_dyntick_sched.dynticks on exit. That is for
 		 * this IRQ to do.
 		 */
 	}
@@ -490,12 +517,13 @@ void rcu_irq_enter(void)
  * rcu_irq_exit - Called from exiting Hard irq context.
  *
  * If the CPU was idle with dynamic ticks active, update the
- * dynticks_progress_counter to put let the RCU handling be
+ * rcu_dyntick_sched.dynticks to put let the RCU handling be
  * aware that the CPU is going back to idle with no ticks.
  */
 void rcu_irq_exit(void)
 {
 	int cpu = smp_processor_id();
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
 	/*
 	 * rcu_update_flag is set if we interrupted the CPU
@@ -503,7 +531,7 @@ void rcu_irq_exit(void)
 	 * Once this occurs, we keep track of interrupt nesting
 	 * because a NMI/SMI could also come in, and we still
 	 * only want the IRQ that started the increment of the
-	 * dynticks_progress_counter to be the one that modifies
+	 * rcu_dyntick_sched.dynticks to be the one that modifies
 	 * it on exit.
 	 */
 	if (per_cpu(rcu_update_flag, cpu)) {
@@ -515,28 +543,29 @@ void rcu_irq_exit(void)
 
 		/*
 		 * If an NMI/SMI happens now we are still
-		 * protected by the dynticks_progress_counter being odd.
+		 * protected by the rcu_dyntick_sched.dynticks being odd.
 		 */
 
 		/*
 		 * The following memory barrier ensures that any
 		 * rcu_read_unlock() primitives in the irq handler
 		 * are seen by other CPUs to preceed the following
-		 * increment to dynticks_progress_counter. This
+		 * increment to rcu_dyntick_sched.dynticks. This
 		 * is required in order for other CPUs to determine
 		 * when it is safe to advance the RCU grace-period
 		 * state machine.
 		 */
 		smp_mb(); /* see above block comment. */
-		per_cpu(dynticks_progress_counter, cpu)++;
-		WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+		rdssp->dynticks++;
+		WARN_ON(rdssp->dynticks & 0x1);
 	}
 }
 
 static void dyntick_save_progress_counter(int cpu)
 {
-	per_cpu(rcu_dyntick_snapshot, cpu) =
-		per_cpu(dynticks_progress_counter, cpu);
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->dynticks_snap = rdssp->dynticks;
 }
 
 static inline int
@@ -544,9 +573,10 @@ rcu_try_flip_waitack_needed(int cpu)
 {
 	long curr;
 	long snap;
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
-	curr = per_cpu(dynticks_progress_counter, cpu);
-	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	curr = rdssp->dynticks;
+	snap = rdssp->dynticks_snap;
 	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 
 	/*
@@ -580,9 +610,10 @@ rcu_try_flip_waitmb_needed(int cpu)
 {
 	long curr;
 	long snap;
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
 
-	curr = per_cpu(dynticks_progress_counter, cpu);
-	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	curr = rdssp->dynticks;
+	snap = rdssp->dynticks_snap;
 	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 
 	/*
@@ -609,14 +640,86 @@ rcu_try_flip_waitmb_needed(int cpu)
 	return 1;
 }
 
+static void dyntick_save_progress_counter_sched(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->sched_dynticks_snap = rdssp->dynticks;
+}
+
+static int rcu_qsctr_inc_needed_dyntick(int cpu)
+{
+	long curr;
+	long snap;
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	curr = rdssp->dynticks;
+	snap = rdssp->sched_dynticks_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU remained in dynticks mode for the entire time
+	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
+	 * then it cannot be in the middle of an rcu_read_lock(), so
+	 * the next rcu_read_lock() it executes must use the new value
+	 * of the counter.  Therefore, this CPU has been in a quiescent
+	 * state the entire time, and we don't need to wait for it.
+	 */
+
+	if ((curr == snap) && ((curr & 0x1) == 0))
+		return 0;
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq handlers, then, as above, this CPU has already
+	 * passed through a quiescent state.
+	 */
+
+	if ((curr - snap) > 2 || (snap & 0x1) == 0)
+		return 0;
+
+	/* We need this CPU to go through a quiescent state. */
+
+	return 1;
+}
+
 #else /* !CONFIG_NO_HZ */
 
-# define dyntick_save_progress_counter(cpu)	do { } while (0)
-# define rcu_try_flip_waitack_needed(cpu)	(1)
-# define rcu_try_flip_waitmb_needed(cpu)	(1)
+# define dyntick_save_progress_counter(cpu)		do { } while (0)
+# define rcu_try_flip_waitack_needed(cpu)		(1)
+# define rcu_try_flip_waitmb_needed(cpu)		(1)
+
+# define dyntick_save_progress_counter_sched(cpu)	do { } while (0)
+# define rcu_qsctr_inc_needed_dyntick(cpu)		(1)
 
 #endif /* CONFIG_NO_HZ */
 
+static void save_qsctr_sched(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->sched_qs_snap = rdssp->sched_qs;
+}
+
+static inline int rcu_qsctr_inc_needed(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	/*
+	 * If there has been a quiescent state, no more need to wait
+	 * on this CPU.
+	 */
+
+	if (rdssp->sched_qs != rdssp->sched_qs_snap) {
+		smp_mb(); /* force ordering with cpu entering schedule(). */
+		return 0;
+	}
+
+	/* We need this CPU to go through a quiescent state. */
+
+	return 1;
+}
+
 /*
  * Get here when RCU is idle.  Decide whether we need to
  * move out of idle state, and return non-zero if so.
@@ -819,6 +922,26 @@ void rcu_check_callbacks(int cpu, int user)
 	unsigned long flags;
 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 
+	/*
+	 * If this CPU took its interrupt from user mode or from the
+	 * idle loop, and this is not a nested interrupt, then
+	 * this CPU has to have exited all prior preept-disable
+	 * sections of code.  So increment the counter to note this.
+	 *
+	 * The memory barrier is needed to handle the case where
+	 * writes from a preempt-disable section of code get reordered
+	 * into schedule() by this CPU's write buffer.  So the memory
+	 * barrier makes sure that the rcu_qsctr_inc() is seen by other
+	 * CPUs to happen after any such write.
+	 */
+
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+	     hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+		smp_mb();	/* Guard against aggressive schedule(). */
+	     	rcu_qsctr_inc(cpu);
+	}
+
 	rcu_check_mb(cpu);
 	if (rcu_ctrlblk.completed == rdp->completed)
 		rcu_try_flip();
@@ -869,6 +992,8 @@ void rcu_offline_cpu(int cpu)
 	struct rcu_head *list = NULL;
 	unsigned long flags;
 	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+	struct rcu_head *schedlist = NULL;
+	struct rcu_head **schedtail = &schedlist;
 	struct rcu_head **tail = &list;
 
 	/*
@@ -882,6 +1007,11 @@ void rcu_offline_cpu(int cpu)
 		rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
 						list, tail);
 	rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+	rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
+				schedlist, schedtail);
+	rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
+				schedlist, schedtail);
+	rdp->rcu_sched_sleeping = 0;
 	spin_unlock_irqrestore(&rdp->lock, flags);
 	rdp->waitlistcount = 0;
 
@@ -916,22 +1046,40 @@ void rcu_offline_cpu(int cpu)
 	 * fix.
 	 */
 
-	local_irq_save(flags);
+	local_irq_save(flags);  /* disable preempt till we know what lock. */
 	rdp = RCU_DATA_ME();
 	spin_lock(&rdp->lock);
 	*rdp->nexttail = list;
 	if (list)
 		rdp->nexttail = tail;
+	*rdp->nextschedtail = schedlist;
+	if (schedlist)
+		rdp->nextschedtail = schedtail;
 	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
 void __devinit rcu_online_cpu(int cpu)
 {
 	unsigned long flags;
+	struct rcu_data *rdp;
 
 	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
 	cpu_set(cpu, rcu_cpu_online_map);
 	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+
+	/*
+	 * The rcu_sched grace-period processing might have bypassed
+	 * this CPU, given that it was not in the rcu_cpu_online_map
+	 * when the grace-period scan started.  This means that the
+	 * grace-period task might sleep.  So make sure that if this
+	 * should happen, the first callback posted to this CPU will
+	 * wake up the grace-period task if need be.
+	 */
+
+	rdp = RCU_DATA_CPU(cpu);
+	spin_lock_irqsave(&rdp->lock, flags);
+	rdp->rcu_sched_sleeping = 1;
+	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -986,31 +1134,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 	*rdp->nexttail = head;
 	rdp->nexttail = &head->next;
 	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
-	spin_unlock(&rdp->lock);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+	int wake_gp = 0;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = RCU_DATA_ME();
+	spin_lock(&rdp->lock);
+	*rdp->nextschedtail = head;
+	rdp->nextschedtail = &head->next;
+	if (rdp->rcu_sched_sleeping) {
+
+		/* Grace-period processing might be sleeping... */
+
+		rdp->rcu_sched_sleeping = 0;
+		wake_gp = 1;
+	}
+	spin_unlock_irqrestore(&rdp->lock, flags);
+	if (wake_gp) {
+
+		/* Wake up grace-period processing, unless someone beat us. */
+
+		spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+		if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
+			wake_gp = 0;
+		rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
+		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+		if (wake_gp)
+			wake_up_interruptible(&rcu_ctrlblk.sched_wq);
+	}
+}
+EXPORT_SYMBOL_GPL(call_rcu_sched);
+
 /*
  * Wait until all currently running preempt_disable() code segments
  * (including hardware-irq-disable segments) complete.  Note that
  * in -rt this does -not- necessarily result in all currently executing
  * interrupt -handlers- having completed.
  */
-void __synchronize_sched(void)
+synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+
+/*
+ * kthread function that manages call_rcu_sched grace periods.
+ */
+static int rcu_sched_grace_period(void *arg)
 {
-	cpumask_t oldmask;
+	int couldsleep;		/* might sleep after current pass. */
+	int couldsleepnext = 0; /* might sleep after next pass. */
 	int cpu;
+	unsigned long flags;
+	struct rcu_data *rdp;
+	int ret;
 
-	if (sched_getaffinity(0, &oldmask) < 0)
-		oldmask = cpu_possible_map;
-	for_each_online_cpu(cpu) {
-		sched_setaffinity(0, &cpumask_of_cpu(cpu));
-		schedule();
-	}
-	sched_setaffinity(0, &oldmask);
+	/*
+	 * Each pass through the following loop handles one
+	 * rcu_sched grace period cycle.
+	 */
+	do {
+		/* Save each CPU's current state. */
+
+		for_each_online_cpu(cpu) {
+			dyntick_save_progress_counter_sched(cpu);
+			save_qsctr_sched(cpu);
+		}
+
+		/*
+		 * Sleep for about an RCU grace-period's worth to
+		 * allow better batching and to consume less CPU.
+		 */
+		schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
+
+		/*
+		 * If there was nothing to do last time, prepare to
+		 * sleep at the end of the current grace period cycle.
+		 */
+		couldsleep = couldsleepnext;
+		couldsleepnext = 1;
+		if (couldsleep) {
+			spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+			rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
+			spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+		}
+
+		/*
+		 * Wait on each CPU in turn to have either visited
+		 * a quiescent state or been in dynticks-idle mode.
+		 */
+		for_each_online_cpu(cpu) {
+			while (rcu_qsctr_inc_needed(cpu) &&
+			       rcu_qsctr_inc_needed_dyntick(cpu)) {
+				/* resched_cpu(cpu); @@@ */
+				schedule_timeout_interruptible(1);
+			}
+		}
+
+		/* Advance callbacks for each CPU.  */
+
+		for_each_online_cpu(cpu) {
+
+			rdp = RCU_DATA_CPU(cpu);
+			spin_lock_irqsave(&rdp->lock, flags);
+
+			/*
+			 * We are running on this CPU irq-disabled, so no
+			 * CPU can go offline until we re-enable irqs.
+			 * The current CPU might have already gone
+			 * offline (between the for_each_offline_cpu and
+			 * the spin_lock_irqsave), but in that case all its
+			 * callback lists will be empty, so no harm done.
+			 *
+			 * Advance the callbacks!  We share normal RCU's
+			 * donelist, since callbacks are invoked the
+			 * same way in either case.
+			 */
+			if (rdp->waitschedlist != NULL) {
+				*rdp->donetail = rdp->waitschedlist;
+				rdp->donetail = rdp->waitschedtail;
+
+				/*
+				 * Next rcu_check_callbacks() will
+				 * do the required raise_softirq().
+				 */
+			}
+			if (rdp->nextschedlist != NULL) {
+				rdp->waitschedlist = rdp->nextschedlist;
+				rdp->waitschedtail = rdp->nextschedtail;
+				couldsleep = 0;
+				couldsleepnext = 0;
+			} else {
+				rdp->waitschedlist = NULL;
+				rdp->waitschedtail = &rdp->waitschedlist;
+			}
+			rdp->nextschedlist = NULL;
+			rdp->nextschedtail = &rdp->nextschedlist;
+
+			/* Mark sleep intention. */
+
+			rdp->rcu_sched_sleeping = couldsleep;
+
+			spin_unlock_irqrestore(&rdp->lock, flags);
+		}
+
+		/* If we saw callbacks on the last scan, go deal with them. */
+
+		if (!couldsleep)
+			continue;
+
+		/* Attempt to block... */
+
+		spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
+		if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
+
+			/*
+			 * Someone posted a callback after we scanned.
+			 * Go take care of it.
+			 */
+			spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+			couldsleepnext = 0;
+			continue;
+		}
+
+		/* Block until the next person posts a callback. */
+
+		rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
+		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
+		ret = 0;
+		__wait_event_interruptible(rcu_ctrlblk.sched_wq,
+			rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
+			ret);
+
+		/*
+		 * Signals would prevent us from sleeping, and we cannot
+		 * do much with them in any case.  So flush them.
+		 */
+		if (ret)
+			flush_signals(current);
+		couldsleepnext = 0;
+
+	} while (!kthread_should_stop());
+
+	return (0);
 }
-EXPORT_SYMBOL_GPL(__synchronize_sched);
 
 /*
  * Check to see if any future RCU-related work will need to be done
@@ -1027,7 +1340,9 @@ int rcu_needs_cpu(int cpu)
 
 	return (rdp->donelist != NULL ||
 		!!rdp->waitlistcount ||
-		rdp->nextlist != NULL);
+		rdp->nextlist != NULL ||
+		rdp->nextschedlist != NULL ||
+		rdp->waitschedlist != NULL);
 }
 
 int rcu_pending(int cpu)
@@ -1038,7 +1353,9 @@ int rcu_pending(int cpu)
 
 	if (rdp->donelist != NULL ||
 	    !!rdp->waitlistcount ||
-	    rdp->nextlist != NULL)
+	    rdp->nextlist != NULL ||
+	    rdp->nextschedlist != NULL ||
+	    rdp->waitschedlist != NULL)
 		return 1;
 
 	/* The RCU core needs an acknowledgement from this CPU. */
@@ -1105,6 +1422,11 @@ void __init __rcu_init(void)
 		rdp->donetail = &rdp->donelist;
 		rdp->rcu_flipctr[0] = 0;
 		rdp->rcu_flipctr[1] = 0;
+		rdp->nextschedlist = NULL;
+		rdp->nextschedtail = &rdp->nextschedlist;
+		rdp->waitschedlist = NULL;
+		rdp->waitschedtail = &rdp->waitschedlist;
+		rdp->rcu_sched_sleeping = 0;
 	}
 	register_cpu_notifier(&rcu_nb);
 
@@ -1127,11 +1449,15 @@ void __init __rcu_init(void)
 }
 
 /*
- * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ * Late-boot-time RCU initialization that must wait until after scheduler
+ * has been initialized.
  */
-void synchronize_kernel(void)
+void __init rcu_init_sched(void)
 {
-	synchronize_rcu();
+	rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
+						  NULL,
+						  "rcu_sched_grace_period");
+	WARN_ON(IS_ERR(rcu_sched_grace_period_task));
 }
 
 #ifdef CONFIG_RCU_TRACE
-- 
cgit v1.2.3


From 8db559b83009bed92e1b5dd13a651ff273d9ff62 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcu: add memory barriers and comments to rcu_check_callbacks()

Add comments to the logic that infers quiescent states when interrupting
from either user mode or the idle loop.  Also add a memory barrier: it
appears that James Huang was in fact onto something, as the scheduler
is much less synchronization happy than it once was, so we can no longer
rely on its memory barriers in all cases.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reported-by: James Huang <jamesclhuang@yahoo.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rcuclassic.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..d8348792f9f5 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -502,10 +502,38 @@ void rcu_check_callbacks(int cpu, int user)
 	if (user ||
 	    (idle_cpu(cpu) && !in_softirq() &&
 				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+		/*
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * Also do a memory barrier.  This is needed to handle
+		 * the case where writes from a preempt-disable section
+		 * of code get reordered into schedule() by this CPU's
+		 * write buffer.  The memory barrier makes sure that
+		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
+		 * by other CPUs to happen after any such write.
+		 */
+
+		smp_mb();  /* See above block comment. */
 		rcu_qsctr_inc(cpu);
 		rcu_bh_qsctr_inc(cpu);
-	} else if (!in_softirq())
+
+	} else if (!in_softirq()) {
+
+		/*
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * a rcu_bh read-side critical section.  This is an _bh
+		 * critical section, so count it.  The memory barrier
+		 * is needed for the same reason as is the above one.
+		 */
+
+		smp_mb();  /* See above block comment. */
 		rcu_bh_qsctr_inc(cpu);
+	}
 	raise_rcu_softirq();
 }
 
-- 
cgit v1.2.3


From 70f12f848d3e981479b4f6f751e73c14f7c13e5b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcu: add rcu_barrier_sched() and rcu_barrier_bh()

Add rcu_barrier_sched() and rcu_barrier_bh().  With these in place,
rcutorture no longer gives the occasional oops when repeatedly starting
and stopping torturing rcu_bh.  Also adds the API needed to flush out
pre-existing call_rcu_sched() callbacks.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rcupdate.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a4e329d92883..4a74b8d48d90 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,12 @@
 #include <linux/mutex.h>
 #include <linux/module.h>
 
+enum rcu_barrier {
+	RCU_BARRIER_STD,
+	RCU_BARRIER_BH,
+	RCU_BARRIER_SCHED,
+};
+
 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -83,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused)
 /*
  * Called with preemption disabled, and from cross-cpu IRQ context.
  */
-static void rcu_barrier_func(void *notused)
+static void rcu_barrier_func(void *type)
 {
 	int cpu = smp_processor_id();
 	struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
 
 	atomic_inc(&rcu_barrier_cpu_count);
-	call_rcu(head, rcu_barrier_callback);
+	switch ((enum rcu_barrier)type) {
+	case RCU_BARRIER_STD:
+		call_rcu(head, rcu_barrier_callback);
+		break;
+	case RCU_BARRIER_BH:
+		call_rcu_bh(head, rcu_barrier_callback);
+		break;
+	case RCU_BARRIER_SCHED:
+		call_rcu_sched(head, rcu_barrier_callback);
+		break;
+	}
 }
 
-/**
- * rcu_barrier - Wait until all the in-flight RCUs are complete.
+/*
+ * Orchestrate the specified type of RCU barrier, waiting for all
+ * RCU callbacks of the specified type to complete.
  */
-void rcu_barrier(void)
+static void _rcu_barrier(enum rcu_barrier type)
 {
 	BUG_ON(in_interrupt());
 	/* Take cpucontrol mutex to protect against CPU hotplug */
@@ -111,13 +128,39 @@ void rcu_barrier(void)
 	 * until all the callbacks are queued.
 	 */
 	rcu_read_lock();
-	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+	on_each_cpu(rcu_barrier_func, (void *)type, 0, 1);
 	rcu_read_unlock();
 	wait_for_completion(&rcu_barrier_completion);
 	mutex_unlock(&rcu_barrier_mutex);
 }
+
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ */
+void rcu_barrier(void)
+{
+	_rcu_barrier(RCU_BARRIER_STD);
+}
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
+/**
+ * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ */
+void rcu_barrier_bh(void)
+{
+	_rcu_barrier(RCU_BARRIER_BH);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+/**
+ * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+ */
+void rcu_barrier_sched(void)
+{
+	_rcu_barrier(RCU_BARRIER_SCHED);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
 void __init rcu_init(void)
 {
 	__rcu_init();
-- 
cgit v1.2.3


From 2326974df29988181b6b69ed6fbf42b17adf916f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcu: add call_rcu_sched() and friends to rcutorture

Add entry to rcu_torture_ops allowing the correct barrier function to
be used upon exit from rcutorture.  Also add torture options for the
new call_rcu_sched() API.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rcutorture.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 33acc424667e..0334b6a8baca 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -192,6 +192,7 @@ struct rcu_torture_ops {
 	int (*completed)(void);
 	void (*deferredfree)(struct rcu_torture *p);
 	void (*sync)(void);
+	void (*cb_barrier)(void);
 	int (*stats)(char *page);
 	char *name;
 };
@@ -265,6 +266,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.completed = rcu_torture_completed,
 	.deferredfree = rcu_torture_deferred_free,
 	.sync = synchronize_rcu,
+	.cb_barrier = rcu_barrier,
 	.stats = NULL,
 	.name = "rcu"
 };
@@ -304,6 +306,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
 	.completed = rcu_torture_completed,
 	.deferredfree = rcu_sync_torture_deferred_free,
 	.sync = synchronize_rcu,
+	.cb_barrier = NULL,
 	.stats = NULL,
 	.name = "rcu_sync"
 };
@@ -364,6 +367,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
 	.completed = rcu_bh_torture_completed,
 	.deferredfree = rcu_bh_torture_deferred_free,
 	.sync = rcu_bh_torture_synchronize,
+	.cb_barrier = rcu_barrier_bh,
 	.stats = NULL,
 	.name = "rcu_bh"
 };
@@ -377,6 +381,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
 	.completed = rcu_bh_torture_completed,
 	.deferredfree = rcu_sync_torture_deferred_free,
 	.sync = rcu_bh_torture_synchronize,
+	.cb_barrier = NULL,
 	.stats = NULL,
 	.name = "rcu_bh_sync"
 };
@@ -458,6 +463,7 @@ static struct rcu_torture_ops srcu_ops = {
 	.completed = srcu_torture_completed,
 	.deferredfree = rcu_sync_torture_deferred_free,
 	.sync = srcu_torture_synchronize,
+	.cb_barrier = NULL,
 	.stats = srcu_torture_stats,
 	.name = "srcu"
 };
@@ -482,6 +488,11 @@ static int sched_torture_completed(void)
 	return 0;
 }
 
+static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
+{
+	call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
+}
+
 static void sched_torture_synchronize(void)
 {
 	synchronize_sched();
@@ -494,12 +505,27 @@ static struct rcu_torture_ops sched_ops = {
 	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
 	.readunlock = sched_torture_read_unlock,
 	.completed = sched_torture_completed,
-	.deferredfree = rcu_sync_torture_deferred_free,
+	.deferredfree = rcu_sched_torture_deferred_free,
 	.sync = sched_torture_synchronize,
+	.cb_barrier = rcu_barrier_sched,
 	.stats = NULL,
 	.name = "sched"
 };
 
+static struct rcu_torture_ops sched_ops_sync = {
+	.init = rcu_sync_torture_init,
+	.cleanup = NULL,
+	.readlock = sched_torture_read_lock,
+	.readdelay = rcu_read_delay,  /* just reuse rcu's version. */
+	.readunlock = sched_torture_read_unlock,
+	.completed = sched_torture_completed,
+	.deferredfree = rcu_sync_torture_deferred_free,
+	.sync = sched_torture_synchronize,
+	.cb_barrier = NULL,
+	.stats = NULL,
+	.name = "sched_sync"
+};
+
 /*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
  * for that pointed to by rcu_torture_current, freeing the old structure
@@ -848,7 +874,9 @@ rcu_torture_cleanup(void)
 	stats_task = NULL;
 
 	/* Wait for all RCU callbacks to fire.  */
-	rcu_barrier();
+
+	if (cur_ops->cb_barrier != NULL)
+		cur_ops->cb_barrier();
 
 	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
 
@@ -868,7 +896,7 @@ rcu_torture_init(void)
 	int firsterr = 0;
 	static struct rcu_torture_ops *torture_ops[] =
 		{ &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
-		  &srcu_ops, &sched_ops, };
+		  &srcu_ops, &sched_ops, &sched_ops_sync, };
 
 	/* Process args and tell the world that the torturer is on the job. */
 	for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
-- 
cgit v1.2.3


From 82524746c27fa418c250a56dd7606b9d3fc79826 Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <fbuihuu@gmail.com>
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcu: split list.h and move rcu-protected lists into rculist.h

Move rcu-protected lists from list.h into a new header file rculist.h.

This is done because list are a very used primitive structure all over the
kernel and it's currently impossible to include other header files in this
list.h without creating some circular dependencies.

For example, list.h implements rcu-protected list and uses rcu_dereference()
without including rcupdate.h.  It actually compiles because users of
rcu_dereference() are macros.  Others RCU functions could be used too but
aren't probably because of this.

Therefore this patch creates rculist.h which includes rcupdates without to
many changes/troubles.

Signed-off-by: Franck Bui-Huu <fbuihuu@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/pid.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 20d59fa2d493..30bd5d4b2ac7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/rculist.h>
 #include <linux/bootmem.h>
 #include <linux/hash.h>
 #include <linux/pid_namespace.h>
-- 
cgit v1.2.3


From d7c0651390b6a03ad53f99faec0ba88109d7191d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 12 May 2008 21:21:06 +0200
Subject: rcu: fix rcu_try_flip_waitack_needed() to prevent grace-period stall

The comment was correct -- need to make the code match the comment.
Without this patch, if a CPU goes dynticks idle (and stays there forever)
in just the right phase of preemptible-RCU grace-period processing,
grace periods stall.  The offending sequence of events (courtesy
of Promela/spin, at least after I got the liveness criterion coded
correctly...) is as follows:

o	CPU 0 is in dynticks-idle mode.  Its dynticks_progress_counter
	is (say) 10.

o	CPU 0 takes an interrupt, so rcu_irq_enter() increments CPU 0's
	dynticks_progress_counter to 11.

o	CPU 1 is doing RCU grace-period processing in rcu_try_flip_idle(),
	sees rcu_pending(), so invokes dyntick_save_progress_counter(),
	which in turn takes a snapshot of CPU 0's dynticks_progress_counter
	into CPU 0's rcu_dyntick_snapshot -- now set to 11.  CPU 1 then
	updates the RCU grace-period state to rcu_try_flip_waitack().

o	CPU 0 returns from its interrupt, so rcu_irq_exit() increments
	CPU 0's dynticks_progress_counter to 12.

o	CPU 1 later invokes rcu_try_flip_waitack(), which notices that
	CPU 0 has not yet responded, and hence in turn invokes
	rcu_try_flip_waitack_needed().  This function examines the
	state of CPU 0's dynticks_progress_counter and rcu_dyntick_snapshot
	variables, which it copies to curr (== 12) and snap (== 11),
	respectively.

	Because curr!=snap, the first condition fails.

	Because curr-snap is only 1 and snap is odd, the second
	condition fails.

	rcu_try_flip_waitack_needed() therefore incorrectly concludes
	that it must wait for CPU 0 to explicitly acknowledge the
	counter flip.

o	CPU 0 remains forever in dynticks-idle mode, never taking
	any more hardware interrupts or any NMIs, and never running
	any more tasks.  (Of course, -something- will usually eventually
	happen, which might be why we haven't seen this one in the
	wild.  Still should be fixed!)

Therefore the grace period never ends.  Fix is to make the code match
the comment, as shown below.  With this fix, the above scenario
would be satisfied with curr being even, and allow the grace period
to proceed.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Josh Triplett <josh@kernel.org>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index aaa7976bd85f..27d0748d8d32 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -597,7 +597,7 @@ rcu_try_flip_waitack_needed(int cpu)
 	 * that this CPU already acknowledged the counter.
 	 */
 
-	if ((curr - snap) > 2 || (snap & 0x1) == 0)
+	if ((curr - snap) > 2 || (curr & 0x1) == 0)
 		return 0;
 
 	/* We need this CPU to explicitly acknowledge the counter flip. */
-- 
cgit v1.2.3


From e19a98967f49cb63352b3db4818983ea2cec24ba Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Wed, 14 May 2008 16:23:00 -0700
Subject: rcu: remove duplicated include in kernel/rcupreempt_trace.c

Removed duplicated include file <linux/rcupdate.h> in kernel/rcupreempt_trace.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt_trace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 49ac4947af24..5edf82c34bbc 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -38,7 +38,6 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-#include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/rcupreempt_trace.h>
-- 
cgit v1.2.3


From 247ab1a80595100bd7ea61d174f8b7b7e5f8d4bb Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Wed, 14 May 2008 16:23:00 -0700
Subject: rcu: remove duplicated include in kernel/rcupreempt.c

Removed duplicated include file <linux/rcupdate.h> in kernel/rcupreempt.c.

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27d0748d8d32..b8e4cdae4e8d 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -51,7 +51,6 @@
 #include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
-#include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/random.h>
 #include <linux/delay.h>
-- 
cgit v1.2.3


From a953e4597abd51b74c99e0e3b7074532a60fd031 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: sched: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c

  * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c,
    where appropriate.  This saves some allocated space as well as many
    wasted cycles going through node entries that are non-existent.

For inclusion into sched-devel/latest tree.

Based on:
	git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
    +   sched-devel/latest  .../mingo/linux-2.6-sched-devel.git

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..1ed8011db826 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6879,9 +6879,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
 
 	min_val = INT_MAX;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
+		n = (node + i) % nr_node_ids;
 
 		if (!nr_cpus_node(n))
 			continue;
@@ -7075,7 +7075,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 		if (!sched_group_nodes)
 			continue;
 
-		for (i = 0; i < MAX_NUMNODES; i++) {
+		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
 			*nodemask = node_to_cpumask(i);
@@ -7263,7 +7263,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
 				    GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7407,7 +7407,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7431,7 +7431,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 					send_covered, tmpmask);
 	}
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Set up node groups */
 		struct sched_group *sg, *prev;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7470,9 +7470,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpus_or(*covered, *covered, *nodemask);
 		prev = sg;
 
-		for (j = 0; j < MAX_NUMNODES; j++) {
+		for (j = 0; j < nr_node_ids; j++) {
 			SCHED_CPUMASK_VAR(notcovered, allmasks);
-			int n = (i + j) % MAX_NUMNODES;
+			int n = (i + j) % nr_node_ids;
 			node_to_cpumask_ptr(pnodemask, n);
 
 			cpus_complement(*notcovered, *covered);
@@ -7525,7 +7525,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++)
+	for (i = 0; i < nr_node_ids; i++)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
 	if (sd_allnodes) {
-- 
cgit v1.2.3


From 363ab6f1424cdea63e5d182312d60e19077b892a Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:13 +0200
Subject: core: use performance variant for_each_cpu_mask_nr

Change references from for_each_cpu_mask to for_each_cpu_mask_nr
where appropriate

Reviewed-by: Paul Jackson <pj@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu.c        |  2 +-
 kernel/rcuclassic.c |  2 +-
 kernel/rcupreempt.c | 10 +++++-----
 kernel/sched.c      | 36 ++++++++++++++++++------------------
 kernel/sched_fair.c |  2 +-
 kernel/sched_rt.c   |  6 +++---
 kernel/taskstats.c  |  4 ++--
 kernel/workqueue.c  |  6 +++---
 8 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..50ae922c6022 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -390,7 +390,7 @@ void __ref enable_nonboot_cpus(void)
 		goto out;
 
 	printk("Enabling non-boot CPUs ...\n");
-	for_each_cpu_mask(cpu, frozen_cpus) {
+	for_each_cpu_mask_nr(cpu, frozen_cpus) {
 		error = _cpu_up(cpu, 1);
 		if (!error) {
 			printk("CPU%d is up\n", cpu);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..251358de70b0 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -92,7 +92,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		 */
 		cpumask = rcp->cpumask;
 		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask(cpu, cpumask)
+		for_each_cpu_mask_nr(cpu, cpumask)
 			smp_send_reschedule(cpu);
 	}
 }
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..18af270125cf 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -657,7 +657,7 @@ rcu_try_flip_idle(void)
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -675,7 +675,7 @@ rcu_try_flip_waitack(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		if (rcu_try_flip_waitack_needed(cpu) &&
 		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -707,7 +707,7 @@ rcu_try_flip_waitzero(void)
 	/* Check to see if the sum of the "last" counters is zero. */
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 	if (sum != 0) {
 		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -722,7 +722,7 @@ rcu_try_flip_waitzero(void)
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 		dyntick_save_progress_counter(cpu);
 	}
@@ -742,7 +742,7 @@ rcu_try_flip_waitmb(void)
 	int cpu;
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
 		if (rcu_try_flip_waitmb_needed(cpu) &&
 		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ed8011db826..814d6e17f1e1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2271,7 +2271,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		/* Tally up the load of all CPUs in the group */
 		avg_load = 0;
 
-		for_each_cpu_mask(i, group->cpumask) {
+		for_each_cpu_mask_nr(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
@@ -2313,7 +2313,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
 	/* Traverse only the allowed CPUs */
 	cpus_and(*tmp, group->cpumask, p->cpus_allowed);
 
-	for_each_cpu_mask(i, *tmp) {
+	for_each_cpu_mask_nr(i, *tmp) {
 		load = weighted_cpuload(i);
 
 		if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -3296,7 +3296,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
-		for_each_cpu_mask(i, group->cpumask) {
+		for_each_cpu_mask_nr(i, group->cpumask) {
 			struct rq *rq;
 
 			if (!cpu_isset(i, *cpus))
@@ -3560,7 +3560,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 	unsigned long max_load = 0;
 	int i;
 
-	for_each_cpu_mask(i, group->cpumask) {
+	for_each_cpu_mask_nr(i, group->cpumask) {
 		unsigned long wl;
 
 		if (!cpu_isset(i, *cpus))
@@ -4100,7 +4100,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 		int balance_cpu;
 
 		cpu_clear(this_cpu, cpus);
-		for_each_cpu_mask(balance_cpu, cpus) {
+		for_each_cpu_mask_nr(balance_cpu, cpus) {
 			/*
 			 * If this cpu gets work to do, stop the load balancing
 			 * work being done for other cpus. Next load
@@ -6832,7 +6832,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 
 	cpus_clear(*covered);
 
-	for_each_cpu_mask(i, *span) {
+	for_each_cpu_mask_nr(i, *span) {
 		struct sched_group *sg;
 		int group = group_fn(i, cpu_map, &sg, tmpmask);
 		int j;
@@ -6843,7 +6843,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
 		cpus_clear(sg->cpumask);
 		sg->__cpu_power = 0;
 
-		for_each_cpu_mask(j, *span) {
+		for_each_cpu_mask_nr(j, *span) {
 			if (group_fn(j, cpu_map, NULL, tmpmask) != group)
 				continue;
 
@@ -7043,7 +7043,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 	if (!sg)
 		return;
 	do {
-		for_each_cpu_mask(j, sg->cpumask) {
+		for_each_cpu_mask_nr(j, sg->cpumask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j);
@@ -7068,7 +7068,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
 	int cpu, i;
 
-	for_each_cpu_mask(cpu, *cpu_map) {
+	for_each_cpu_mask_nr(cpu, *cpu_map) {
 		struct sched_group **sched_group_nodes
 			= sched_group_nodes_bycpu[cpu];
 
@@ -7302,7 +7302,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = NULL, *p;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 
@@ -7374,7 +7374,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7391,7 +7391,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
 	/* Set up multi-core groups */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		SCHED_CPUMASK_VAR(this_core_map, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7458,7 +7458,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			goto error;
 		}
 		sched_group_nodes[i] = sg;
-		for_each_cpu_mask(j, *nodemask) {
+		for_each_cpu_mask_nr(j, *nodemask) {
 			struct sched_domain *sd;
 
 			sd = &per_cpu(node_domains, j);
@@ -7504,21 +7504,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(cpu_domains, i);
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(core_domains, i);
 
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd = &per_cpu(phys_domains, i);
 
 		init_sched_groups_power(i, sd);
@@ -7538,7 +7538,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Attach the domains */
-	for_each_cpu_mask(i, *cpu_map) {
+	for_each_cpu_mask_nr(i, *cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
@@ -7621,7 +7621,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 
 	unregister_sched_domain_sysctl();
 
-	for_each_cpu_mask(i, *cpu_map)
+	for_each_cpu_mask_nr(i, *cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
 	arch_destroy_sched_domains(cpu_map, &tmpmask);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..e398318f1014 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1022,7 +1022,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
-			for_each_cpu_mask(i, tmp) {
+			for_each_cpu_mask_nr(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
 						schedstat_inc(p,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1c..d73386c6e361 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -231,7 +231,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		return 1;
 
 	span = sched_rt_period_mask();
-	for_each_cpu_mask(i, span) {
+	for_each_cpu_mask_nr(i, span) {
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 		struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -272,7 +272,7 @@ static int balance_runtime(struct rt_rq *rt_rq)
 
 	spin_lock(&rt_b->rt_runtime_lock);
 	rt_period = ktime_to_ns(rt_b->rt_period);
-	for_each_cpu_mask(i, rd->span) {
+	for_each_cpu_mask_nr(i, rd->span) {
 		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 		s64 diff;
 
@@ -1000,7 +1000,7 @@ static int pull_rt_task(struct rq *this_rq)
 
 	next = pick_next_task_rt(this_rq);
 
-	for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
+	for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a23517169a6..06b17547f4e7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 		return -EINVAL;
 
 	if (isadd == REGISTER) {
-		for_each_cpu_mask(cpu, mask) {
+		for_each_cpu_mask_nr(cpu, mask) {
 			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
 					 cpu_to_node(cpu));
 			if (!s)
@@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 
 	/* Deregister or cleanup */
 cleanup:
-	for_each_cpu_mask(cpu, mask) {
+	for_each_cpu_mask_nr(cpu, mask) {
 		listeners = &per_cpu(listener_array, cpu);
 		down_write(&listeners->sem);
 		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29fc39f1029c..28c2b2c96ac7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -397,7 +397,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 	might_sleep();
 	lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
 	lock_release(&wq->lockdep_map, 1, _THIS_IP_);
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -477,7 +477,7 @@ static void wait_on_work(struct work_struct *work)
 	wq = cwq->wq;
 	cpu_map = wq_cpu_map(wq);
 
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
@@ -813,7 +813,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
 	put_online_cpus();
 
-- 
cgit v1.2.3


From cad0e458d17c643c20c1d38f45a1d26125e6a622 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:13 +0200
Subject: clocksource/events: use performance variant for_each_cpu_mask_nr

Change references from for_each_cpu_mask to for_each_cpu_mask_nr
where appropriate

Reviewed-by: Paul Jackson <pj@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c    | 4 ++--
 kernel/time/tick-broadcast.c | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index dadde5361f32..60ceabd53f2e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,9 +145,9 @@ static void clocksource_watchdog(unsigned long data)
 		 * Cycle through CPUs to check if the CPUs stay
 		 * synchronized to each other.
 		 */
-		int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
+		int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
 
-		if (next_cpu >= NR_CPUS)
+		if (next_cpu >= nr_cpu_ids)
 			next_cpu = first_cpu(cpu_online_map);
 		watchdog_timer.expires += WATCHDOG_INTERVAL;
 		add_timer_on(&watchdog_timer, next_cpu);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 57a1f02e5ec0..2d0a96346259 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -397,8 +397,7 @@ again:
 	mask = CPU_MASK_NONE;
 	now = ktime_get();
 	/* Find all expired events */
-	for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
-	     cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
+	for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
 		td = &per_cpu(tick_cpu_device, cpu);
 		if (td->evtdev->next_event.tv64 <= now.tv64)
 			cpu_set(cpu, mask);
-- 
cgit v1.2.3


From bd3bff9e20f454b242d979ec2f9a4dca0d5fa06f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: sched: add latency tracer callbacks to the scheduler

add 3 lightweight callbacks to the tracer backend.

zero impact if tracing is turned off.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..463dcdb36ef8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2467,6 +2467,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
+	ftrace_wake_up_task(p, rq->curr);
 	schedstat_inc(p, se.nr_wakeups);
 	if (sync)
 		schedstat_inc(p, se.nr_wakeups_sync);
@@ -2611,6 +2612,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
+	ftrace_wake_up_new_task(p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2783,6 +2785,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
+	ftrace_ctx_switch(prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
-- 
cgit v1.2.3


From 7c731e0a495e25e79dc1e9e68772a67a55721a65 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: ftrace: make the task state char-string visible to all

The tracer wants to be able to convert the state number
into a user visible character. This patch pulls that conversion
string out the scheduler into the header. This way if it were to
ever change, other parts of the kernel will know.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 463dcdb36ef8..73e600852365 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5729,7 +5729,7 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 
 void sched_show_task(struct task_struct *p)
 {
-- 
cgit v1.2.3


From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: add basic support for gcc profiler instrumentation

If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is
set to a non-zero value the ftrace routine will be called everytime
we enter a kernel function that is not marked with the "notrace"
attribute.

The ftrace routine will then call a registered function if a function
happens to be registered.

[ This code has been highly hacked by Steven Rostedt and Ingo Molnar,
  so don't blame Arnaldo for all of this ;-) ]

Update:
  It is now possible to register more than one ftrace function.
  If only one ftrace function is registered, that will be the
  function that ftrace calls directly. If more than one function
  is registered, then ftrace will call a function that will loop
  through the functions to call.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile       |   1 +
 kernel/trace/Kconfig  |   5 ++
 kernel/trace/Makefile |   3 ++
 kernel/trace/ftrace.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+)
 create mode 100644 kernel/trace/Kconfig
 create mode 100644 kernel/trace/Makefile
 create mode 100644 kernel/trace/ftrace.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..fa05f6d8bdbf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000000..8185c91417bc
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,5 @@
+#
+# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+#
+config HAVE_FTRACE
+	bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000000..bf4fd215a6a9
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_FTRACE) += libftrace.o
+
+libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..b6a80b98a3fb
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,138 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ *   Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/module.h>
+#include <linux/ftrace.h>
+
+static DEFINE_SPINLOCK(ftrace_func_lock);
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+	.func = ftrace_stub,
+};
+
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+
+/* mcount is defined per arch in assembly */
+EXPORT_SYMBOL(mcount);
+
+notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_ops *op = ftrace_list;
+
+	/* in case someone actually ports this to alpha! */
+	read_barrier_depends();
+
+	while (op != &ftrace_list_end) {
+		/* silly alpha */
+		read_barrier_depends();
+		op->func(ip, parent_ip);
+		op = op->next;
+	};
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ftrace_func_lock, flags);
+	ops->next = ftrace_list;
+	/*
+	 * We are entering ops into the ftrace_list but another
+	 * CPU might be walking that list. We need to make sure
+	 * the ops->next pointer is valid before another CPU sees
+	 * the ops pointer included into the ftrace_list.
+	 */
+	smp_wmb();
+	ftrace_list = ops;
+	/*
+	 * For one func, simply call it directly.
+	 * For more than one func, call the chain.
+	 */
+	if (ops->next == &ftrace_list_end)
+		ftrace_trace_function = ops->func;
+	else
+		ftrace_trace_function = ftrace_list_func;
+	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	return 0;
+}
+
+/**
+ * unregister_ftrace_function - unresgister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	unsigned long flags;
+	struct ftrace_ops **p;
+	int ret = 0;
+
+	spin_lock_irqsave(&ftrace_func_lock, flags);
+
+	/*
+	 * If we are the only function, then the ftrace pointer is
+	 * pointing directly to that function.
+	 */
+	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+		ftrace_trace_function = ftrace_stub;
+		ftrace_list = &ftrace_list_end;
+		goto out;
+	}
+
+	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+		if (*p == ops)
+			break;
+
+	if (*p != ops) {
+		ret = -1;
+		goto out;
+	}
+
+	*p = (*p)->next;
+
+	/* If we only have one func left, then call that directly */
+	if (ftrace_list->next == &ftrace_list_end)
+		ftrace_trace_function = ftrace_list->func;
+
+ out:
+	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	return 0;
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing.  There may be lag
+ */
+void clear_ftrace_function(void)
+{
+	ftrace_trace_function = ftrace_stub;
+}
-- 
cgit v1.2.3


From bc0c38d139ec7fcd5c030aea16b008f3732e42ac Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: latency tracer infrastructure

This patch adds the latency tracer infrastructure. This patch
does not add anything that will select and turn it on, but will
be used by later patches.

If it were to be compiled, it would add the following files
to the debugfs:

 The root tracing directory:

  /debugfs/tracing/

This patch also adds the following files:

  available_tracers
     list of available tracers. Currently no tracers are
     available. Looking into this file only shows
     "none" which is used to unregister all tracers.

  current_tracer
     The trace that is currently active. Empty on start up.
     To switch to a tracer simply echo one of the tracers that
     are listed in available_tracers:

   example: (used with later patches)

      echo function > /debugfs/tracing/current_tracer

     To disable the tracer:

       echo disable > /debugfs/tracing/current_tracer

  tracing_enabled
     echoing "1" into this file starts the ftrace function tracing
      (if sysctl kernel.ftrace_enabled=1)
     echoing "0" turns it off.

  latency_trace
      This file is readonly and holds the result of the trace.

  trace
      This file outputs a easier to read version of the trace.

  iter_ctrl
      Controls the way the output of traces look.
      So far there's two controls:
        echoing in "symonly" will only show the kallsyms variables
            without the addresses (if kallsyms was configured)
        echoing in "verbose" will change the output to show
            a lot more data, but not very easy to understand by
            humans.
        echoing in "nosymonly" turns off symonly.
        echoing in "noverbose" turns off verbose.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile       |    1 +
 kernel/trace/Kconfig  |    5 +
 kernel/trace/Makefile |    2 +
 kernel/trace/trace.c  | 1547 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h  |  184 ++++++
 5 files changed, 1739 insertions(+)
 create mode 100644 kernel/trace/trace.c
 create mode 100644 kernel/trace/trace.h

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index fa05f6d8bdbf..7e344e7b0cb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_TRACING) += trace/
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8185c91417bc..ce70677afbf9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -3,3 +3,8 @@
 #
 config HAVE_FTRACE
 	bool
+
+config TRACING
+	bool
+	select DEBUG_FS
+
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bf4fd215a6a9..7af403175255 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,3 +1,5 @@
 obj-$(CONFIG_FTRACE) += libftrace.o
 
+obj-$(CONFIG_TRACING) += trace.o
+
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 000000000000..1b8eca7650d4
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,1547 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally taken from the RT patch by:
+ *    Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/utsrelease.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/linkage.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly	tracing_thresh;
+
+static long notrace
+ns2usecs(cycle_t nsec)
+{
+	nsec += 500;
+	do_div(nsec, 1000);
+	return nsec;
+}
+
+static atomic_t			tracer_counter;
+static struct trace_array	global_trace;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+
+static struct trace_array	max_tr;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+
+static int			tracer_enabled;
+static unsigned long		trace_nr_entries = 4096UL;
+
+static struct tracer		*trace_types __read_mostly;
+static struct tracer		*current_trace __read_mostly;
+static int			max_tracer_type_len;
+
+static DEFINE_MUTEX(trace_types_lock);
+
+static int __init set_nr_entries(char *str)
+{
+	if (!str)
+		return 0;
+	trace_nr_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("trace_entries=", set_nr_entries);
+
+enum trace_type {
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,
+	TRACE_CTX,
+
+	__TRACE_LAST_TYPE
+};
+
+enum trace_flag_type {
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_NEED_RESCHED		= 0x02,
+	TRACE_FLAG_HARDIRQ		= 0x04,
+	TRACE_FLAG_SOFTIRQ		= 0x08,
+};
+
+enum trace_iterator_flags {
+	TRACE_ITER_PRINT_PARENT		= 0x01,
+	TRACE_ITER_SYM_OFFSET		= 0x02,
+	TRACE_ITER_SYM_ADDR		= 0x04,
+	TRACE_ITER_VERBOSE		= 0x08,
+};
+
+#define TRACE_ITER_SYM_MASK \
+	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
+/* These must match the bit postions above */
+static const char *trace_options[] = {
+	"print-parent",
+	"sym-offset",
+	"sym-addr",
+	"verbose",
+	NULL
+};
+
+static unsigned trace_flags;
+
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /debugfs/tracing/latency_trace)
+ */
+static void notrace
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+
+	max_tr.cpu = cpu;
+	max_tr.time_start = data->preempt_timestamp;
+
+	data = max_tr.data[cpu];
+	data->saved_latency = tracing_max_latency;
+
+	memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+	data->pid = tsk->pid;
+	data->uid = tsk->uid;
+	data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+	data->policy = tsk->policy;
+	data->rt_priority = tsk->rt_priority;
+
+	/* record this tasks comm */
+	tracing_record_cmdline(current);
+}
+
+notrace void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data;
+	void *save_trace;
+	int i;
+
+	/* clear out all the previous traces */
+	for_each_possible_cpu(i) {
+		data = tr->data[i];
+		save_trace = max_tr.data[i]->trace;
+		memcpy(max_tr.data[i], data, sizeof(*data));
+		data->trace = save_trace;
+	}
+
+	__update_max_tr(tr, tsk, cpu);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr - tracer
+ * @tsk - task with the latency
+ * @cpu - the cpu of the buffer to copy.
+ */
+notrace void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+	struct trace_array_cpu *data = tr->data[cpu];
+	void *save_trace;
+	int i;
+
+	for_each_possible_cpu(i)
+		tracing_reset(max_tr.data[i]);
+
+	save_trace = max_tr.data[cpu]->trace;
+	memcpy(max_tr.data[cpu], data, sizeof(*data));
+	data->trace = save_trace;
+
+	__update_max_tr(tr, tsk, cpu);
+}
+
+int register_tracer(struct tracer *type)
+{
+	struct tracer *t;
+	int len;
+	int ret = 0;
+
+	if (!type->name) {
+		pr_info("Tracer must have a name\n");
+		return -1;
+	}
+
+	mutex_lock(&trace_types_lock);
+	for (t = trace_types; t; t = t->next) {
+		if (strcmp(type->name, t->name) == 0) {
+			/* already found */
+			pr_info("Trace %s already registered\n",
+				type->name);
+			ret = -1;
+			goto out;
+		}
+	}
+
+	type->next = trace_types;
+	trace_types = type;
+	len = strlen(type->name);
+	if (len > max_tracer_type_len)
+		max_tracer_type_len = len;
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
+void unregister_tracer(struct tracer *type)
+{
+	struct tracer **t;
+	int len;
+
+	mutex_lock(&trace_types_lock);
+	for (t = &trace_types; *t; t = &(*t)->next) {
+		if (*t == type)
+			goto found;
+	}
+	pr_info("Trace %s not registered\n", type->name);
+	goto out;
+
+ found:
+	*t = (*t)->next;
+	if (strlen(type->name) != max_tracer_type_len)
+		goto out;
+
+	max_tracer_type_len = 0;
+	for (t = &trace_types; *t; t = &(*t)->next) {
+		len = strlen((*t)->name);
+		if (len > max_tracer_type_len)
+			max_tracer_type_len = len;
+	}
+ out:
+	mutex_unlock(&trace_types_lock);
+}
+
+void notrace tracing_reset(struct trace_array_cpu *data)
+{
+	data->trace_idx = 0;
+	atomic_set(&data->underrun, 0);
+}
+
+#ifdef CONFIG_FTRACE
+static void notrace
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		ftrace(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+	raw_local_irq_restore(flags);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = function_trace_call,
+};
+#endif
+
+notrace void tracing_start_function_trace(void)
+{
+	register_ftrace_function(&trace_ops);
+}
+
+notrace void tracing_stop_function_trace(void)
+{
+	unregister_ftrace_function(&trace_ops);
+}
+
+#define SAVED_CMDLINES 128
+static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
+static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
+static int cmdline_idx;
+static DEFINE_SPINLOCK(trace_cmdline_lock);
+atomic_t trace_record_cmdline_disabled;
+
+static void trace_init_cmdlines(void)
+{
+	memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
+	memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+	cmdline_idx = 0;
+}
+
+notrace void trace_stop_cmdline_recording(void);
+
+static void notrace trace_save_cmdline(struct task_struct *tsk)
+{
+	unsigned map;
+	unsigned idx;
+
+	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+		return;
+
+	/*
+	 * It's not the end of the world if we don't get
+	 * the lock, but we also don't want to spin
+	 * nor do we want to disable interrupts,
+	 * so if we miss here, then better luck next time.
+	 */
+	if (!spin_trylock(&trace_cmdline_lock))
+		return;
+
+	idx = map_pid_to_cmdline[tsk->pid];
+	if (idx >= SAVED_CMDLINES) {
+		idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+
+		map = map_cmdline_to_pid[idx];
+		if (map <= PID_MAX_DEFAULT)
+			map_pid_to_cmdline[map] = (unsigned)-1;
+
+		map_pid_to_cmdline[tsk->pid] = idx;
+
+		cmdline_idx = idx;
+	}
+
+	memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+
+	spin_unlock(&trace_cmdline_lock);
+}
+
+static notrace char *trace_find_cmdline(int pid)
+{
+	char *cmdline = "<...>";
+	unsigned map;
+
+	if (!pid)
+		return "<idle>";
+
+	if (pid > PID_MAX_DEFAULT)
+		goto out;
+
+	map = map_pid_to_cmdline[pid];
+	if (map >= SAVED_CMDLINES)
+		goto out;
+
+	cmdline = saved_cmdlines[map];
+
+ out:
+	return cmdline;
+}
+
+notrace void tracing_record_cmdline(struct task_struct *tsk)
+{
+	if (atomic_read(&trace_record_cmdline_disabled))
+		return;
+
+	trace_save_cmdline(tsk);
+}
+
+static inline notrace struct trace_entry *
+tracing_get_trace_entry(struct trace_array *tr,
+			struct trace_array_cpu *data)
+{
+	unsigned long idx, idx_next;
+	struct trace_entry *entry;
+
+	idx = data->trace_idx;
+	idx_next = idx + 1;
+
+	if (unlikely(idx_next >= tr->entries)) {
+		atomic_inc(&data->underrun);
+		idx_next = 0;
+	}
+
+	data->trace_idx = idx_next;
+
+	if (unlikely(idx_next != 0 && atomic_read(&data->underrun)))
+		atomic_inc(&data->underrun);
+
+	entry = data->trace + idx * TRACE_ENTRY_SIZE;
+
+	return entry;
+}
+
+static inline notrace void
+tracing_generic_entry_update(struct trace_entry *entry,
+			     unsigned long flags)
+{
+	struct task_struct *tsk = current;
+	unsigned long pc;
+
+	pc = preempt_count();
+
+	entry->idx	= atomic_inc_return(&tracer_counter);
+	entry->preempt_count = pc & 0xff;
+	entry->pid	 = tsk->pid;
+	entry->t	 = now(raw_smp_processor_id());
+	entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
+		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+}
+
+notrace void
+ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+		       unsigned long ip, unsigned long parent_ip,
+		       unsigned long flags)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type	    = TRACE_FN;
+	entry->fn.ip	    = ip;
+	entry->fn.parent_ip = parent_ip;
+}
+
+notrace void
+tracing_sched_switch_trace(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   struct task_struct *prev, struct task_struct *next,
+			   unsigned long flags)
+{
+	struct trace_entry *entry;
+
+	entry = tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_CTX;
+	entry->ctx.prev_pid	= prev->pid;
+	entry->ctx.prev_prio	= prev->prio;
+	entry->ctx.prev_state	= prev->state;
+	entry->ctx.next_pid	= next->pid;
+	entry->ctx.next_prio	= next->prio;
+}
+
+enum trace_file_type {
+	TRACE_FILE_LAT_FMT	= 1,
+};
+
+static struct trace_entry *
+trace_entry_idx(struct trace_array *tr, unsigned long idx, int cpu)
+{
+	struct trace_entry *array = tr->data[cpu]->trace;
+	unsigned long underrun;
+
+	if (idx >= tr->entries)
+		return NULL;
+
+	underrun = atomic_read(&tr->data[cpu]->underrun);
+	if (underrun)
+		idx = ((underrun - 1) + idx) % tr->entries;
+	else if (idx >= tr->data[cpu]->trace_idx)
+		return NULL;
+
+	return &array[idx];
+}
+
+static struct notrace trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+{
+	struct trace_array *tr = iter->tr;
+	struct trace_entry *ent, *next = NULL;
+	int next_cpu = -1;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (!tr->data[cpu]->trace)
+			continue;
+		ent = trace_entry_idx(tr, iter->next_idx[cpu], cpu);
+		if (ent &&
+		    (!next || (long)(next->idx - ent->idx) > 0)) {
+			next = ent;
+			next_cpu = cpu;
+		}
+	}
+
+	if (ent_cpu)
+		*ent_cpu = next_cpu;
+
+	return next;
+}
+
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+	struct trace_entry *next;
+	int next_cpu = -1;
+
+	next = find_next_entry(iter, &next_cpu);
+
+	if (next) {
+		iter->next_idx[next_cpu]++;
+		iter->idx++;
+	}
+	iter->ent = next;
+	iter->cpu = next_cpu;
+
+	return next ? iter : NULL;
+}
+
+static void notrace *
+s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_iterator *iter = m->private;
+	void *ent;
+	void *last_ent = iter->ent;
+	int i = (int)*pos;
+
+	(*pos)++;
+
+	/* can't go backwards */
+	if (iter->idx > i)
+		return NULL;
+
+	if (iter->idx < 0)
+		ent = find_next_entry_inc(iter);
+	else
+		ent = iter;
+
+	while (ent && iter->idx < i)
+		ent = find_next_entry_inc(iter);
+
+	iter->pos = *pos;
+
+	if (last_ent && !ent)
+		seq_puts(m, "\n\nvim:ft=help\n");
+
+	return ent;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	struct trace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l = 0;
+	int i;
+
+	mutex_lock(&trace_types_lock);
+
+	if (!current_trace || current_trace != iter->trace)
+		return NULL;
+
+	atomic_inc(&trace_record_cmdline_disabled);
+
+	/* let the tracer grab locks here if needed */
+	if (current_trace->start)
+		current_trace->start(iter);
+
+	if (*pos != iter->pos) {
+		iter->ent = NULL;
+		iter->cpu = 0;
+		iter->idx = -1;
+
+		for (i = 0; i < NR_CPUS; i++)
+			iter->next_idx[i] = 0;
+
+		for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+			;
+
+	} else {
+		l = *pos;
+		p = s_next(m, p, &l);
+	}
+
+	return p;
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+	struct trace_iterator *iter = m->private;
+
+	atomic_dec(&trace_record_cmdline_disabled);
+
+	/* let the tracer release locks here if needed */
+	if (current_trace && current_trace == iter->trace && iter->trace->stop)
+		iter->trace->stop(iter);
+
+	mutex_unlock(&trace_types_lock);
+}
+
+static void
+seq_print_sym_short(struct seq_file *m, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+	seq_printf(m, fmt, str);
+#endif
+}
+
+static void
+seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(str, address);
+	seq_printf(m, fmt, str);
+#endif
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+static void notrace
+seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags)
+{
+	if (!ip) {
+		seq_printf(m, "0");
+		return;
+	}
+
+	if (sym_flags & TRACE_ITER_SYM_OFFSET)
+		seq_print_sym_offset(m, "%s", ip);
+	else
+		seq_print_sym_short(m, "%s", ip);
+
+	if (sym_flags & TRACE_ITER_SYM_ADDR)
+		seq_printf(m, " <" IP_FMT ">", ip);
+}
+
+static void notrace print_lat_help_header(struct seq_file *m)
+{
+	seq_puts(m, "#                _------=> CPU#            \n");
+	seq_puts(m, "#               / _-----=> irqs-off        \n");
+	seq_puts(m, "#              | / _----=> need-resched    \n");
+	seq_puts(m, "#              || / _---=> hardirq/softirq \n");
+	seq_puts(m, "#              ||| / _--=> preempt-depth   \n");
+	seq_puts(m, "#              |||| /                      \n");
+	seq_puts(m, "#              |||||     delay             \n");
+	seq_puts(m, "#  cmd     pid ||||| time  |   caller      \n");
+	seq_puts(m, "#     \\   /    |||||   \\   |   /           \n");
+}
+
+static void notrace print_func_help_header(struct seq_file *m)
+{
+	seq_puts(m, "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n");
+	seq_puts(m, "#              | |      |          |         |\n");
+}
+
+
+static void notrace
+print_trace_header(struct seq_file *m, struct trace_iterator *iter)
+{
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_array *tr = iter->tr;
+	struct trace_array_cpu *data = tr->data[tr->cpu];
+	struct tracer *type = current_trace;
+	unsigned long underruns = 0;
+	unsigned long underrun;
+	unsigned long entries   = 0;
+	int cpu;
+	const char *name = "preemption";
+
+	if (type)
+		name = type->name;
+
+	for_each_possible_cpu(cpu) {
+		if (tr->data[cpu]->trace) {
+			underrun = atomic_read(&tr->data[cpu]->underrun);
+			if (underrun) {
+				underruns += underrun;
+				entries += tr->entries;
+			} else
+				entries += tr->data[cpu]->trace_idx;
+		}
+	}
+
+	seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+		   name, UTS_RELEASE);
+	seq_puts(m, "-----------------------------------"
+		 "---------------------------------\n");
+	seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+		   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+		   data->saved_latency,
+		   entries,
+		   (entries + underruns),
+		   tr->cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+		   "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+		   "desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+		   "preempt",
+#else
+		   "unknown",
+#endif
+		   /* These are reserved for later use */
+		   0, 0, 0, 0);
+#ifdef CONFIG_SMP
+	seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+	seq_puts(m, ")\n");
+#endif
+	seq_puts(m, "    -----------------\n");
+	seq_printf(m, "    | task: %.16s-%d "
+		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+		   data->comm, data->pid, data->uid, data->nice,
+		   data->policy, data->rt_priority);
+	seq_puts(m, "    -----------------\n");
+
+	if (data->critical_start) {
+		seq_puts(m, " => started at: ");
+		seq_print_ip_sym(m, data->critical_start, sym_flags);
+		seq_puts(m, "\n => ended at:   ");
+		seq_print_ip_sym(m, data->critical_end, sym_flags);
+		seq_puts(m, "\n");
+	}
+
+	seq_puts(m, "\n");
+}
+
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+	return nsecs / 1000;
+}
+
+static void notrace
+lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu)
+{
+	int hardirq, softirq;
+	char *comm;
+
+	comm = trace_find_cmdline(entry->pid);
+
+	seq_printf(m, "%8.8s-%-5d ", comm, entry->pid);
+	seq_printf(m, "%d", cpu);
+	seq_printf(m, "%c%c",
+		   (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+		   ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq)
+		seq_putc(m, 'H');
+	else {
+		if (hardirq)
+			seq_putc(m, 'h');
+		else {
+			if (softirq)
+				seq_putc(m, 's');
+			else
+				seq_putc(m, '.');
+		}
+	}
+
+	if (entry->preempt_count)
+		seq_printf(m, "%x", entry->preempt_count);
+	else
+		seq_puts(m, ".");
+}
+
+unsigned long preempt_mark_thresh = 100;
+
+static void notrace
+lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs,
+		    unsigned long rel_usecs)
+{
+	seq_printf(m, " %4lldus", abs_usecs);
+	if (rel_usecs > preempt_mark_thresh)
+		seq_puts(m, "!: ");
+	else if (rel_usecs > 1)
+		seq_puts(m, "+: ");
+	else
+		seq_puts(m, " : ");
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+static void notrace
+print_lat_fmt(struct seq_file *m, struct trace_iterator *iter,
+	      unsigned int trace_idx, int cpu)
+{
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_entry *next_entry = find_next_entry(iter, NULL);
+	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+	struct trace_entry *entry = iter->ent;
+	unsigned long abs_usecs;
+	unsigned long rel_usecs;
+	char *comm;
+	int S;
+
+	if (!next_entry)
+		next_entry = entry;
+	rel_usecs = ns2usecs(next_entry->t - entry->t);
+	abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
+
+	if (verbose) {
+		comm = trace_find_cmdline(entry->pid);
+		seq_printf(m, "%16s %5d %d %d %08x %08x [%08lx]"
+			   " %ld.%03ldms (+%ld.%03ldms): ",
+			   comm,
+			   entry->pid, cpu, entry->flags,
+			   entry->preempt_count, trace_idx,
+			   ns2usecs(entry->t),
+			   abs_usecs/1000,
+			   abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000);
+	} else {
+		lat_print_generic(m, entry, cpu);
+		lat_print_timestamp(m, abs_usecs, rel_usecs);
+	}
+	switch (entry->type) {
+	case TRACE_FN:
+		seq_print_ip_sym(m, entry->fn.ip, sym_flags);
+		seq_puts(m, " (");
+		seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags);
+		seq_puts(m, ")\n");
+		break;
+	case TRACE_CTX:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		comm = trace_find_cmdline(entry->ctx.next_pid);
+		seq_printf(m, " %d:%d:%c --> %d:%d %s\n",
+			   entry->ctx.prev_pid,
+			   entry->ctx.prev_prio,
+			   S,
+			   entry->ctx.next_pid,
+			   entry->ctx.next_prio,
+			   comm);
+		break;
+	}
+}
+
+static void notrace
+print_trace_fmt(struct seq_file *m, struct trace_iterator *iter)
+{
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+	struct trace_entry *entry = iter->ent;
+	unsigned long usec_rem;
+	unsigned long long t;
+	unsigned long secs;
+	char *comm;
+	int S;
+
+	comm = trace_find_cmdline(iter->ent->pid);
+
+	t = ns2usecs(entry->t);
+	usec_rem = do_div(t, 1000000ULL);
+	secs = (unsigned long)t;
+
+	seq_printf(m, "%16s-%-5d ", comm, entry->pid);
+	seq_printf(m, "[%02d] ", iter->cpu);
+	seq_printf(m, "%5lu.%06lu: ", secs, usec_rem);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		seq_print_ip_sym(m, entry->fn.ip, sym_flags);
+		if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
+						entry->fn.parent_ip) {
+			seq_printf(m, " <-");
+			seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags);
+		}
+		break;
+	case TRACE_CTX:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		seq_printf(m, " %d:%d:%c ==> %d:%d\n",
+			   entry->ctx.prev_pid,
+			   entry->ctx.prev_prio,
+			   S,
+			   entry->ctx.next_pid,
+			   entry->ctx.next_prio);
+		break;
+	}
+	seq_printf(m, "\n");
+}
+
+static int trace_empty(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (data->trace &&
+		    (data->trace_idx ||
+		     atomic_read(&data->underrun)))
+			return 0;
+	}
+	return 1;
+}
+
+static int s_show(struct seq_file *m, void *v)
+{
+	struct trace_iterator *iter = v;
+
+	if (iter->ent == NULL) {
+		if (iter->tr) {
+			seq_printf(m, "# tracer: %s\n", iter->trace->name);
+			seq_puts(m, "#\n");
+		}
+		if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+			/* print nothing if the buffers are empty */
+			if (trace_empty(iter))
+				return 0;
+			print_trace_header(m, iter);
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_lat_help_header(m);
+		} else {
+			if (!(trace_flags & TRACE_ITER_VERBOSE))
+				print_func_help_header(m);
+		}
+	} else {
+		if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+			print_lat_fmt(m, iter, iter->idx, iter->cpu);
+		else
+			print_trace_fmt(m, iter);
+	}
+
+	return 0;
+}
+
+static struct seq_operations tracer_seq_ops = {
+	.start = s_start,
+	.next = s_next,
+	.stop = s_stop,
+	.show = s_show,
+};
+
+static struct trace_iterator notrace *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
+{
+	struct trace_iterator *iter;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter) {
+		*ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace && current_trace->print_max)
+		iter->tr = &max_tr;
+	else
+		iter->tr = inode->i_private;
+	iter->trace = current_trace;
+	iter->pos = -1;
+
+	/* TODO stop tracer */
+	*ret = seq_open(file, &tracer_seq_ops);
+	if (!*ret) {
+		struct seq_file *m = file->private_data;
+		m->private = iter;
+
+		/* stop the trace while dumping */
+		if (iter->tr->ctrl)
+			tracer_enabled = 0;
+
+		if (iter->trace && iter->trace->open)
+			iter->trace->open(iter);
+	} else {
+		kfree(iter);
+		iter = NULL;
+	}
+	mutex_unlock(&trace_types_lock);
+
+ out:
+	return iter;
+}
+
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->i_private;
+	return 0;
+}
+
+int tracing_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct trace_iterator *iter = m->private;
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace && iter->trace->close)
+		iter->trace->close(iter);
+
+	/* reenable tracing if it was previously enabled */
+	if (iter->tr->ctrl)
+		tracer_enabled = 1;
+	mutex_unlock(&trace_types_lock);
+
+	seq_release(inode, file);
+	kfree(iter);
+	return 0;
+}
+
+static int tracing_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	__tracing_open(inode, file, &ret);
+
+	return ret;
+}
+
+static int tracing_lt_open(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter;
+	int ret;
+
+	iter = __tracing_open(inode, file, &ret);
+
+	if (!ret)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
+	return ret;
+}
+
+
+static void notrace *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct tracer *t = m->private;
+
+	(*pos)++;
+
+	if (t)
+		t = t->next;
+
+	m->private = t;
+
+	return t;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	struct tracer *t = m->private;
+	loff_t l = 0;
+
+	mutex_lock(&trace_types_lock);
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&trace_types_lock);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct tracer *t = v;
+
+	if (!t)
+		return 0;
+
+	seq_printf(m, "%s", t->name);
+	if (t->next)
+		seq_putc(m, ' ');
+	else
+		seq_putc(m, '\n');
+
+	return 0;
+}
+
+static struct seq_operations show_traces_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.stop = t_stop,
+	.show = t_show,
+};
+
+static int show_traces_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &show_traces_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = trace_types;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_fops = {
+	.open = tracing_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = tracing_release,
+};
+
+static struct file_operations tracing_lt_fops = {
+	.open = tracing_lt_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = tracing_release,
+};
+
+static struct file_operations show_traces_fops = {
+	.open = show_traces_open,
+	.read = seq_read,
+	.release = seq_release,
+};
+
+static ssize_t
+tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char *buf;
+	int r = 0;
+	int len = 0;
+	int i;
+
+	/* calulate max size */
+	for (i = 0; trace_options[i]; i++) {
+		len += strlen(trace_options[i]);
+		len += 3; /* "no" and space */
+	}
+
+	/* +2 for \n and \0 */
+	buf = kmalloc(len + 2, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	for (i = 0; trace_options[i]; i++) {
+		if (trace_flags & (1 << i))
+			r += sprintf(buf + r, "%s ", trace_options[i]);
+		else
+			r += sprintf(buf + r, "no%s ", trace_options[i]);
+	}
+
+	r += sprintf(buf + r, "\n");
+	WARN_ON(r >= len + 2);
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    buf, r);
+
+	kfree(buf);
+
+	return r;
+}
+
+static ssize_t
+tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	char *cmp = buf;
+	int neg = 0;
+	int i;
+
+	if (cnt > 63)
+		cnt = 63;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	if (strncmp(buf, "no", 2) == 0) {
+		neg = 1;
+		cmp += 2;
+	}
+
+	for (i = 0; trace_options[i]; i++) {
+		int len = strlen(trace_options[i]);
+
+		if (strncmp(cmp, trace_options[i], len) == 0) {
+			if (neg)
+				trace_flags &= ~(1 << i);
+			else
+				trace_flags |= (1 << i);
+			break;
+		}
+	}
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static struct file_operations tracing_iter_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_iter_ctrl_read,
+	.write = tracing_iter_ctrl_write,
+};
+
+static ssize_t
+tracing_ctrl_read(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", tr->ctrl);
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+				       buf, r);
+}
+
+static ssize_t
+tracing_ctrl_write(struct file *filp, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	long val;
+	char buf[64];
+
+	if (cnt > 63)
+		cnt = 63;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	val = simple_strtoul(buf, NULL, 10);
+
+	val = !!val;
+
+	mutex_lock(&trace_types_lock);
+	if (tr->ctrl ^ val) {
+		if (val)
+			tracer_enabled = 1;
+		else
+			tracer_enabled = 0;
+
+		tr->ctrl = val;
+
+		if (current_trace && current_trace->ctrl_update)
+			current_trace->ctrl_update(tr);
+	}
+	mutex_unlock(&trace_types_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+tracing_set_trace_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	char buf[max_tracer_type_len+2];
+	int r;
+
+	mutex_lock(&trace_types_lock);
+	if (current_trace)
+		r = sprintf(buf, "%s\n", current_trace->name);
+	else
+		r = sprintf(buf, "\n");
+	mutex_unlock(&trace_types_lock);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+				       buf, r);
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+			size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = &global_trace;
+	struct tracer *t;
+	char buf[max_tracer_type_len+1];
+	int i;
+
+	if (cnt > max_tracer_type_len)
+		cnt = max_tracer_type_len;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	/* strip ending whitespace. */
+	for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+		buf[i] = 0;
+
+	mutex_lock(&trace_types_lock);
+	for (t = trace_types; t; t = t->next) {
+		if (strcmp(t->name, buf) == 0)
+			break;
+	}
+	if (!t || t == current_trace)
+		goto out;
+
+	if (current_trace && current_trace->reset)
+		current_trace->reset(tr);
+
+	current_trace = t;
+	if (t->init)
+		t->init(tr);
+
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	unsigned long *ptr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = snprintf(buf, 64, "%ld\n",
+		     *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
+	if (r > 64)
+		r = 64;
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+				       buf, r);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	long *ptr = filp->private_data;
+	long val;
+	char buf[64];
+
+	if (cnt > 63)
+		cnt = 63;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	val = simple_strtoul(buf, NULL, 10);
+
+	*ptr = val * 1000;
+
+	return cnt;
+}
+
+static struct file_operations tracing_max_lat_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_max_lat_read,
+	.write = tracing_max_lat_write,
+};
+
+static struct file_operations tracing_ctrl_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_ctrl_read,
+	.write = tracing_ctrl_write,
+};
+
+static struct file_operations set_tracer_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_set_trace_read,
+	.write = tracing_set_trace_write,
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static ssize_t
+tracing_read_long(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	unsigned long *p = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%ld\n", *p);
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+				       buf, r);
+}
+
+static struct file_operations tracing_read_long_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_read_long,
+};
+#endif
+
+static struct dentry *d_tracer;
+
+struct dentry *tracing_init_dentry(void)
+{
+	static int once;
+
+	if (d_tracer)
+		return d_tracer;
+
+	d_tracer = debugfs_create_dir("tracing", NULL);
+
+	if (!d_tracer && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'tracing'\n");
+		return NULL;
+	}
+
+	return d_tracer;
+}
+
+static __init void tracer_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
+				    &global_trace, &tracing_ctrl_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+
+	entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+				    NULL, &tracing_iter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+
+	entry = debugfs_create_file("latency_trace", 0444, d_tracer,
+				    &global_trace, &tracing_lt_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'latency_trace' entry\n");
+
+	entry = debugfs_create_file("trace", 0444, d_tracer,
+				    &global_trace, &tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
+				    &global_trace, &show_traces_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
+				    &global_trace, &set_tracer_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+				    &tracing_max_latency,
+				    &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_max_latency' entry\n");
+
+	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+				    &tracing_thresh, &tracing_max_lat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+				    &ftrace_update_tot_cnt,
+				    &tracing_read_long_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'dyn_ftrace_total_info' entry\n");
+#endif
+}
+
+/* dummy trace to disable tracing */
+static struct tracer no_tracer __read_mostly =
+{
+	.name = "none",
+};
+
+static inline notrace int page_order(const unsigned long size)
+{
+	const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
+	return ilog2(roundup_pow_of_two(nr_pages));
+}
+
+__init static int tracer_alloc_buffers(void)
+{
+	const int order = page_order(trace_nr_entries * TRACE_ENTRY_SIZE);
+	const unsigned long size = (1UL << order) << PAGE_SHIFT;
+	struct trace_entry *array;
+	int i;
+
+	for_each_possible_cpu(i) {
+		global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+		max_tr.data[i] = &per_cpu(max_data, i);
+
+		array = (struct trace_entry *)
+			  __get_free_pages(GFP_KERNEL, order);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate"
+			       " %ld bytes for trace buffer!\n", size);
+			goto free_buffers;
+		}
+		global_trace.data[i]->trace = array;
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (struct trace_entry *)
+			  __get_free_pages(GFP_KERNEL, order);
+		if (array == NULL) {
+			printk(KERN_ERR "wakeup tracer: failed to allocate"
+			       " %ld bytes for trace buffer!\n", size);
+			goto free_buffers;
+		}
+		max_tr.data[i]->trace = array;
+#endif
+	}
+
+	/*
+	 * Since we allocate by orders of pages, we may be able to
+	 * round up a bit.
+	 */
+	global_trace.entries = size / TRACE_ENTRY_SIZE;
+	max_tr.entries = global_trace.entries;
+
+	pr_info("tracer: %ld bytes allocated for %ld",
+		size, trace_nr_entries);
+	pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE);
+	pr_info("   actual entries %ld\n", global_trace.entries);
+
+	tracer_init_debugfs();
+
+	trace_init_cmdlines();
+
+	register_tracer(&no_tracer);
+	current_trace = &no_tracer;
+
+	return 0;
+
+ free_buffers:
+	for (i-- ; i >= 0; i--) {
+		struct trace_array_cpu *data = global_trace.data[i];
+
+		if (data && data->trace) {
+			free_pages((unsigned long)data->trace, order);
+			data->trace = NULL;
+		}
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		if (data && data->trace) {
+			free_pages((unsigned long)data->trace, order);
+			data->trace = NULL;
+		}
+#endif
+	}
+	return -ENOMEM;
+}
+
+device_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644
index 000000000000..3173a93561d4
--- /dev/null
+++ b/kernel/trace/trace.h
@@ -0,0 +1,184 @@
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+
+#include <linux/fs.h>
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+
+/*
+ * Function trace entry - function address and parent function addres:
+ */
+struct ftrace_entry {
+	unsigned long		ip;
+	unsigned long		parent_ip;
+};
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ */
+struct ctx_switch_entry {
+	unsigned int		prev_pid;
+	unsigned char		prev_prio;
+	unsigned char		prev_state;
+	unsigned int		next_pid;
+	unsigned char		next_prio;
+};
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+	char			type;
+	char			cpu;
+	char			flags;
+	char			preempt_count;
+	int			pid;
+	cycle_t			t;
+	unsigned long		idx;
+	union {
+		struct ftrace_entry		fn;
+		struct ctx_switch_entry		ctx;
+	};
+};
+
+#define TRACE_ENTRY_SIZE	sizeof(struct trace_entry)
+
+/*
+ * The CPU trace array - it consists of thousands of trace entries
+ * plus some other descriptor data: (for example which task started
+ * the trace, etc.)
+ */
+struct trace_array_cpu {
+	void			*trace;
+	unsigned long		trace_idx;
+	atomic_t		disabled;
+	atomic_t		underrun;
+	unsigned long		saved_latency;
+	unsigned long		critical_start;
+	unsigned long		critical_end;
+	unsigned long		critical_sequence;
+	unsigned long		nice;
+	unsigned long		policy;
+	unsigned long		rt_priority;
+	cycle_t			preempt_timestamp;
+	pid_t			pid;
+	uid_t			uid;
+	char			comm[TASK_COMM_LEN];
+};
+
+struct trace_iterator;
+
+/*
+ * The trace array - an array of per-CPU trace arrays. This is the
+ * highest level data structure that individual tracers deal with.
+ * They have on/off state as well:
+ */
+struct trace_array {
+	unsigned long		entries;
+	long			ctrl;
+	int			cpu;
+	cycle_t			time_start;
+	struct trace_array_cpu	*data[NR_CPUS];
+};
+
+/*
+ * A specific tracer, represented by methods that operate on a trace array:
+ */
+struct tracer {
+	const char		*name;
+	void			(*init)(struct trace_array *tr);
+	void			(*reset)(struct trace_array *tr);
+	void			(*open)(struct trace_iterator *iter);
+	void			(*close)(struct trace_iterator *iter);
+	void			(*start)(struct trace_iterator *iter);
+	void			(*stop)(struct trace_iterator *iter);
+	void			(*ctrl_update)(struct trace_array *tr);
+	struct tracer		*next;
+	int			print_max;
+};
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+	struct trace_array	*tr;
+	struct tracer		*trace;
+	struct trace_entry	*ent;
+	unsigned long		iter_flags;
+	loff_t			pos;
+	unsigned long		next_idx[NR_CPUS];
+	int			cpu;
+	int			idx;
+};
+
+void notrace tracing_reset(struct trace_array_cpu *data);
+int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *tracing_init_dentry(void);
+void ftrace(struct trace_array *tr,
+			    struct trace_array_cpu *data,
+			    unsigned long ip,
+			    unsigned long parent_ip,
+			    unsigned long flags);
+void tracing_sched_switch_trace(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct task_struct *prev,
+				struct task_struct *next,
+				unsigned long flags);
+void tracing_record_cmdline(struct task_struct *tsk);
+
+void tracing_start_function_trace(void);
+void tracing_stop_function_trace(void);
+int register_tracer(struct tracer *type);
+void unregister_tracer(struct tracer *type);
+
+extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+
+extern unsigned long tracing_max_latency;
+extern unsigned long tracing_thresh;
+
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr_single(struct trace_array *tr,
+			  struct task_struct *tsk, int cpu);
+
+static inline notrace cycle_t now(int cpu)
+{
+	return cpu_clock(cpu);
+}
+
+#ifdef CONFIG_SCHED_TRACER
+extern void notrace
+wakeup_sched_switch(struct task_struct *prev, struct task_struct *next);
+#else
+static inline void
+wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
+{
+}
+#endif
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+typedef void
+(*tracer_switch_func_t)(void *private,
+			struct task_struct *prev,
+			struct task_struct *next);
+
+struct tracer_switch_ops {
+	tracer_switch_func_t		func;
+	void				*private;
+	struct tracer_switch_ops	*next;
+};
+
+extern int register_tracer_switch(struct tracer_switch_ops *ops);
+extern int unregister_tracer_switch(struct tracer_switch_ops *ops);
+
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_update_tot_cnt;
+#endif
+
+#endif /* _LINUX_KERNEL_TRACE_H */
-- 
cgit v1.2.3


From 1b29b01887e6032dcaf818c14999c7a39593b4e7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: function tracer

This is a simple trace that uses the ftrace infrastructure. It is
designed to be fast and small, and easy to use. It is useful to
record things that happen over a very short period of time, and
not to analyze the system in general.

 Updates:

  available_tracers
     "function" is added to this file.

  current_tracer
    To enable the function tracer:

      echo function > /debugfs/tracing/current_tracer

     To disable the tracer:

       echo disable > /debugfs/tracing/current_tracer

The output of the function_trace file is as follows

  "echo noverbose > /debugfs/tracing/iter_ctrl"

preemption latency trace v1.1.5 on 2.6.24-rc7-tst
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------
 latency: 0 us, #419428/4361791, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4)
    -----------------
    | task: -0 (uid:0 nice:0 policy:0 rt_prio:0)
    -----------------

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
 swapper-0     0d.h. 1595128us+: set_normalized_timespec+0x8/0x2d <c043841d> (ktime_get_ts+0x4a/0x4e <c04499d4>)
 swapper-0     0d.h. 1595131us+: _spin_lock+0x8/0x18 <c0630690> (hrtimer_interrupt+0x6e/0x1b0 <c0449c56>)

Or with verbose turned on:

  "echo verbose > /debugfs/tracing/iter_ctrl"

preemption latency trace v1.1.5 on 2.6.24-rc7-tst
--------------------------------------------------------------------
 latency: 0 us, #419428/4361791, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4)
    -----------------
    | task: -0 (uid:0 nice:0 policy:0 rt_prio:0)
    -----------------

         swapper     0 0 9 00000000 00000000 [f3675f41] 1595.128ms (+0.003ms): set_normalized_timespec+0x8/0x2d <c043841d> (ktime_get_ts+0x4a/0x4e <c04499d4>)
         swapper     0 0 9 00000000 00000001 [f3675f45] 1595.131ms (+0.003ms): _spin_lock+0x8/0x18 <c0630690> (hrtimer_interrupt+0x6e/0x1b0 <c0449c56>)
         swapper     0 0 9 00000000 00000002 [f3675f48] 1595.135ms (+0.003ms): _spin_lock+0x8/0x18 <c0630690> (hrtimer_interrupt+0x6e/0x1b0 <c0449c56>)

The "trace" file is not affected by the verbose mode, but is by the symonly.

 echo "nosymonly" > /debugfs/tracing/iter_ctrl

tracer:
[   81.479967] CPU 0: bash:3154 register_ftrace_function+0x5f/0x66 <ffffffff80337a4d> <-- _spin_unlock_irqrestore+0xe/0x5a <ffffffff8048cc8f>
[   81.479967] CPU 0: bash:3154 _spin_unlock_irqrestore+0x3e/0x5a <ffffffff8048ccbf> <-- sub_preempt_count+0xc/0x7a <ffffffff80233d7b>
[   81.479968] CPU 0: bash:3154 sub_preempt_count+0x30/0x7a <ffffffff80233d9f> <-- in_lock_functions+0x9/0x24 <ffffffff8025a75d>
[   81.479968] CPU 0: bash:3154 vfs_write+0x11d/0x155 <ffffffff8029a043> <-- dnotify_parent+0x12/0x78 <ffffffff802d54fb>
[   81.479968] CPU 0: bash:3154 dnotify_parent+0x2d/0x78 <ffffffff802d5516> <-- _spin_lock+0xe/0x70 <ffffffff8048c910>
[   81.479969] CPU 0: bash:3154 _spin_lock+0x1b/0x70 <ffffffff8048c91d> <-- add_preempt_count+0xe/0x77 <ffffffff80233df7>
[   81.479969] CPU 0: bash:3154 add_preempt_count+0x3e/0x77 <ffffffff80233e27> <-- in_lock_functions+0x9/0x24 <ffffffff8025a75d>

 echo "symonly" > /debugfs/tracing/iter_ctrl

tracer:
[   81.479913] CPU 0: bash:3154 register_ftrace_function+0x5f/0x66 <-- _spin_unlock_irqrestore+0xe/0x5a
[   81.479913] CPU 0: bash:3154 _spin_unlock_irqrestore+0x3e/0x5a <-- sub_preempt_count+0xc/0x7a
[   81.479913] CPU 0: bash:3154 sub_preempt_count+0x30/0x7a <-- in_lock_functions+0x9/0x24
[   81.479914] CPU 0: bash:3154 vfs_write+0x11d/0x155 <-- dnotify_parent+0x12/0x78
[   81.479914] CPU 0: bash:3154 dnotify_parent+0x2d/0x78 <-- _spin_lock+0xe/0x70
[   81.479914] CPU 0: bash:3154 _spin_lock+0x1b/0x70 <-- add_preempt_count+0xe/0x77
[   81.479914] CPU 0: bash:3154 add_preempt_count+0x3e/0x77 <-- in_lock_functions+0x9/0x24

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig           | 13 ++++++++
 kernel/trace/Makefile          |  1 +
 kernel/trace/trace_functions.c | 73 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+)
 create mode 100644 kernel/trace/trace_functions.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ce70677afbf9..1399f372b5dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -8,3 +8,16 @@ config TRACING
 	bool
 	select DEBUG_FS
 
+config FTRACE
+	bool "Kernel Function Tracer"
+	depends on DEBUG_KERNEL && HAVE_FTRACE
+	select FRAME_POINTER
+	select TRACING
+	help
+	  Enable the kernel to trace every kernel function. This is done
+	  by using a compiler feature to insert a small, 5-byte No-Operation
+	  instruction to the beginning of every kernel function, which NOP
+	  sequence is then dynamically patched into a tracer call when
+	  tracing is enabled by the administrator. If it's runtime disabled
+	  (the bootup default), then the overhead of the instructions is very
+	  small and not measurable even in micro-benchmarks.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 7af403175255..6bb5e50b4a40 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_FTRACE) += libftrace.o
 
 obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_FTRACE) += trace_functions.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644
index 000000000000..82988c5336e0
--- /dev/null
+++ b/kernel/trace/trace_functions.c
@@ -0,0 +1,73 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static notrace void function_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static notrace void start_function_trace(struct trace_array *tr)
+{
+	function_reset(tr);
+	tracing_start_function_trace();
+}
+
+static notrace void stop_function_trace(struct trace_array *tr)
+{
+	tracing_stop_function_trace();
+}
+
+static notrace void function_trace_init(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_function_trace(tr);
+}
+
+static notrace void function_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_function_trace(tr);
+}
+
+static notrace void function_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_function_trace(tr);
+	else
+		stop_function_trace(tr);
+}
+
+static struct tracer function_trace __read_mostly =
+{
+	.name	     = "ftrace",
+	.init	     = function_trace_init,
+	.reset	     = function_trace_reset,
+	.ctrl_update = function_trace_ctrl_update,
+};
+
+static __init int init_function_trace(void)
+{
+	return register_tracer(&function_trace);
+}
+
+device_initcall(init_function_trace);
-- 
cgit v1.2.3


From 35e8e302e5d6e32675df2fc1dd3a53dfa6630dc1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: add tracing of context switches

This patch adds context switch tracing, of the format of:

                  _------=> CPU#
                 / _-----=> irqs-off
                | / _----=> need-resched
                || / _---=> hardirq/softirq
                ||| / _--=> preempt-depth
                |||| /
                |||||     delay
    cmd     pid ||||| time  |      pid:prio:state
       \   /    |||||   \   |      /
  swapper-0     1d..3    137us+:  0:140:R --> 2912:120
     sshd-2912  1d..3    216us+:  2912:120:S --> 0:140
  swapper-0     1d..3    261us+:  0:140:R --> 2912:120
     bash-2920  0d..3    267us+:  2920:120:S --> 0:140
     sshd-2912  1d..3    330us!:  2912:120:S --> 0:140
  swapper-0     1d..3   2389us+:  0:140:R --> 2847:120
 yum-upda-2847  1d..3   2411us!:  2847:120:S --> 0:140
  swapper-0     0d..3  11089us+:  0:140:R --> 3139:120
 gdm-bina-3139  0d..3  11113us!:  3139:120:S --> 0:140
  swapper-0     1d..3 102328us+:  0:140:R --> 2847:120
 yum-upda-2847  1d..3 102348us!:  2847:120:S --> 0:140

 "sched_switch" is added to /debugfs/tracing/available_tracers

[ Eugene Teo <eugeneteo@kernel.sg: remove unused tracing_sched_switch_enabled ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig              |  11 ++++
 kernel/trace/Makefile             |   1 +
 kernel/trace/trace_sched_switch.c | 116 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 kernel/trace/trace_sched_switch.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1399f372b5dc..5d6aa92866cd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -13,6 +13,7 @@ config FTRACE
 	depends on DEBUG_KERNEL && HAVE_FTRACE
 	select FRAME_POINTER
 	select TRACING
+	select CONTEXT_SWITCH_TRACER
 	help
 	  Enable the kernel to trace every kernel function. This is done
 	  by using a compiler feature to insert a small, 5-byte No-Operation
@@ -21,3 +22,13 @@ config FTRACE
 	  tracing is enabled by the administrator. If it's runtime disabled
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
+
+config CONTEXT_SWITCH_TRACER
+	bool "Trace process context switches"
+	depends on DEBUG_KERNEL
+	select TRACING
+	select MARKERS
+	help
+	  This tracer gets called from the context switch and records
+	  all switching of tasks.
+
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 6bb5e50b4a40..6b54ceb7f16e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,6 +1,7 @@
 obj-$(CONFIG_FTRACE) += libftrace.o
 
 obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 000000000000..3e4771d3b890
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,116 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array	*ctx_trace;
+static int __read_mostly	tracer_enabled;
+
+static void notrace
+ctx_switch_func(struct task_struct *prev, struct task_struct *next)
+{
+	struct trace_array *tr = ctx_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		tracing_sched_switch_trace(tr, data, prev, next, flags);
+
+	atomic_dec(&data->disabled);
+	raw_local_irq_restore(flags);
+}
+
+void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+{
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	ctx_switch_func(prev, next);
+
+	/*
+	 * Chain to the wakeup tracer (this is a NOP if disabled):
+	 */
+	wakeup_sched_switch(prev, next);
+}
+
+static notrace void sched_switch_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static notrace void start_sched_trace(struct trace_array *tr)
+{
+	sched_switch_reset(tr);
+	tracer_enabled = 1;
+}
+
+static notrace void stop_sched_trace(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+}
+
+static notrace void sched_switch_trace_init(struct trace_array *tr)
+{
+	ctx_trace = tr;
+
+	if (tr->ctrl)
+		start_sched_trace(tr);
+}
+
+static notrace void sched_switch_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_sched_trace(tr);
+}
+
+static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+{
+	/* When starting a new trace, reset the buffers */
+	if (tr->ctrl)
+		start_sched_trace(tr);
+	else
+		stop_sched_trace(tr);
+}
+
+static struct tracer sched_switch_trace __read_mostly =
+{
+	.name		= "sched_switch",
+	.init		= sched_switch_trace_init,
+	.reset		= sched_switch_trace_reset,
+	.ctrl_update	= sched_switch_trace_ctrl_update,
+};
+
+__init static int init_sched_switch_trace(void)
+{
+	return register_tracer(&sched_switch_trace);
+}
+device_initcall(init_sched_switch_trace);
-- 
cgit v1.2.3


From 352ad25aa4a189c667cb2af333948d34692a2d27 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: tracer for scheduler wakeup latency

This patch adds the tracer that tracks the wakeup latency of the
highest priority waking task.

  "wakeup" is added to /debugfs/tracing/available_tracers

Also added to /debugfs/tracing

  tracing_max_latency
     holds the current max latency for the wakeup

  wakeup_thresh
     if set to other than zero, a log will be recorded
     for every wakeup that takes longer than the number
     entered in here (usecs for all counters)
     (deletes previous trace)

Examples:

  (with ftrace_enabled = 0)

============
preemption latency trace v1.1.5 on 2.6.24-rc8
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------
 latency: 26 us, #2/2, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
    -----------------
    | task: migration/0-3 (uid:0 nice:-5 policy:1 rt_prio:99)
    -----------------

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
   quilt-8551  0d..3    0us+: wake_up_process+0x15/0x17 <ffffffff80233e80> (sched_exec+0xc9/0x100 <ffffffff80235343>)
   quilt-8551  0d..4   26us : sched_switch_callback+0x73/0x81 <ffffffff80338d2f> (schedule+0x483/0x6d5 <ffffffff8048b3ee>)

vim:ft=help
============

  (with ftrace_enabled = 1)

============
preemption latency trace v1.1.5 on 2.6.24-rc8
--------------------------------------------------------------------
 latency: 36 us, #45/45, CPU#0 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
    -----------------
    | task: migration/1-5 (uid:0 nice:-5 policy:1 rt_prio:99)
    -----------------

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
    bash-10653 1d..3    0us : wake_up_process+0x15/0x17 <ffffffff80233e80> (sched_exec+0xc9/0x100 <ffffffff80235343>)
    bash-10653 1d..3    1us : try_to_wake_up+0x271/0x2e7 <ffffffff80233dcf> (sub_preempt_count+0xc/0x7a <ffffffff8023309e>)
    bash-10653 1d..2    2us : try_to_wake_up+0x296/0x2e7 <ffffffff80233df4> (update_rq_clock+0x9/0x20 <ffffffff802303f3>)
    bash-10653 1d..2    2us : update_rq_clock+0x1e/0x20 <ffffffff80230408> (__update_rq_clock+0xc/0x90 <ffffffff80230366>)
    bash-10653 1d..2    3us : __update_rq_clock+0x1b/0x90 <ffffffff80230375> (sched_clock+0x9/0x29 <ffffffff80214529>)
    bash-10653 1d..2    4us : try_to_wake_up+0x2a6/0x2e7 <ffffffff80233e04> (activate_task+0xc/0x3f <ffffffff8022ffca>)
    bash-10653 1d..2    4us : activate_task+0x2d/0x3f <ffffffff8022ffeb> (enqueue_task+0xe/0x66 <ffffffff8022ff66>)
    bash-10653 1d..2    5us : enqueue_task+0x5b/0x66 <ffffffff8022ffb3> (enqueue_task_rt+0x9/0x3c <ffffffff80233351>)
    bash-10653 1d..2    6us : try_to_wake_up+0x2ba/0x2e7 <ffffffff80233e18> (check_preempt_wakeup+0x12/0x99 <ffffffff80234f84>)
[...]
    bash-10653 1d..5   33us : tracing_record_cmdline+0xcf/0xd4 <ffffffff80338aad> (_spin_unlock+0x9/0x33 <ffffffff8048d3ec>)
    bash-10653 1d..5   34us : _spin_unlock+0x19/0x33 <ffffffff8048d3fc> (sub_preempt_count+0xc/0x7a <ffffffff8023309e>)
    bash-10653 1d..4   35us : wakeup_sched_switch+0x65/0x2ff <ffffffff80339f66> (_spin_lock_irqsave+0xc/0xa9 <ffffffff8048d08b>)
    bash-10653 1d..4   35us : _spin_lock_irqsave+0x19/0xa9 <ffffffff8048d098> (add_preempt_count+0xe/0x77 <ffffffff8023311a>)
    bash-10653 1d..4   36us : sched_switch_callback+0x73/0x81 <ffffffff80338d2f> (schedule+0x483/0x6d5 <ffffffff8048b3ee>)

vim:ft=help
============

The [...] was added here to not waste your email box space.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig              |  13 ++
 kernel/trace/Makefile             |   1 +
 kernel/trace/trace_sched_wakeup.c | 310 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 324 insertions(+)
 create mode 100644 kernel/trace/trace_sched_wakeup.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d6aa92866cd..892ecc94a82b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -4,6 +4,9 @@
 config HAVE_FTRACE
 	bool
 
+config TRACER_MAX_TRACE
+	bool
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -23,6 +26,16 @@ config FTRACE
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
 
+config SCHED_TRACER
+	bool "Scheduling Latency Tracer"
+	depends on DEBUG_KERNEL
+	select TRACING
+	select CONTEXT_SWITCH_TRACER
+	select TRACER_MAX_TRACE
+	help
+	  This tracer tracks the latency of the highest priority task
+	  to be scheduled in, starting from the point it has woken up.
+
 config CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
 	depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 6b54ceb7f16e..5508cdb19aea 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,5 +3,6 @@ obj-$(CONFIG_FTRACE) += libftrace.o
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644
index 000000000000..7c3ccefcf4c3
--- /dev/null
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -0,0 +1,310 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array	*wakeup_trace;
+static int __read_mostly	tracer_enabled;
+
+static struct task_struct	*wakeup_task;
+static int			wakeup_cpu;
+static unsigned			wakeup_prio = -1;
+
+static DEFINE_SPINLOCK(wakeup_lock);
+
+static void notrace __wakeup_reset(struct trace_array *tr);
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int notrace report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+void notrace
+wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
+{
+	unsigned long latency = 0, t0 = 0, t1 = 0;
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	/*
+	 * When we start a new trace, we set wakeup_task to NULL
+	 * and then set tracer_enabled = 1. We want to make sure
+	 * that another CPU does not see the tracer_enabled = 1
+	 * and the wakeup_task with an older task, that might
+	 * actually be the same as next.
+	 */
+	smp_rmb();
+
+	if (next != wakeup_task)
+		return;
+
+	/* The task we are waitng for is waking up */
+	data = tr->data[wakeup_cpu];
+
+	/* disable local data, not wakeup_cpu data */
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (likely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	/* We could race with grabbing wakeup_lock */
+	if (unlikely(!tracer_enabled || next != wakeup_task))
+		goto out_unlock;
+
+	ftrace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = now(cpu);
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	latency = nsecs_to_usecs(delta);
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	update_max_tr(tr, wakeup_task, wakeup_cpu);
+
+	if (tracing_thresh) {
+		printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency "
+		       "violates %lu us threshold.\n"
+		       " => started at timestamp %lu: ",
+				wakeup_task->comm, wakeup_task->pid,
+				raw_smp_processor_id(),
+				latency, nsecs_to_usecs(tracing_thresh), t0);
+	} else {
+		printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum "
+		       "wakeup latency.\n => started at timestamp %lu: ",
+				wakeup_task->comm, wakeup_task->pid,
+				cpu, latency, t0);
+	}
+
+	printk(KERN_CONT "   ended at timestamp %lu: ", t1);
+	dump_stack();
+	t1 = nsecs_to_usecs(now(cpu));
+	printk(KERN_CONT "   dump-end timestamp %lu\n\n", t1);
+
+out_unlock:
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static void notrace __wakeup_reset(struct trace_array *tr)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	assert_spin_locked(&wakeup_lock);
+
+	for_each_possible_cpu(cpu) {
+		data = tr->data[cpu];
+		tracing_reset(data);
+	}
+
+	wakeup_cpu = -1;
+	wakeup_prio = -1;
+
+	if (wakeup_task)
+		put_task_struct(wakeup_task);
+
+	wakeup_task = NULL;
+}
+
+static void notrace wakeup_reset(struct trace_array *tr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+static notrace void
+wakeup_check_start(struct trace_array *tr, struct task_struct *p,
+		   struct task_struct *curr)
+{
+	int cpu = smp_processor_id();
+	unsigned long flags;
+	long disabled;
+
+	if (likely(!rt_task(p)) ||
+			p->prio >= wakeup_prio ||
+			p->prio >= curr->prio)
+		return;
+
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	/* interrupts should be off from try_to_wake_up */
+	spin_lock(&wakeup_lock);
+
+	/* check for races. */
+	if (!tracer_enabled || p->prio >= wakeup_prio)
+		goto out_locked;
+
+	/* reset the trace */
+	__wakeup_reset(tr);
+
+	wakeup_cpu = task_cpu(p);
+	wakeup_prio = p->prio;
+
+	wakeup_task = p;
+	get_task_struct(wakeup_task);
+
+	local_save_flags(flags);
+
+	tr->data[wakeup_cpu]->preempt_timestamp = now(cpu);
+	ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags);
+
+out_locked:
+	spin_unlock(&wakeup_lock);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+notrace void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+{
+	if (likely(!tracer_enabled))
+		return;
+
+	wakeup_check_start(wakeup_trace, wakee, curr);
+}
+
+notrace void
+ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
+{
+	if (likely(!tracer_enabled))
+		return;
+
+	wakeup_check_start(wakeup_trace, wakee, curr);
+}
+
+static notrace void start_wakeup_tracer(struct trace_array *tr)
+{
+	wakeup_reset(tr);
+
+	/*
+	 * Don't let the tracer_enabled = 1 show up before
+	 * the wakeup_task is reset. This may be overkill since
+	 * wakeup_reset does a spin_unlock after setting the
+	 * wakeup_task to NULL, but I want to be safe.
+	 * This is a slow path anyway.
+	 */
+	smp_wmb();
+
+	tracer_enabled = 1;
+
+	return;
+}
+
+static notrace void stop_wakeup_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+}
+
+static notrace void wakeup_tracer_init(struct trace_array *tr)
+{
+	wakeup_trace = tr;
+
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+}
+
+static notrace void wakeup_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl) {
+		stop_wakeup_tracer(tr);
+		/* make sure we put back any tasks we are tracing */
+		wakeup_reset(tr);
+	}
+}
+
+static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+	else
+		stop_wakeup_tracer(tr);
+}
+
+static void notrace wakeup_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_wakeup_tracer(iter->tr);
+}
+
+static void notrace wakeup_tracer_close(struct trace_iterator *iter)
+{
+	/* forget about any processes we were recording */
+	if (iter->tr->ctrl)
+		start_wakeup_tracer(iter->tr);
+}
+
+static struct tracer wakeup_tracer __read_mostly =
+{
+	.name		= "wakeup",
+	.init		= wakeup_tracer_init,
+	.reset		= wakeup_tracer_reset,
+	.open		= wakeup_tracer_open,
+	.close		= wakeup_tracer_close,
+	.ctrl_update	= wakeup_tracer_ctrl_update,
+	.print_max	= 1,
+};
+
+__init static int init_wakeup_tracer(void)
+{
+	int ret;
+
+	ret = register_tracer(&wakeup_tracer);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+device_initcall(init_wakeup_tracer);
-- 
cgit v1.2.3


From 81d68a96a39844853b37f20cc8282d9b65b78ef3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: trace irq disabled critical timings

This patch adds latency tracing for critical timings
(how long interrupts are disabled for).

 "irqsoff" is added to /debugfs/tracing/available_tracers

Note:
  tracing_max_latency
    also holds the max latency for irqsoff (in usecs).
   (default to large number so one must start latency tracing)

  tracing_thresh
    threshold (in usecs) to always print out if irqs off
    is detected to be longer than stated here.
    If irq_thresh is non-zero, then max_irq_latency
    is ignored.

Here's an example of a trace with ftrace_enabled = 0

=======
preemption latency trace v1.1.5 on 2.6.24-rc7
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------
 latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
    -----------------
    | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
    -----------------
 => started at: _spin_lock_irqsave+0x2a/0xb7
 => ended at:   _spin_unlock_irqrestore+0x32/0x5f

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
 swapper-0     1d.s3    0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000])
 swapper-0     1d.s3  100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000])
 swapper-0     1d.s3  100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f)

vim:ft=help
=======

And this is a trace with ftrace_enabled == 1

=======
preemption latency trace v1.1.5 on 2.6.24-rc7
--------------------------------------------------------------------
 latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2)
    -----------------
    | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0)
    -----------------
 => started at: _spin_lock_irqsave+0x2a/0xb7
 => ended at:   _spin_unlock_irqrestore+0x32/0x5f

                 _------=> CPU#
                / _-----=> irqs-off
               | / _----=> need-resched
               || / _---=> hardirq/softirq
               ||| / _--=> preempt-depth
               |||| /
               |||||     delay
   cmd     pid ||||| time  |   caller
      \   /    |||||   \   |   /
 swapper-0     1dNs3    0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000])
 swapper-0     1dNs3   46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000])
 swapper-0     1dNs3   46us : e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000])
 swapper-0     1dNs3   46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000])
 swapper-0     1dNs3   47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000])
 swapper-0     1dNs3   47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47)
 swapper-0     1dNs3   97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50)
 swapper-0     1dNs3   98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000])
 swapper-0     1dNs3   99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] (e1000_swfw_sync_release+0x50/0x55 [e1000])
 swapper-0     1dNs3  101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000])
 swapper-0     1dNs3  102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000])
 swapper-0     1dNs3  102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f)

vim:ft=help
=======

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/fork.c                |   2 +-
 kernel/lockdep.c             |  23 ++-
 kernel/printk.c              |   2 +
 kernel/trace/Kconfig         |  18 ++
 kernel/trace/Makefile        |   1 +
 kernel/trace/trace_irqsoff.c | 402 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 443 insertions(+), 5 deletions(-)
 create mode 100644 kernel/trace/trace_irqsoff.c

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..d66d676dc362 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81a4e4a3f087..e21924365ea3 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -39,6 +39,7 @@
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
 #include <linux/hash.h>
+#include <linux/ftrace.h>
 
 #include <asm/sections.h>
 
@@ -982,7 +983,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 	return 1;
 }
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
  * Forwards and backwards subgraph searching, for the purposes of
  * proving that two subgraphs can be connected by a new dependency
@@ -1680,7 +1681,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		     enum lock_usage_bit new_bit);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 
 /*
  * print irq inversion bug:
@@ -2013,11 +2014,13 @@ void early_boot_irqs_on(void)
 /*
  * Hardirqs will be enabled:
  */
-void trace_hardirqs_on(void)
+void notrace trace_hardirqs_on_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 	unsigned long ip;
 
+	time_hardirqs_on(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2055,16 +2058,23 @@ void trace_hardirqs_on(void)
 	curr->hardirq_enable_event = ++curr->irq_events;
 	debug_atomic_inc(&hardirqs_on_events);
 }
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
+void notrace trace_hardirqs_on(void)
+{
+	trace_hardirqs_on_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 /*
  * Hardirqs were disabled:
  */
-void trace_hardirqs_off(void)
+void notrace trace_hardirqs_off_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 
+	time_hardirqs_off(CALLER_ADDR0, a0);
+
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -2082,7 +2092,12 @@ void trace_hardirqs_off(void)
 	} else
 		debug_atomic_inc(&redundant_hardirqs_off);
 }
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
+void notrace trace_hardirqs_off(void)
+{
+	trace_hardirqs_off_caller(CALLER_ADDR0);
+}
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 /*
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..ae7d5b9e535d 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1041,7 +1041,9 @@ void release_console_sem(void)
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
 		spin_unlock(&logbuf_lock);
+		stop_critical_timings();	/* don't trace print latency */
 		call_console_drivers(_con_start, _log_end);
+		start_critical_timings();
 		local_irq_restore(flags);
 	}
 	console_locked = 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 892ecc94a82b..896df1cf6adc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -26,6 +26,24 @@ config FTRACE
 	  (the bootup default), then the overhead of the instructions is very
 	  small and not measurable even in micro-benchmarks.
 
+config IRQSOFF_TRACER
+	bool "Interrupts-off Latency Tracer"
+	default n
+	depends on TRACE_IRQFLAGS_SUPPORT
+	depends on GENERIC_TIME
+	select TRACE_IRQFLAGS
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in irqs-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
 	depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5508cdb19aea..46be8647fb65 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 000000000000..a9131b0cf1a5
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,402 @@
+/*
+ * trace irqs off criticall timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+static struct trace_array		*irqsoff_trace __read_mostly;
+static int				tracer_enabled __read_mostly;
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesnt
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp	unsigned long max_sequence;
+
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void notrace
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	local_save_flags(flags);
+
+	if (!irqs_disabled_flags(flags))
+		return;
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		ftrace(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = irqsoff_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int notrace report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+static void notrace
+check_critical_timing(struct trace_array *tr,
+		      struct trace_array_cpu *data,
+		      unsigned long parent_ip,
+		      int cpu)
+{
+	unsigned long latency, t0, t1;
+	cycle_t T0, T1, T2, delta;
+	unsigned long flags;
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = now(cpu);
+	delta = T1-T0;
+
+	local_save_flags(flags);
+
+	if (!report_latency(delta))
+		goto out;
+
+	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
+	/*
+	 * Update the timestamp, because the trace entry above
+	 * might change it (it can only get larger so the latency
+	 * is fair to be reported):
+	 */
+	T2 = now(cpu);
+
+	delta = T2-T0;
+
+	latency = nsecs_to_usecs(delta);
+
+	if (data->critical_sequence != max_sequence)
+		goto out;
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	data->critical_end = parent_ip;
+
+	update_max_tr_single(tr, current, cpu);
+
+	if (tracing_thresh)
+		printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section "
+		       "violates %lu us threshold.\n"
+		       " => started at timestamp %lu: ",
+				current->comm, current->pid,
+				raw_smp_processor_id(),
+				latency, nsecs_to_usecs(tracing_thresh), t0);
+	else
+		printk(KERN_INFO "(%16s-%-5d|#%d):"
+		       " new %lu us maximum-latency "
+		       "critical section.\n => started at timestamp %lu: ",
+				current->comm, current->pid,
+				raw_smp_processor_id(),
+				latency, t0);
+
+	print_symbol(KERN_CONT "<%s>\n", data->critical_start);
+	printk(KERN_CONT " =>   ended at timestamp %lu: ", t1);
+	print_symbol(KERN_CONT "<%s>\n", data->critical_end);
+	dump_stack();
+	t1 = nsecs_to_usecs(now(cpu));
+	printk(KERN_CONT " =>   dump-end timestamp %lu\n\n", t1);
+
+	max_sequence++;
+
+out:
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = now(cpu);
+	tracing_reset(data);
+	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
+}
+
+static inline void notrace
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || unlikely(!data->trace) ||
+	    data->critical_start || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = now(cpu);
+	data->critical_start = parent_ip;
+	tracing_reset(data);
+
+	local_save_flags(flags);
+	ftrace(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+}
+
+static inline void notrace
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || unlikely(!data->trace) ||
+	    !data->critical_start || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+	local_save_flags(flags);
+	ftrace(tr, data, ip, parent_ip, flags);
+	check_critical_timing(tr, data, parent_ip, cpu);
+	data->critical_start = 0;
+	atomic_dec(&data->disabled);
+}
+
+void notrace start_critical_timings(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+void notrace stop_critical_timings(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		stop_critical_timing(a0, a1);
+}
+
+void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ */
+void notrace trace_hardirqs_on(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void notrace trace_hardirqs_off(void)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+void notrace trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void notrace trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	unsigned long flags;
+
+	local_save_flags(flags);
+
+	if (irqs_disabled_flags(flags))
+		start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+
+static void start_irqsoff_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 1;
+	register_ftrace_function(&trace_ops);
+}
+
+static void stop_irqsoff_tracer(struct trace_array *tr)
+{
+	unregister_ftrace_function(&trace_ops);
+	tracer_enabled = 0;
+}
+
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+	irqsoff_trace = tr;
+	/* make sure that the tracer is visibel */
+	smp_wmb();
+
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_irqsoff_tracer(tr);
+}
+
+static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_irqsoff_tracer(tr);
+	else
+		stop_irqsoff_tracer(tr);
+}
+
+static void notrace irqsoff_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_irqsoff_tracer(iter->tr);
+}
+
+static void notrace irqsoff_tracer_close(struct trace_iterator *iter)
+{
+	if (iter->tr->ctrl)
+		start_irqsoff_tracer(iter->tr);
+}
+
+static struct tracer irqsoff_tracer __read_mostly =
+{
+	.name		= "irqsoff",
+	.init		= irqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+};
+
+__init static int init_irqsoff_tracer(void)
+{
+	register_tracer(&irqsoff_tracer);
+
+	return 0;
+}
+device_initcall(init_irqsoff_tracer);
-- 
cgit v1.2.3


From 6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: trace preempt off critical timings

Add preempt off timings. A lot of kernel core code is taken from the RT patch
latency trace that was written by Ingo Molnar.

This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers

Now instead of just tracing irqs off, preemption off can be selected
to be recorded.

When this is selected, it shares the same files as irqs off timings.
One can either trace preemption off, irqs off, or one or the other off.

By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording
of preempt off only is performed. "irqsoff" will only record the time
irqs are disabled, but "preemptirqsoff" will take the total time irqs
or preemption are disabled. Runtime switching of these options is now
supported by simpling echoing in the appropriate trace name into
/debugfs/tracing/current_tracer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c               |  24 +++++-
 kernel/trace/Kconfig         |  25 ++++++
 kernel/trace/Makefile        |   1 +
 kernel/trace/trace_irqsoff.c | 184 +++++++++++++++++++++++++++++++------------
 4 files changed, 183 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 73e600852365..328494e28df2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,6 +70,7 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
+#include <linux/ftrace.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -4365,26 +4366,44 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+	if (in_lock_functions(addr)) {
+		addr = CALLER_ADDR2;
+		if (in_lock_functions(addr))
+			addr = CALLER_ADDR3;
+	}
+	return addr;
+}
 
 void __kprobes add_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
 		return;
+#endif
 	preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Spinlock count overflowing soon?
 	 */
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
+#endif
+	if (preempt_count() == val)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
 
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
 	/*
 	 * Underflow?
 	 */
@@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val)
 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
 			!(preempt_count() & PREEMPT_MASK)))
 		return;
+#endif
 
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 896df1cf6adc..6430016b98e8 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,6 +44,31 @@ config IRQSOFF_TRACER
 
 	      echo 0 > /debugfs/tracing/tracing_max_latency
 
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config PREEMPT_TRACER
+	bool "Preemption-off Latency Tracer"
+	default n
+	depends on GENERIC_TIME
+	depends on PREEMPT
+	select TRACING
+	select TRACER_MAX_TRACE
+	help
+	  This option measures the time spent in preemption off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started
+	  via:
+
+	      echo 0 > /debugfs/tracing/tracing_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
 	depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 46be8647fb65..3fec653d6533 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index a9131b0cf1a5..8b1231633dc5 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -21,6 +21,36 @@
 static struct trace_array		*irqsoff_trace __read_mostly;
 static int				tracer_enabled __read_mostly;
 
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+enum {
+	TRACER_IRQS_OFF		= (1 << 1),
+	TRACER_PREEMPT_OFF	= (1 << 2),
+};
+
+static int trace_type __read_mostly;
+
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int notrace
+preempt_trace(void)
+{
+	return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int notrace
+irq_trace(void)
+{
+	return ((trace_type & TRACER_IRQS_OFF) &&
+		irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+
 /*
  * Sequence count - we record it when starting a measurement and
  * skip the latency if the sequence has changed - some other section
@@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	long disabled;
 	int cpu;
 
-	if (likely(!tracer_enabled))
+	if (likely(!__get_cpu_var(tracing_cpu)))
 		return;
 
 	local_save_flags(flags);
 
-	if (!irqs_disabled_flags(flags))
-		return;
-
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
@@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	if (likely(!tracer_enabled))
 		return;
 
+	if (__get_cpu_var(tracing_cpu))
+		return;
+
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
 	if (unlikely(!data) || unlikely(!data->trace) ||
-	    data->critical_start || atomic_read(&data->disabled))
+	    atomic_read(&data->disabled))
 		return;
 
 	atomic_inc(&data->disabled);
 
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = now(cpu);
-	data->critical_start = parent_ip;
+	data->critical_start = parent_ip ? : ip;
 	tracing_reset(data);
 
 	local_save_flags(flags);
+
 	ftrace(tr, data, ip, parent_ip, flags);
 
+	__get_cpu_var(tracing_cpu) = 1;
+
 	atomic_dec(&data->disabled);
 }
 
@@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	struct trace_array_cpu *data;
 	unsigned long flags;
 
-	if (likely(!tracer_enabled))
+	/* Always clear the tracing cpu on stopping the trace */
+	if (unlikely(__get_cpu_var(tracing_cpu)))
+		__get_cpu_var(tracing_cpu) = 0;
+	else
+		return;
+
+	if (!tracer_enabled)
 		return;
 
 	cpu = raw_smp_processor_id();
@@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	atomic_inc(&data->disabled);
 	local_save_flags(flags);
 	ftrace(tr, data, ip, parent_ip, flags);
-	check_critical_timing(tr, data, parent_ip, cpu);
+	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
 }
 
+/* start and stop critical timings used to for stoppage (in idle) */
 void notrace start_critical_timings(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (preempt_trace() || irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 
 void notrace stop_critical_timings(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (preempt_trace() || irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 
+#ifdef CONFIG_IRQSOFF_TRACER
 #ifdef CONFIG_PROVE_LOCKING
 void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
 void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(a0, a1);
 }
 
@@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr)
  */
 void notrace trace_hardirqs_on(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 void notrace trace_hardirqs_off(void)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 void notrace trace_hardirqs_on_caller(unsigned long caller_addr)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
 void notrace trace_hardirqs_off_caller(unsigned long caller_addr)
 {
-	unsigned long flags;
-
-	local_save_flags(flags);
-
-	if (irqs_disabled_flags(flags))
+	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
 #endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void notrace trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	stop_critical_timing(a0, a1);
+}
+
+void notrace trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
 
 static void start_irqsoff_tracer(struct trace_array *tr)
 {
@@ -345,7 +367,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 	tracer_enabled = 0;
 }
 
-static void irqsoff_tracer_init(struct trace_array *tr)
+static void __irqsoff_tracer_init(struct trace_array *tr)
 {
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visibel */
@@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter)
 		start_irqsoff_tracer(iter->tr);
 }
 
+#ifdef CONFIG_IRQSOFF_TRACER
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
 static struct tracer irqsoff_tracer __read_mostly =
 {
 	.name		= "irqsoff",
@@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
 };
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+	.name		= "preemptoff",
+	.init		= preemptoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+	defined(CONFIG_PREEMPT_TRACER)
+
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+}
+
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+	.name		= "preemptirqsoff",
+	.init		= preemptirqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.open		= irqsoff_tracer_open,
+	.close		= irqsoff_tracer_close,
+	.ctrl_update	= irqsoff_tracer_ctrl_update,
+	.print_max	= 1,
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
 
 __init static int init_irqsoff_tracer(void)
 {
-	register_tracer(&irqsoff_tracer);
+	register_irqsoff(irqsoff_tracer);
+	register_preemptoff(preemptoff_tracer);
+	register_preemptirqsoff(preemptirqsoff_tracer);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: dynamic enabling/disabling of function calls

This patch adds a feature to dynamically replace the ftrace code
with the jmps to allow a kernel with ftrace configured to run
as fast as it can without it configured.

The way this works, is on bootup (if ftrace is enabled), a ftrace
function is registered to record the instruction pointer of all
places that call the function.

Later, if there's still any code to patch, a kthread is awoken
(rate limited to at most once a second) that performs a stop_machine,
and replaces all the code that was called with a jmp over the call
to ftrace. It only replaces what was found the previous time. Typically
the system reaches equilibrium quickly after bootup and there's no code
patching needed at all.

e.g.

  call ftrace  /* 5 bytes */

is replaced with

  jmp 3f  /* jmp is 2 bytes and we jump 3 forward */
3:

When we want to enable ftrace for function tracing, the IP recording
is removed, and stop_machine is called again to replace all the locations
of that were recorded back to the call of ftrace.  When it is disabled,
we replace the code back to the jmp.

Allocation is done by the kthread. If the ftrace recording function is
called, and we don't have any record slots available, then we simply
skip that call. Once a second a new page (if needed) is allocated for
recording new ftrace function calls.  A large batch is allocated at
boot up to get most of the calls there.

Because we do this via stop_machine, we don't have to worry about another
CPU executing a ftrace call as we modify it. But we do need to worry
about NMI's so all functions that might be called via nmi must be
annotated with notrace_nmi. When this code is configured in, the NMI code
will not call notrace.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig  |  17 +++
 kernel/trace/ftrace.c | 356 +++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 341 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6430016b98e8..cad9db1dee02 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -88,3 +88,20 @@ config CONTEXT_SWITCH_TRACER
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
 
+config DYNAMIC_FTRACE
+	bool "enable/disable ftrace tracepoints dynamically"
+	depends on FTRACE
+	default y
+	help
+         This option will modify all the calls to ftrace dynamically
+	 (will patch them out of the binary image and replaces them
+	 with a No-Op instruction) as they are called. A table is
+	 created to dynamically enable them again.
+
+	 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+	 has native performance as long as no tracing is active.
+
+	 The changes to the code are done by a kernel thread that
+	 wakes up once a second and checks to see if any ftrace calls
+	 were made. If so, it runs stop_machine (stops all CPUS)
+	 and modifies the code to jump over the call to ftrace.
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b6a80b98a3fb..d1ae2ba25274 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -13,10 +13,19 @@
  *  Copyright (C) 2004 William Lee Irwin III
  */
 
-#include <linux/module.h>
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/kthread.h>
+#include <linux/hardirq.h>
 #include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+
+#include "trace.h"
 
-static DEFINE_SPINLOCK(ftrace_func_lock);
+static DEFINE_SPINLOCK(ftrace_lock);
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
 	.func = ftrace_stub,
@@ -44,21 +53,21 @@ notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 }
 
 /**
- * register_ftrace_function - register a function for profiling
- * @ops - ops structure that holds the function for profiling.
- *
- * Register a function to be called by all functions in the
- * kernel.
+ * clear_ftrace_function - reset the ftrace function
  *
- * Note: @ops->func and all the functions it calls must be labeled
- *       with "notrace", otherwise it will go into a
- *       recursive loop.
+ * This NULLs the ftrace function and in essence stops
+ * tracing.  There may be lag
  */
-int register_ftrace_function(struct ftrace_ops *ops)
+void clear_ftrace_function(void)
 {
-	unsigned long flags;
+	ftrace_trace_function = ftrace_stub;
+}
+
+static int notrace __register_ftrace_function(struct ftrace_ops *ops)
+{
+	/* Should never be called by interrupts */
+	spin_lock(&ftrace_lock);
 
-	spin_lock_irqsave(&ftrace_func_lock, flags);
 	ops->next = ftrace_list;
 	/*
 	 * We are entering ops into the ftrace_list but another
@@ -68,6 +77,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 	 */
 	smp_wmb();
 	ftrace_list = ops;
+
 	/*
 	 * For one func, simply call it directly.
 	 * For more than one func, call the chain.
@@ -76,28 +86,22 @@ int register_ftrace_function(struct ftrace_ops *ops)
 		ftrace_trace_function = ops->func;
 	else
 		ftrace_trace_function = ftrace_list_func;
-	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	spin_unlock(&ftrace_lock);
 
 	return 0;
 }
 
-/**
- * unregister_ftrace_function - unresgister a function for profiling.
- * @ops - ops structure that holds the function to unregister
- *
- * Unregister a function that was added to be called by ftrace profiling.
- */
-int unregister_ftrace_function(struct ftrace_ops *ops)
+static int notrace __unregister_ftrace_function(struct ftrace_ops *ops)
 {
-	unsigned long flags;
 	struct ftrace_ops **p;
 	int ret = 0;
 
-	spin_lock_irqsave(&ftrace_func_lock, flags);
+	spin_lock(&ftrace_lock);
 
 	/*
-	 * If we are the only function, then the ftrace pointer is
-	 * pointing directly to that function.
+	 * If we are removing the last function, then simply point
+	 * to the ftrace_stub.
 	 */
 	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
 		ftrace_trace_function = ftrace_stub;
@@ -117,22 +121,310 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 	*p = (*p)->next;
 
 	/* If we only have one func left, then call that directly */
-	if (ftrace_list->next == &ftrace_list_end)
+	if (ftrace_list == &ftrace_list_end ||
+	    ftrace_list->next == &ftrace_list_end)
 		ftrace_trace_function = ftrace_list->func;
 
  out:
-	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+	spin_unlock(&ftrace_lock);
+
+	return ret;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
+
+static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
+
+static DEFINE_SPINLOCK(ftrace_shutdown_lock);
+static DEFINE_MUTEX(ftraced_lock);
+
+static int ftraced_trigger;
+static int ftraced_suspend;
+
+static int ftrace_record_suspend;
+
+static inline int
+notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key)
+{
+	struct dyn_ftrace *p;
+	struct hlist_node *t;
+	int found = 0;
+
+	hlist_for_each_entry(p, t, &ftrace_hash[key], node) {
+		if (p->ip == ip) {
+			found = 1;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static inline void notrace
+ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
+{
+	hlist_add_head(&node->node, &ftrace_hash[key]);
+}
+
+static void notrace
+ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
+{
+	struct dyn_ftrace *node;
+	unsigned long flags;
+	unsigned long key;
+	int resched;
+	int atomic;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	/* We simply need to protect against recursion */
+	__get_cpu_var(ftrace_shutdown_disable_cpu)++;
+	if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1)
+		goto out;
+
+	if (unlikely(ftrace_record_suspend))
+		goto out;
+
+	key = hash_long(ip, FTRACE_HASHBITS);
+
+	WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
+
+	if (ftrace_ip_in_hash(ip, key))
+		goto out;
+
+	atomic = irqs_disabled();
+
+	spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+
+	/* This ip may have hit the hash before the lock */
+	if (ftrace_ip_in_hash(ip, key))
+		goto out_unlock;
+
+	/*
+	 * There's a slight race that the ftraced will update the
+	 * hash and reset here. The arch alloc is responsible
+	 * for seeing if the IP has already changed, and if
+	 * it has, the alloc will fail.
+	 */
+	node = ftrace_alloc_shutdown_node(ip);
+	if (!node)
+		goto out_unlock;
+
+	node->ip = ip;
+
+	ftrace_add_hash(node, key);
+
+	ftraced_trigger = 1;
+
+ out_unlock:
+	spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ out:
+	__get_cpu_var(ftrace_shutdown_disable_cpu)--;
+
+	/* prevent recursion with scheduler */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static struct ftrace_ops ftrace_shutdown_ops __read_mostly =
+{
+	.func = ftrace_record_ip,
+};
+
+
+static int notrace __ftrace_modify_code(void *data)
+{
+	void (*func)(void) = data;
+
+	func();
+	return 0;
+}
+
+static void notrace ftrace_run_startup_code(void)
+{
+	stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS);
+}
+
+static void notrace ftrace_run_shutdown_code(void)
+{
+	stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS);
+}
+
+static void notrace ftrace_startup(void)
+{
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend++;
+	if (ftraced_suspend != 1)
+		goto out;
+	__unregister_ftrace_function(&ftrace_shutdown_ops);
+
+	ftrace_run_startup_code();
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static void notrace ftrace_shutdown(void)
+{
+	mutex_lock(&ftraced_lock);
+	ftraced_suspend--;
+	if (ftraced_suspend)
+		goto out;
+
+	ftrace_run_shutdown_code();
+
+	__register_ftrace_function(&ftrace_shutdown_ops);
+ out:
+	mutex_unlock(&ftraced_lock);
+}
+
+static cycle_t		ftrace_update_time;
+static unsigned long	ftrace_update_cnt;
+unsigned long		ftrace_update_tot_cnt;
+
+static int notrace __ftrace_update_code(void *ignore)
+{
+	struct dyn_ftrace *p;
+	struct hlist_head head;
+	struct hlist_node *t;
+	cycle_t start, stop;
+	int i;
+
+	/* Don't be calling ftrace ops now */
+	__unregister_ftrace_function(&ftrace_shutdown_ops);
+
+	start = now(raw_smp_processor_id());
+	ftrace_update_cnt = 0;
+
+	/* No locks needed, the machine is stopped! */
+	for (i = 0; i < FTRACE_HASHSIZE; i++) {
+		if (hlist_empty(&ftrace_hash[i]))
+			continue;
+
+		head = ftrace_hash[i];
+		INIT_HLIST_HEAD(&ftrace_hash[i]);
+
+		/* all CPUS are stopped, we are safe to modify code */
+		hlist_for_each_entry(p, t, &head, node) {
+			ftrace_code_disable(p);
+			ftrace_update_cnt++;
+		}
+
+	}
+
+	stop = now(raw_smp_processor_id());
+	ftrace_update_time = stop - start;
+	ftrace_update_tot_cnt += ftrace_update_cnt;
+
+	__register_ftrace_function(&ftrace_shutdown_ops);
 
 	return 0;
 }
 
+static void notrace ftrace_update_code(void)
+{
+	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+}
+
+static int notrace ftraced(void *ignore)
+{
+	unsigned long usecs;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+
+		/* check once a second */
+		schedule_timeout(HZ);
+
+		mutex_lock(&ftraced_lock);
+		if (ftraced_trigger && !ftraced_suspend) {
+			ftrace_record_suspend++;
+			ftrace_update_code();
+			usecs = nsecs_to_usecs(ftrace_update_time);
+			if (ftrace_update_tot_cnt > 100000) {
+				ftrace_update_tot_cnt = 0;
+				pr_info("hm, dftrace overflow: %lu change%s"
+					 " (%lu total) in %lu usec%s\n",
+					ftrace_update_cnt,
+					ftrace_update_cnt != 1 ? "s" : "",
+					ftrace_update_tot_cnt,
+					usecs, usecs != 1 ? "s" : "");
+				WARN_ON_ONCE(1);
+			}
+			ftraced_trigger = 0;
+			ftrace_record_suspend--;
+		}
+		mutex_unlock(&ftraced_lock);
+
+		ftrace_shutdown_replenish();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __init notrace ftrace_shutdown_init(void)
+{
+	struct task_struct *p;
+	int ret;
+
+	ret = ftrace_shutdown_arch_init();
+	if (ret)
+		return ret;
+
+	p = kthread_run(ftraced, NULL, "ftraced");
+	if (IS_ERR(p))
+		return -1;
+
+	__register_ftrace_function(&ftrace_shutdown_ops);
+
+	return 0;
+}
+
+core_initcall(ftrace_shutdown_init);
+#else
+# define ftrace_startup()	do { } while (0)
+# define ftrace_shutdown()	do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
 /**
- * clear_ftrace_function - reset the ftrace function
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
  *
- * This NULLs the ftrace function and in essence stops
- * tracing.  There may be lag
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
  */
-void clear_ftrace_function(void)
+int register_ftrace_function(struct ftrace_ops *ops)
 {
-	ftrace_trace_function = ftrace_stub;
+	ftrace_startup();
+
+	return __register_ftrace_function(ops);
+}
+
+/**
+ * unregister_ftrace_function - unresgister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	int ret;
+
+	ret = __unregister_ftrace_function(ops);
+
+	if (ftrace_list == &ftrace_list_end)
+		ftrace_shutdown();
+
+	return ret;
 }
-- 
cgit v1.2.3


From b0fc494fae96a7089f3651cb451f461c7291244c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: add ftrace_enabled sysctl to disable mcount function

This patch adds back the sysctl ftrace_enabled. This time it is
defaulted to on, if DYNAMIC_FTRACE is configured. When ftrace_enabled
is disabled, the ftrace function is set to the stub return.

If DYNAMIC_FTRACE is also configured, on ftrace_enabled = 0,
the registered ftrace functions will all be set to jmps, but no more
new calls to ftrace recording (used to find the ftrace calling sites)
will be called.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sysctl.c       |  11 +++++
 kernel/trace/ftrace.c | 125 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 118 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..efaf7c5500e9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -46,6 +46,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -455,6 +456,16 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_FTRACE
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "ftrace_enabled",
+		.data		= &ftrace_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &ftrace_enable_sysctl,
+	},
+#endif
 #ifdef CONFIG_KMOD
 	{
 		.ctl_name	= KERN_MODPROBE,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d1ae2ba25274..d3de37299ba4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -20,12 +20,24 @@
 #include <linux/hardirq.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
+#include <linux/sysctl.h>
 #include <linux/hash.h>
 #include <linux/list.h>
 
 #include "trace.h"
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+# define FTRACE_ENABLED_INIT 1
+#else
+# define FTRACE_ENABLED_INIT 0
+#endif
+
+int ftrace_enabled = FTRACE_ENABLED_INIT;
+static int last_ftrace_enabled = FTRACE_ENABLED_INIT;
+
 static DEFINE_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_sysctl_lock);
+
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
 	.func = ftrace_stub,
@@ -78,14 +90,16 @@ static int notrace __register_ftrace_function(struct ftrace_ops *ops)
 	smp_wmb();
 	ftrace_list = ops;
 
-	/*
-	 * For one func, simply call it directly.
-	 * For more than one func, call the chain.
-	 */
-	if (ops->next == &ftrace_list_end)
-		ftrace_trace_function = ops->func;
-	else
-		ftrace_trace_function = ftrace_list_func;
+	if (ftrace_enabled) {
+		/*
+		 * For one func, simply call it directly.
+		 * For more than one func, call the chain.
+		 */
+		if (ops->next == &ftrace_list_end)
+			ftrace_trace_function = ops->func;
+		else
+			ftrace_trace_function = ftrace_list_func;
+	}
 
 	spin_unlock(&ftrace_lock);
 
@@ -120,10 +134,12 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops)
 
 	*p = (*p)->next;
 
-	/* If we only have one func left, then call that directly */
-	if (ftrace_list == &ftrace_list_end ||
-	    ftrace_list->next == &ftrace_list_end)
-		ftrace_trace_function = ftrace_list->func;
+	if (ftrace_enabled) {
+		/* If we only have one func left, then call that directly */
+		if (ftrace_list == &ftrace_list_end ||
+		    ftrace_list->next == &ftrace_list_end)
+			ftrace_trace_function = ftrace_list->func;
+	}
 
  out:
 	spin_unlock(&ftrace_lock);
@@ -263,7 +279,8 @@ static void notrace ftrace_startup(void)
 		goto out;
 	__unregister_ftrace_function(&ftrace_shutdown_ops);
 
-	ftrace_run_startup_code();
+	if (ftrace_enabled)
+		ftrace_run_startup_code();
  out:
 	mutex_unlock(&ftraced_lock);
 }
@@ -275,13 +292,32 @@ static void notrace ftrace_shutdown(void)
 	if (ftraced_suspend)
 		goto out;
 
-	ftrace_run_shutdown_code();
+	if (ftrace_enabled)
+		ftrace_run_shutdown_code();
 
 	__register_ftrace_function(&ftrace_shutdown_ops);
  out:
 	mutex_unlock(&ftraced_lock);
 }
 
+static void notrace ftrace_startup_sysctl(void)
+{
+	mutex_lock(&ftraced_lock);
+	/* ftraced_suspend is true if we want ftrace running */
+	if (ftraced_suspend)
+		ftrace_run_startup_code();
+	mutex_unlock(&ftraced_lock);
+}
+
+static void notrace ftrace_shutdown_sysctl(void)
+{
+	mutex_lock(&ftraced_lock);
+	/* ftraced_suspend is true if ftrace is running */
+	if (ftraced_suspend)
+		ftrace_run_shutdown_code();
+	mutex_unlock(&ftraced_lock);
+}
+
 static cycle_t		ftrace_update_time;
 static unsigned long	ftrace_update_cnt;
 unsigned long		ftrace_update_tot_cnt;
@@ -341,8 +377,9 @@ static int notrace ftraced(void *ignore)
 		/* check once a second */
 		schedule_timeout(HZ);
 
+		mutex_lock(&ftrace_sysctl_lock);
 		mutex_lock(&ftraced_lock);
-		if (ftraced_trigger && !ftraced_suspend) {
+		if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) {
 			ftrace_record_suspend++;
 			ftrace_update_code();
 			usecs = nsecs_to_usecs(ftrace_update_time);
@@ -360,6 +397,7 @@ static int notrace ftraced(void *ignore)
 			ftrace_record_suspend--;
 		}
 		mutex_unlock(&ftraced_lock);
+		mutex_unlock(&ftrace_sysctl_lock);
 
 		ftrace_shutdown_replenish();
 
@@ -389,8 +427,10 @@ static int __init notrace ftrace_shutdown_init(void)
 
 core_initcall(ftrace_shutdown_init);
 #else
-# define ftrace_startup()	do { } while (0)
-# define ftrace_shutdown()	do { } while (0)
+# define ftrace_startup()	  do { } while (0)
+# define ftrace_shutdown()	  do { } while (0)
+# define ftrace_startup_sysctl()  do { } while (0)
+# define ftrace_shutdown_sysctl() do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /**
@@ -406,9 +446,15 @@ core_initcall(ftrace_shutdown_init);
  */
 int register_ftrace_function(struct ftrace_ops *ops)
 {
+	int ret;
+
+	mutex_lock(&ftrace_sysctl_lock);
 	ftrace_startup();
 
-	return __register_ftrace_function(ops);
+	ret = __register_ftrace_function(ops);
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
 }
 
 /**
@@ -421,10 +467,53 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
+	mutex_lock(&ftrace_sysctl_lock);
 	ret = __unregister_ftrace_function(ops);
 
 	if (ftrace_list == &ftrace_list_end)
 		ftrace_shutdown();
 
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	return ret;
+}
+
+notrace int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+		     struct file *filp, void __user *buffer, size_t *lenp,
+		     loff_t *ppos)
+{
+	int ret;
+
+	mutex_lock(&ftrace_sysctl_lock);
+
+	ret  = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
+		goto out;
+
+	last_ftrace_enabled = ftrace_enabled;
+
+	if (ftrace_enabled) {
+
+		ftrace_startup_sysctl();
+
+		/* we are starting ftrace again */
+		if (ftrace_list != &ftrace_list_end) {
+			if (ftrace_list->next == &ftrace_list_end)
+				ftrace_trace_function = ftrace_list->func;
+			else
+				ftrace_trace_function = ftrace_list_func;
+		}
+
+	} else {
+		/* stopping ftrace calls (just send to ftrace_stub) */
+		ftrace_trace_function = ftrace_stub;
+
+		ftrace_shutdown_sysctl();
+	}
+
+ out:
+	mutex_unlock(&ftrace_sysctl_lock);
 	return ret;
 }
-- 
cgit v1.2.3


From 3c1720f00bb619302ba19d55986ab565e74d06db Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: move memory management out of arch code

This patch moves the memory management of the ftrace
records out of the arch code and into the generic code
making the arch code simpler.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 154 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 152 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d3de37299ba4..f6d9af3bf66b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -156,6 +156,21 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
 static DEFINE_SPINLOCK(ftrace_shutdown_lock);
 static DEFINE_MUTEX(ftraced_lock);
 
+struct ftrace_page {
+	struct ftrace_page	*next;
+	int			index;
+	struct dyn_ftrace	records[];
+} __attribute__((packed));
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
 static int ftraced_trigger;
 static int ftraced_suspend;
 
@@ -184,6 +199,21 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
 	hlist_add_head(&node->node, &ftrace_hash[key]);
 }
 
+static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
+{
+	/* If this was already converted, skip it */
+	if (ftrace_ip_converted(ip))
+		return NULL;
+
+	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+		if (!ftrace_pages->next)
+			return NULL;
+		ftrace_pages = ftrace_pages->next;
+	}
+
+	return &ftrace_pages->records[ftrace_pages->index++];
+}
+
 static void notrace
 ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
 {
@@ -252,6 +282,62 @@ static struct ftrace_ops ftrace_shutdown_ops __read_mostly =
 	.func = ftrace_record_ip,
 };
 
+#define MCOUNT_ADDR ((long)(&mcount))
+
+static void notrace ftrace_replace_code(int saved)
+{
+	unsigned char *new = NULL, *old = NULL;
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	unsigned long ip;
+	int failed;
+	int i;
+
+	if (saved)
+		old = ftrace_nop_replace();
+	else
+		new = ftrace_nop_replace();
+
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+
+			/* don't modify code that has already faulted */
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+
+			ip = rec->ip;
+
+			if (saved)
+				new = ftrace_call_replace(ip, MCOUNT_ADDR);
+			else
+				old = ftrace_call_replace(ip, MCOUNT_ADDR);
+
+			failed = ftrace_modify_code(ip, old, new);
+			if (failed)
+				rec->flags |= FTRACE_FL_FAILED;
+		}
+	}
+}
+
+static notrace void ftrace_startup_code(void)
+{
+	ftrace_replace_code(1);
+}
+
+static notrace void ftrace_shutdown_code(void)
+{
+	ftrace_replace_code(0);
+}
+
+static notrace void ftrace_shutdown_replenish(void)
+{
+	if (ftrace_pages->next)
+		return;
+
+	/* allocate another page */
+	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+}
 
 static int notrace __ftrace_modify_code(void *data)
 {
@@ -261,6 +347,23 @@ static int notrace __ftrace_modify_code(void *data)
 	return 0;
 }
 
+static notrace void
+ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr)
+{
+	unsigned long ip;
+	unsigned char *nop, *call;
+	int failed;
+
+	ip = rec->ip;
+
+	nop = ftrace_nop_replace();
+	call = ftrace_call_replace(ip, addr);
+
+	failed = ftrace_modify_code(ip, call, nop);
+	if (failed)
+		rec->flags |= FTRACE_FL_FAILED;
+}
+
 static void notrace ftrace_run_startup_code(void)
 {
 	stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS);
@@ -346,7 +449,7 @@ static int notrace __ftrace_update_code(void *ignore)
 
 		/* all CPUS are stopped, we are safe to modify code */
 		hlist_for_each_entry(p, t, &head, node) {
-			ftrace_code_disable(p);
+			ftrace_code_disable(p, MCOUNT_ADDR);
 			ftrace_update_cnt++;
 		}
 
@@ -407,12 +510,59 @@ static int notrace ftraced(void *ignore)
 	return 0;
 }
 
+static int __init ftrace_dyn_table_alloc(void)
+{
+	struct ftrace_page *pg;
+	int cnt;
+	int i;
+	int ret;
+
+	ret = ftrace_dyn_arch_init();
+	if (ret)
+		return ret;
+
+	/* allocate a few pages */
+	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ftrace_pages_start)
+		return -1;
+
+	/*
+	 * Allocate a few more pages.
+	 *
+	 * TODO: have some parser search vmlinux before
+	 *   final linking to find all calls to ftrace.
+	 *   Then we can:
+	 *    a) know how many pages to allocate.
+	 *     and/or
+	 *    b) set up the table then.
+	 *
+	 *  The dynamic code is still necessary for
+	 *  modules.
+	 */
+
+	pg = ftrace_pages = ftrace_pages_start;
+
+	cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+
+	for (i = 0; i < cnt; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+
+		/* If we fail, we'll try later anyway */
+		if (!pg->next)
+			break;
+
+		pg = pg->next;
+	}
+
+	return 0;
+}
+
 static int __init notrace ftrace_shutdown_init(void)
 {
 	struct task_struct *p;
 	int ret;
 
-	ret = ftrace_shutdown_arch_init();
+	ret = ftrace_dyn_table_alloc();
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From d61f82d06672f57fca410da6f7fffd15867db622 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: use dynamic patching for updating mcount calls

This patch replaces the indirect call to the mcount function
pointer with a direct call that will be patched by the
dynamic ftrace routines.

On boot up, the mcount function calls the ftace_stub function.
When the dynamic ftrace code is initialized, the ftrace_stub
is replaced with a call to the ftrace_record_ip, which records
the instruction pointers of the locations that call it.

Later, the ftraced daemon will call kstop_machine and patch all
the locations to nops.

When a ftrace is enabled, the original calls to mcount will now
be set top call ftrace_caller, which will do a direct call
to the registered ftrace function. This direct call is also patched
when the function that should be called is updated.

All patching is performed by a kstop_machine routine to prevent any
type of race conditions that is associated with modifying code
on the fly.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 183 +++++++++++++++++++++++++++++---------------------
 1 file changed, 105 insertions(+), 78 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f6d9af3bf66b..88544f9bc0ed 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -26,14 +26,8 @@
 
 #include "trace.h"
 
-#ifdef CONFIG_DYNAMIC_FTRACE
-# define FTRACE_ENABLED_INIT 1
-#else
-# define FTRACE_ENABLED_INIT 0
-#endif
-
-int ftrace_enabled = FTRACE_ENABLED_INIT;
-static int last_ftrace_enabled = FTRACE_ENABLED_INIT;
+int ftrace_enabled;
+static int last_ftrace_enabled;
 
 static DEFINE_SPINLOCK(ftrace_lock);
 static DEFINE_MUTEX(ftrace_sysctl_lock);
@@ -149,6 +143,14 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+enum {
+	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_ENABLE_MCOUNT		= (1 << 3),
+	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+};
+
 static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
 
 static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
@@ -199,12 +201,8 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
 	hlist_add_head(&node->node, &ftrace_hash[key]);
 }
 
-static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
+static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
-	/* If this was already converted, skip it */
-	if (ftrace_ip_converted(ip))
-		return NULL;
-
 	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
 		if (!ftrace_pages->next)
 			return NULL;
@@ -215,7 +213,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip)
 }
 
 static void notrace
-ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
+ftrace_record_ip(unsigned long ip)
 {
 	struct dyn_ftrace *node;
 	unsigned long flags;
@@ -223,6 +221,9 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
 	int resched;
 	int atomic;
 
+	if (!ftrace_enabled)
+		return;
+
 	resched = need_resched();
 	preempt_disable_notrace();
 
@@ -251,11 +252,12 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
 
 	/*
 	 * There's a slight race that the ftraced will update the
-	 * hash and reset here. The arch alloc is responsible
-	 * for seeing if the IP has already changed, and if
-	 * it has, the alloc will fail.
+	 * hash and reset here. If it is already converted, skip it.
 	 */
-	node = ftrace_alloc_shutdown_node(ip);
+	if (ftrace_ip_converted(ip))
+		goto out_unlock;
+
+	node = ftrace_alloc_dyn_node(ip);
 	if (!node)
 		goto out_unlock;
 
@@ -277,11 +279,7 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip)
 		preempt_enable_notrace();
 }
 
-static struct ftrace_ops ftrace_shutdown_ops __read_mostly =
-{
-	.func = ftrace_record_ip,
-};
-
+#define FTRACE_ADDR ((long)(&ftrace_caller))
 #define MCOUNT_ADDR ((long)(&mcount))
 
 static void notrace ftrace_replace_code(int saved)
@@ -309,9 +307,9 @@ static void notrace ftrace_replace_code(int saved)
 			ip = rec->ip;
 
 			if (saved)
-				new = ftrace_call_replace(ip, MCOUNT_ADDR);
+				new = ftrace_call_replace(ip, FTRACE_ADDR);
 			else
-				old = ftrace_call_replace(ip, MCOUNT_ADDR);
+				old = ftrace_call_replace(ip, FTRACE_ADDR);
 
 			failed = ftrace_modify_code(ip, old, new);
 			if (failed)
@@ -320,16 +318,6 @@ static void notrace ftrace_replace_code(int saved)
 	}
 }
 
-static notrace void ftrace_startup_code(void)
-{
-	ftrace_replace_code(1);
-}
-
-static notrace void ftrace_shutdown_code(void)
-{
-	ftrace_replace_code(0);
-}
-
 static notrace void ftrace_shutdown_replenish(void)
 {
 	if (ftrace_pages->next)
@@ -339,16 +327,8 @@ static notrace void ftrace_shutdown_replenish(void)
 	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
 }
 
-static int notrace __ftrace_modify_code(void *data)
-{
-	void (*func)(void) = data;
-
-	func();
-	return 0;
-}
-
 static notrace void
-ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr)
+ftrace_code_disable(struct dyn_ftrace *rec)
 {
 	unsigned long ip;
 	unsigned char *nop, *call;
@@ -357,67 +337,113 @@ ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr)
 	ip = rec->ip;
 
 	nop = ftrace_nop_replace();
-	call = ftrace_call_replace(ip, addr);
+	call = ftrace_call_replace(ip, MCOUNT_ADDR);
 
 	failed = ftrace_modify_code(ip, call, nop);
 	if (failed)
 		rec->flags |= FTRACE_FL_FAILED;
 }
 
-static void notrace ftrace_run_startup_code(void)
+static int notrace __ftrace_modify_code(void *data)
 {
-	stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS);
+	unsigned long addr;
+	int *command = data;
+
+	if (*command & FTRACE_ENABLE_CALLS)
+		ftrace_replace_code(1);
+	else if (*command & FTRACE_DISABLE_CALLS)
+		ftrace_replace_code(0);
+
+	if (*command & FTRACE_UPDATE_TRACE_FUNC)
+		ftrace_update_ftrace_func(ftrace_trace_function);
+
+	if (*command & FTRACE_ENABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_record_ip;
+		ftrace_mcount_set(&addr);
+	} else if (*command & FTRACE_DISABLE_MCOUNT) {
+		addr = (unsigned long)ftrace_stub;
+		ftrace_mcount_set(&addr);
+	}
+
+	return 0;
 }
 
-static void notrace ftrace_run_shutdown_code(void)
+static void notrace ftrace_run_update_code(int command)
 {
-	stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS);
+	stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
 }
 
+static ftrace_func_t saved_ftrace_func;
+
 static void notrace ftrace_startup(void)
 {
+	int command = 0;
+
 	mutex_lock(&ftraced_lock);
 	ftraced_suspend++;
-	if (ftraced_suspend != 1)
+	if (ftraced_suspend == 1)
+		command |= FTRACE_ENABLE_CALLS;
+
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
+
+	if (!command || !ftrace_enabled)
 		goto out;
-	__unregister_ftrace_function(&ftrace_shutdown_ops);
 
-	if (ftrace_enabled)
-		ftrace_run_startup_code();
+	ftrace_run_update_code(command);
  out:
 	mutex_unlock(&ftraced_lock);
 }
 
 static void notrace ftrace_shutdown(void)
 {
+	int command = 0;
+
 	mutex_lock(&ftraced_lock);
 	ftraced_suspend--;
-	if (ftraced_suspend)
-		goto out;
+	if (!ftraced_suspend)
+		command |= FTRACE_DISABLE_CALLS;
 
-	if (ftrace_enabled)
-		ftrace_run_shutdown_code();
+	if (saved_ftrace_func != ftrace_trace_function) {
+		saved_ftrace_func = ftrace_trace_function;
+		command |= FTRACE_UPDATE_TRACE_FUNC;
+	}
 
-	__register_ftrace_function(&ftrace_shutdown_ops);
+	if (!command || !ftrace_enabled)
+		goto out;
+
+	ftrace_run_update_code(command);
  out:
 	mutex_unlock(&ftraced_lock);
 }
 
 static void notrace ftrace_startup_sysctl(void)
 {
+	int command = FTRACE_ENABLE_MCOUNT;
+
 	mutex_lock(&ftraced_lock);
+	/* Force update next time */
+	saved_ftrace_func = NULL;
 	/* ftraced_suspend is true if we want ftrace running */
 	if (ftraced_suspend)
-		ftrace_run_startup_code();
+		command |= FTRACE_ENABLE_CALLS;
+
+	ftrace_run_update_code(command);
 	mutex_unlock(&ftraced_lock);
 }
 
 static void notrace ftrace_shutdown_sysctl(void)
 {
+	int command = FTRACE_DISABLE_MCOUNT;
+
 	mutex_lock(&ftraced_lock);
 	/* ftraced_suspend is true if ftrace is running */
 	if (ftraced_suspend)
-		ftrace_run_shutdown_code();
+		command |= FTRACE_DISABLE_CALLS;
+
+	ftrace_run_update_code(command);
 	mutex_unlock(&ftraced_lock);
 }
 
@@ -430,11 +456,13 @@ static int notrace __ftrace_update_code(void *ignore)
 	struct dyn_ftrace *p;
 	struct hlist_head head;
 	struct hlist_node *t;
+	int save_ftrace_enabled;
 	cycle_t start, stop;
 	int i;
 
-	/* Don't be calling ftrace ops now */
-	__unregister_ftrace_function(&ftrace_shutdown_ops);
+	/* Don't be recording funcs now */
+	save_ftrace_enabled = ftrace_enabled;
+	ftrace_enabled = 0;
 
 	start = now(raw_smp_processor_id());
 	ftrace_update_cnt = 0;
@@ -449,7 +477,7 @@ static int notrace __ftrace_update_code(void *ignore)
 
 		/* all CPUS are stopped, we are safe to modify code */
 		hlist_for_each_entry(p, t, &head, node) {
-			ftrace_code_disable(p, MCOUNT_ADDR);
+			ftrace_code_disable(p);
 			ftrace_update_cnt++;
 		}
 
@@ -459,7 +487,7 @@ static int notrace __ftrace_update_code(void *ignore)
 	ftrace_update_time = stop - start;
 	ftrace_update_tot_cnt += ftrace_update_cnt;
 
-	__register_ftrace_function(&ftrace_shutdown_ops);
+	ftrace_enabled = save_ftrace_enabled;
 
 	return 0;
 }
@@ -515,11 +543,6 @@ static int __init ftrace_dyn_table_alloc(void)
 	struct ftrace_page *pg;
 	int cnt;
 	int i;
-	int ret;
-
-	ret = ftrace_dyn_arch_init();
-	if (ret)
-		return ret;
 
 	/* allocate a few pages */
 	ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
@@ -557,11 +580,19 @@ static int __init ftrace_dyn_table_alloc(void)
 	return 0;
 }
 
-static int __init notrace ftrace_shutdown_init(void)
+static int __init notrace ftrace_dynamic_init(void)
 {
 	struct task_struct *p;
+	unsigned long addr;
 	int ret;
 
+	addr = (unsigned long)ftrace_record_ip;
+	stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+
+	/* ftrace_dyn_arch_init places the return code in addr */
+	if (addr)
+		return addr;
+
 	ret = ftrace_dyn_table_alloc();
 	if (ret)
 		return ret;
@@ -570,12 +601,12 @@ static int __init notrace ftrace_shutdown_init(void)
 	if (IS_ERR(p))
 		return -1;
 
-	__register_ftrace_function(&ftrace_shutdown_ops);
+	last_ftrace_enabled = ftrace_enabled = 1;
 
 	return 0;
 }
 
-core_initcall(ftrace_shutdown_init);
+core_initcall(ftrace_dynamic_init);
 #else
 # define ftrace_startup()	  do { } while (0)
 # define ftrace_shutdown()	  do { } while (0)
@@ -599,9 +630,8 @@ int register_ftrace_function(struct ftrace_ops *ops)
 	int ret;
 
 	mutex_lock(&ftrace_sysctl_lock);
-	ftrace_startup();
-
 	ret = __register_ftrace_function(ops);
+	ftrace_startup();
 	mutex_unlock(&ftrace_sysctl_lock);
 
 	return ret;
@@ -619,10 +649,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
 	mutex_lock(&ftrace_sysctl_lock);
 	ret = __unregister_ftrace_function(ops);
-
-	if (ftrace_list == &ftrace_list_end)
-		ftrace_shutdown();
-
+	ftrace_shutdown();
 	mutex_unlock(&ftrace_sysctl_lock);
 
 	return ret;
-- 
cgit v1.2.3


From 5072c59fd45e9976d02ee6f18c7336ef97623cbc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: add filter select functions to trace

This patch adds two files to the debugfs system:

 /debugfs/tracing/available_filter_functions

and

 /debugfs/tracing/set_ftrace_filter

The available_filter_functions lists all functions that has been
recorded by the ftraced that has called the ftrace_record_ip function.
This is to allow users to see what functions have been converted
to nops and can be enabled for tracing.

To enable functions, simply echo the names (whitespace delimited)
into set_ftrace_filter. Simple wildcards are also allowed.

echo 'scheduler' > /debugfs/tracing/set_ftrace_filter

Will have only the scheduler be activated when tracing is enabled.

echo 'sched_*' > /debugfs/tracing/set_ftrace_filter

Will have only the functions starting with 'sched_' be activated.

echo '*lock' > /debugfs/tracing/set_ftrace_filter

Will have only functions ending with 'lock' be activated.

echo '*lock*' > /debugfs/tracing/set_ftrace_filter

Will have only functions with 'lock' in its name be activated.

Note: 'sched*lock' will not work. The only wildcards that are
allowed is an asterisk and the beginning and or end of the string
passed in.

Multiple names can be passed in with whitespace delimited:

echo 'scheduler *lock *acpi*' > /debugfs/tracing/set_ftrace_filter

is also the same as:

echo 'scheduler' > /debugfs/tracing/set_ftrace_filter
echo '*lock' >> /debugfs/tracing/set_ftrace_filter
echo '*acpi*' >> /debugfs/tracing/set_ftrace_filter

Appending does just that. It appends to the list.

To disable all filters simply echo an empty line in:

echo > /debugfs/tracing/set_ftrace_filter

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 527 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 510 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 88544f9bc0ed..97d5cb7b7e75 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -16,12 +16,15 @@
 #include <linux/stop_machine.h>
 #include <linux/clocksource.h>
 #include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
 #include <linux/kthread.h>
 #include <linux/hardirq.h>
 #include <linux/ftrace.h>
-#include <linux/module.h>
+#include <linux/uaccess.h>
 #include <linux/sysctl.h>
 #include <linux/hash.h>
+#include <linux/ctype.h>
 #include <linux/list.h>
 
 #include "trace.h"
@@ -151,12 +154,15 @@ enum {
 	FTRACE_DISABLE_MCOUNT		= (1 << 4),
 };
 
+static int ftrace_filtered;
+
 static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
 
 static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
 
 static DEFINE_SPINLOCK(ftrace_shutdown_lock);
 static DEFINE_MUTEX(ftraced_lock);
+static DEFINE_MUTEX(ftrace_filter_lock);
 
 struct ftrace_page {
 	struct ftrace_page	*next;
@@ -282,16 +288,82 @@ ftrace_record_ip(unsigned long ip)
 #define FTRACE_ADDR ((long)(&ftrace_caller))
 #define MCOUNT_ADDR ((long)(&mcount))
 
-static void notrace ftrace_replace_code(int saved)
+static void notrace
+__ftrace_replace_code(struct dyn_ftrace *rec,
+		      unsigned char *old, unsigned char *new, int enable)
+{
+	unsigned long ip;
+	int failed;
+
+	ip = rec->ip;
+
+	if (ftrace_filtered && enable) {
+		unsigned long fl;
+		/*
+		 * If filtering is on:
+		 *
+		 * If this record is set to be filtered and
+		 * is enabled then do nothing.
+		 *
+		 * If this record is set to be filtered and
+		 * it is not enabled, enable it.
+		 *
+		 * If this record is not set to be filtered
+		 * and it is not enabled do nothing.
+		 *
+		 * If this record is not set to be filtered and
+		 * it is enabled, disable it.
+		 */
+		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
+
+		if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
+		    (fl == 0))
+			return;
+
+		/*
+		 * If it is enabled disable it,
+		 * otherwise enable it!
+		 */
+		if (fl == FTRACE_FL_ENABLED) {
+			/* swap new and old */
+			new = old;
+			old = ftrace_call_replace(ip, FTRACE_ADDR);
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		} else {
+			new = ftrace_call_replace(ip, FTRACE_ADDR);
+			rec->flags |= FTRACE_FL_ENABLED;
+		}
+	} else {
+
+		if (enable)
+			new = ftrace_call_replace(ip, FTRACE_ADDR);
+		else
+			old = ftrace_call_replace(ip, FTRACE_ADDR);
+
+		if (enable) {
+			if (rec->flags & FTRACE_FL_ENABLED)
+				return;
+			rec->flags |= FTRACE_FL_ENABLED;
+		} else {
+			if (!(rec->flags & FTRACE_FL_ENABLED))
+				return;
+			rec->flags &= ~FTRACE_FL_ENABLED;
+		}
+	}
+
+	failed = ftrace_modify_code(ip, old, new);
+	if (failed)
+		rec->flags |= FTRACE_FL_FAILED;
+}
+
+static void notrace ftrace_replace_code(int enable)
 {
 	unsigned char *new = NULL, *old = NULL;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
-	unsigned long ip;
-	int failed;
 	int i;
 
-	if (saved)
+	if (enable)
 		old = ftrace_nop_replace();
 	else
 		new = ftrace_nop_replace();
@@ -304,16 +376,7 @@ static void notrace ftrace_replace_code(int saved)
 			if (rec->flags & FTRACE_FL_FAILED)
 				continue;
 
-			ip = rec->ip;
-
-			if (saved)
-				new = ftrace_call_replace(ip, FTRACE_ADDR);
-			else
-				old = ftrace_call_replace(ip, FTRACE_ADDR);
-
-			failed = ftrace_modify_code(ip, old, new);
-			if (failed)
-				rec->flags |= FTRACE_FL_FAILED;
+			__ftrace_replace_code(rec, old, new, enable);
 		}
 	}
 }
@@ -580,6 +643,436 @@ static int __init ftrace_dyn_table_alloc(void)
 	return 0;
 }
 
+enum {
+	FTRACE_ITER_FILTER	= (1 << 0),
+	FTRACE_ITER_CONT	= (1 << 1),
+};
+
+#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
+
+struct ftrace_iterator {
+	loff_t			pos;
+	struct ftrace_page	*pg;
+	unsigned		idx;
+	unsigned		flags;
+	unsigned char		buffer[FTRACE_BUFF_MAX+1];
+	unsigned		buffer_idx;
+	unsigned		filtered;
+};
+
+static void notrace *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	struct dyn_ftrace *rec = NULL;
+
+	(*pos)++;
+
+ retry:
+	if (iter->idx >= iter->pg->index) {
+		if (iter->pg->next) {
+			iter->pg = iter->pg->next;
+			iter->idx = 0;
+			goto retry;
+		}
+	} else {
+		rec = &iter->pg->records[iter->idx++];
+		if ((rec->flags & FTRACE_FL_FAILED) ||
+		    ((iter->flags & FTRACE_ITER_FILTER) &&
+		     !(rec->flags & FTRACE_FL_FILTER))) {
+			rec = NULL;
+			goto retry;
+		}
+	}
+
+	iter->pos = *pos;
+
+	return rec;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l = -1;
+
+	if (*pos != iter->pos) {
+		for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
+			;
+	} else {
+		l = *pos;
+		p = t_next(m, p, &l);
+	}
+
+	return p;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct dyn_ftrace *rec = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	if (!rec)
+		return 0;
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+	seq_printf(m, "%s\n", str);
+
+	return 0;
+}
+
+static struct seq_operations show_ftrace_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.stop = t_stop,
+	.show = t_show,
+};
+
+static int notrace
+ftrace_avail_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_iterator *iter;
+	int ret;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	iter->pg = ftrace_pages_start;
+	iter->pos = -1;
+
+	ret = seq_open(file, &show_ftrace_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = iter;
+	} else
+		kfree(iter);
+
+	return ret;
+}
+
+int ftrace_avail_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ftrace_iterator *iter = m->private;
+
+	seq_release(inode, file);
+	kfree(iter);
+	return 0;
+}
+
+static void notrace ftrace_filter_reset(void)
+{
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	unsigned i;
+
+	/* keep kstop machine from running */
+	preempt_disable();
+	ftrace_filtered = 0;
+	pg = ftrace_pages_start;
+	while (pg) {
+		for (i = 0; i < pg->index; i++) {
+			rec = &pg->records[i];
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+			rec->flags &= ~FTRACE_FL_FILTER;
+		}
+		pg = pg->next;
+	}
+	preempt_enable();
+}
+
+static int notrace
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_iterator *iter;
+	int ret = 0;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	mutex_lock(&ftrace_filter_lock);
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND))
+		ftrace_filter_reset();
+
+	if (file->f_mode & FMODE_READ) {
+		iter->pg = ftrace_pages_start;
+		iter->pos = -1;
+		iter->flags = FTRACE_ITER_FILTER;
+
+		ret = seq_open(file, &show_ftrace_seq_ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = iter;
+		} else
+			kfree(iter);
+	} else
+		file->private_data = iter;
+	mutex_unlock(&ftrace_filter_lock);
+
+	return ret;
+}
+
+static ssize_t notrace
+ftrace_filter_read(struct file *file, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	if (file->f_mode & FMODE_READ)
+		return seq_read(file, ubuf, cnt, ppos);
+	else
+		return -EPERM;
+}
+
+static loff_t notrace
+ftrace_filter_lseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t ret;
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_lseek(file, offset, origin);
+	else
+		file->f_pos = ret = 1;
+
+	return ret;
+}
+
+enum {
+	MATCH_FULL,
+	MATCH_FRONT_ONLY,
+	MATCH_MIDDLE_ONLY,
+	MATCH_END_ONLY,
+};
+
+static void notrace
+ftrace_match(unsigned char *buff, int len)
+{
+	char str[KSYM_SYMBOL_LEN];
+	char *search = NULL;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	int type = MATCH_FULL;
+	unsigned i, match = 0, search_len = 0;
+
+	for (i = 0; i < len; i++) {
+		if (buff[i] == '*') {
+			if (!i) {
+				search = buff + i + 1;
+				type = MATCH_END_ONLY;
+				search_len = len - (i + 1);
+			} else {
+				if (type == MATCH_END_ONLY) {
+					type = MATCH_MIDDLE_ONLY;
+				} else {
+					match = i;
+					type = MATCH_FRONT_ONLY;
+				}
+				buff[i] = 0;
+				break;
+			}
+		}
+	}
+
+	/* keep kstop machine from running */
+	preempt_disable();
+	ftrace_filtered = 1;
+	pg = ftrace_pages_start;
+	while (pg) {
+		for (i = 0; i < pg->index; i++) {
+			int matched = 0;
+			char *ptr;
+
+			rec = &pg->records[i];
+			if (rec->flags & FTRACE_FL_FAILED)
+				continue;
+			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+			switch (type) {
+			case MATCH_FULL:
+				if (strcmp(str, buff) == 0)
+					matched = 1;
+				break;
+			case MATCH_FRONT_ONLY:
+				if (memcmp(str, buff, match) == 0)
+					matched = 1;
+				break;
+			case MATCH_MIDDLE_ONLY:
+				if (strstr(str, search))
+					matched = 1;
+				break;
+			case MATCH_END_ONLY:
+				ptr = strstr(str, search);
+				if (ptr && (ptr[search_len] == 0))
+					matched = 1;
+				break;
+			}
+			if (matched)
+				rec->flags |= FTRACE_FL_FILTER;
+		}
+		pg = pg->next;
+	}
+	preempt_enable();
+}
+
+static ssize_t notrace
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	struct ftrace_iterator *iter;
+	char ch;
+	size_t read = 0;
+	ssize_t ret;
+
+	if (!cnt || cnt < 0)
+		return 0;
+
+	mutex_lock(&ftrace_filter_lock);
+
+	if (file->f_mode & FMODE_READ) {
+		struct seq_file *m = file->private_data;
+		iter = m->private;
+	} else
+		iter = file->private_data;
+
+	if (!*ppos) {
+		iter->flags &= ~FTRACE_ITER_CONT;
+		iter->buffer_idx = 0;
+	}
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		goto out;
+	read++;
+	cnt--;
+
+	if (!(iter->flags & ~FTRACE_ITER_CONT)) {
+		/* skip white space */
+		while (cnt && isspace(ch)) {
+			ret = get_user(ch, ubuf++);
+			if (ret)
+				goto out;
+			read++;
+			cnt--;
+		}
+
+
+		if (isspace(ch)) {
+			file->f_pos += read;
+			ret = read;
+			goto out;
+		}
+
+		iter->buffer_idx = 0;
+	}
+
+	while (cnt && !isspace(ch)) {
+		if (iter->buffer_idx < FTRACE_BUFF_MAX)
+			iter->buffer[iter->buffer_idx++] = ch;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+
+	if (isspace(ch)) {
+		iter->filtered++;
+		iter->buffer[iter->buffer_idx] = 0;
+		ftrace_match(iter->buffer, iter->buffer_idx);
+		iter->buffer_idx = 0;
+	} else
+		iter->flags |= FTRACE_ITER_CONT;
+
+
+	file->f_pos += read;
+
+	ret = read;
+ out:
+	mutex_unlock(&ftrace_filter_lock);
+
+	return ret;
+}
+
+static int notrace
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = (struct seq_file *)file->private_data;
+	struct ftrace_iterator *iter;
+
+	mutex_lock(&ftrace_filter_lock);
+	if (file->f_mode & FMODE_READ) {
+		iter = m->private;
+
+		seq_release(inode, file);
+	} else
+		iter = file->private_data;
+
+	if (iter->buffer_idx) {
+		iter->filtered++;
+		iter->buffer[iter->buffer_idx] = 0;
+		ftrace_match(iter->buffer, iter->buffer_idx);
+	}
+
+	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftraced_lock);
+	if (iter->filtered && ftraced_suspend && ftrace_enabled)
+		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+	mutex_unlock(&ftraced_lock);
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	kfree(iter);
+	mutex_unlock(&ftrace_filter_lock);
+	return 0;
+}
+
+static struct file_operations ftrace_avail_fops = {
+	.open = ftrace_avail_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = ftrace_avail_release,
+};
+
+static struct file_operations ftrace_filter_fops = {
+	.open = ftrace_filter_open,
+	.read = ftrace_filter_read,
+	.write = ftrace_filter_write,
+	.llseek = ftrace_filter_lseek,
+	.release = ftrace_filter_release,
+};
+
+static __init int ftrace_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("available_filter_functions", 0444,
+				    d_tracer, NULL, &ftrace_avail_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'available_filter_functions' entry\n");
+
+	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
+				    NULL, &ftrace_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_filter' entry\n");
+	return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
+
 static int __init notrace ftrace_dynamic_init(void)
 {
 	struct task_struct *p;
@@ -657,14 +1150,14 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
 notrace int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
-		     struct file *filp, void __user *buffer, size_t *lenp,
+		     struct file *file, void __user *buffer, size_t *lenp,
 		     loff_t *ppos)
 {
 	int ret;
 
 	mutex_lock(&ftrace_sysctl_lock);
 
-	ret  = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
 
 	if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
 		goto out;
-- 
cgit v1.2.3


From 4c11d7aed389375253b59e2b1865eec96663c65d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:43 +0200
Subject: ftrace: convert single large buffer into single pages.

Allocating large buffers for the tracer may fail easily.
This patch converts the buffer from a large ordered allocation
to single pages. It uses the struct page LRU field to link the
pages together.

Later patches may also implement dynamic increasing and decreasing
of the trace buffers.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 245 +++++++++++++++++++++++++++++++++++++++------------
 kernel/trace/trace.h |   8 +-
 2 files changed, 195 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1b8eca7650d4..d7ad030a4c49 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -15,6 +15,7 @@
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include <linux/pagemap.h>
 #include <linux/hardirq.h>
 #include <linux/linkage.h>
 #include <linux/uaccess.h>
@@ -49,7 +50,7 @@ static struct trace_array	max_tr;
 static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
 
 static int			tracer_enabled;
-static unsigned long		trace_nr_entries = 4096UL;
+static unsigned long		trace_nr_entries = 16384UL;
 
 static struct tracer		*trace_types __read_mostly;
 static struct tracer		*current_trace __read_mostly;
@@ -57,6 +58,8 @@ static int			max_tracer_type_len;
 
 static DEFINE_MUTEX(trace_types_lock);
 
+#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
+
 static int __init set_nr_entries(char *str)
 {
 	if (!str)
@@ -103,6 +106,7 @@ static const char *trace_options[] = {
 
 static unsigned trace_flags;
 
+static DEFINE_SPINLOCK(ftrace_max_lock);
 
 /*
  * Copy the new maximum trace into the separate maximum-trace
@@ -136,17 +140,23 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct trace_array_cpu *data;
 	void *save_trace;
+	struct list_head save_pages;
 	int i;
 
+	WARN_ON_ONCE(!irqs_disabled());
+	spin_lock(&ftrace_max_lock);
 	/* clear out all the previous traces */
 	for_each_possible_cpu(i) {
 		data = tr->data[i];
 		save_trace = max_tr.data[i]->trace;
+		save_pages = max_tr.data[i]->trace_pages;
 		memcpy(max_tr.data[i], data, sizeof(*data));
 		data->trace = save_trace;
+		data->trace_pages = save_pages;
 	}
 
 	__update_max_tr(tr, tsk, cpu);
+	spin_unlock(&ftrace_max_lock);
 }
 
 /**
@@ -160,16 +170,22 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct trace_array_cpu *data = tr->data[cpu];
 	void *save_trace;
+	struct list_head save_pages;
 	int i;
 
+	WARN_ON_ONCE(!irqs_disabled());
+	spin_lock(&ftrace_max_lock);
 	for_each_possible_cpu(i)
 		tracing_reset(max_tr.data[i]);
 
 	save_trace = max_tr.data[cpu]->trace;
+	save_pages = max_tr.data[cpu]->trace_pages;
 	memcpy(max_tr.data[cpu], data, sizeof(*data));
 	data->trace = save_trace;
+	data->trace_pages = save_pages;
 
 	__update_max_tr(tr, tsk, cpu);
+	spin_unlock(&ftrace_max_lock);
 }
 
 int register_tracer(struct tracer *type)
@@ -236,7 +252,8 @@ void unregister_tracer(struct tracer *type)
 void notrace tracing_reset(struct trace_array_cpu *data)
 {
 	data->trace_idx = 0;
-	atomic_set(&data->underrun, 0);
+	data->trace_current = data->trace;
+	data->trace_current_idx = 0;
 }
 
 #ifdef CONFIG_FTRACE
@@ -367,21 +384,27 @@ tracing_get_trace_entry(struct trace_array *tr,
 {
 	unsigned long idx, idx_next;
 	struct trace_entry *entry;
+	struct page *page;
+	struct list_head *next;
 
-	idx = data->trace_idx;
+	data->trace_idx++;
+	idx = data->trace_current_idx;
 	idx_next = idx + 1;
 
-	if (unlikely(idx_next >= tr->entries)) {
-		atomic_inc(&data->underrun);
+	entry = data->trace_current + idx * TRACE_ENTRY_SIZE;
+
+	if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
+		page = virt_to_page(data->trace_current);
+		if (unlikely(&page->lru == data->trace_pages.prev))
+			next = data->trace_pages.next;
+		else
+			next = page->lru.next;
+		page = list_entry(next, struct page, lru);
+		data->trace_current = page_address(page);
 		idx_next = 0;
 	}
 
-	data->trace_idx = idx_next;
-
-	if (unlikely(idx_next != 0 && atomic_read(&data->underrun)))
-		atomic_inc(&data->underrun);
-
-	entry = data->trace + idx * TRACE_ENTRY_SIZE;
+	data->trace_current_idx = idx_next;
 
 	return entry;
 }
@@ -442,21 +465,38 @@ enum trace_file_type {
 };
 
 static struct trace_entry *
-trace_entry_idx(struct trace_array *tr, unsigned long idx, int cpu)
+trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
+		struct trace_iterator *iter, int cpu)
 {
-	struct trace_entry *array = tr->data[cpu]->trace;
-	unsigned long underrun;
+	struct page *page;
+	struct trace_entry *array;
 
-	if (idx >= tr->entries)
+	if (iter->next_idx[cpu] >= tr->entries ||
+	    iter->next_idx[cpu] >= data->trace_idx)
 		return NULL;
 
-	underrun = atomic_read(&tr->data[cpu]->underrun);
-	if (underrun)
-		idx = ((underrun - 1) + idx) % tr->entries;
-	else if (idx >= tr->data[cpu]->trace_idx)
-		return NULL;
+	if (!iter->next_page[cpu]) {
+		/*
+		 * Initialize. If the count of elements in
+		 * this buffer is greater than the max entries
+		 * we had an underrun. Which means we looped around.
+		 * We can simply use the current pointer as our
+		 * starting point.
+		 */
+		if (data->trace_idx >= tr->entries) {
+			page = virt_to_page(data->trace_current);
+			iter->next_page[cpu] = &page->lru;
+			iter->next_page_idx[cpu] = data->trace_current_idx;
+		} else {
+			iter->next_page[cpu] = data->trace_pages.next;
+			iter->next_page_idx[cpu] = 0;
+		}
+	}
 
-	return &array[idx];
+	page = list_entry(iter->next_page[cpu], struct page, lru);
+	array = page_address(page);
+
+	return &array[iter->next_page_idx[cpu]];
 }
 
 static struct notrace trace_entry *
@@ -470,7 +510,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 	for_each_possible_cpu(cpu) {
 		if (!tr->data[cpu]->trace)
 			continue;
-		ent = trace_entry_idx(tr, iter->next_idx[cpu], cpu);
+		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
 		if (ent &&
 		    (!next || (long)(next->idx - ent->idx) > 0)) {
 			next = ent;
@@ -492,8 +532,19 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
 	next = find_next_entry(iter, &next_cpu);
 
 	if (next) {
-		iter->next_idx[next_cpu]++;
 		iter->idx++;
+		iter->next_idx[next_cpu]++;
+		iter->next_page_idx[next_cpu]++;
+		if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) {
+			struct trace_array_cpu *data = iter->tr->data[next_cpu];
+
+			iter->next_page_idx[next_cpu] = 0;
+			iter->next_page[next_cpu] =
+				iter->next_page[next_cpu]->next;
+			if (iter->next_page[next_cpu] == &data->trace_pages)
+				iter->next_page[next_cpu] =
+					data->trace_pages.next;
+		}
 	}
 	iter->ent = next;
 	iter->cpu = next_cpu;
@@ -554,14 +605,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		iter->cpu = 0;
 		iter->idx = -1;
 
-		for (i = 0; i < NR_CPUS; i++)
+		for_each_possible_cpu(i) {
 			iter->next_idx[i] = 0;
+			iter->next_page[i] = NULL;
+		}
 
 		for (p = iter; p && l < *pos; p = s_next(m, p, &l))
 			;
 
 	} else {
-		l = *pos;
+		l = *pos - 1;
 		p = s_next(m, p, &l);
 	}
 
@@ -654,9 +707,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	struct trace_array *tr = iter->tr;
 	struct trace_array_cpu *data = tr->data[tr->cpu];
 	struct tracer *type = current_trace;
-	unsigned long underruns = 0;
-	unsigned long underrun;
-	unsigned long entries   = 0;
+	unsigned long total   = 0;
+	unsigned long entries = 0;
 	int cpu;
 	const char *name = "preemption";
 
@@ -665,11 +717,10 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 
 	for_each_possible_cpu(cpu) {
 		if (tr->data[cpu]->trace) {
-			underrun = atomic_read(&tr->data[cpu]->underrun);
-			if (underrun) {
-				underruns += underrun;
+			total += tr->data[cpu]->trace_idx;
+			if (tr->data[cpu]->trace_idx > tr->entries)
 				entries += tr->entries;
-			} else
+			else
 				entries += tr->data[cpu]->trace_idx;
 		}
 	}
@@ -682,7 +733,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 		   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
 		   data->saved_latency,
 		   entries,
-		   (entries + underruns),
+		   total,
 		   tr->cpu,
 #if defined(CONFIG_PREEMPT_NONE)
 		   "server",
@@ -882,8 +933,7 @@ static int trace_empty(struct trace_iterator *iter)
 		data = iter->tr->data[cpu];
 
 		if (data->trace &&
-		    (data->trace_idx ||
-		     atomic_read(&data->underrun)))
+		    data->trace_idx)
 			return 0;
 	}
 	return 1;
@@ -1464,42 +1514,109 @@ static struct tracer no_tracer __read_mostly =
 	.name = "none",
 };
 
-static inline notrace int page_order(const unsigned long size)
+static int trace_alloc_page(void)
 {
-	const unsigned long nr_pages = DIV_ROUND_UP(size, PAGE_SIZE);
-	return ilog2(roundup_pow_of_two(nr_pages));
+	struct trace_array_cpu *data;
+	void *array;
+	struct page *page, *tmp;
+	LIST_HEAD(pages);
+	int i;
+
+	/* first allocate a page for each CPU */
+	for_each_possible_cpu(i) {
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+		array = (void *)__get_free_page(GFP_KERNEL);
+		if (array == NULL) {
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
+			goto free_pages;
+		}
+		page = virt_to_page(array);
+		list_add(&page->lru, &pages);
+#endif
+	}
+
+	/* Now that we successfully allocate a page per CPU, add them */
+	for_each_possible_cpu(i) {
+		data = global_trace.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		ClearPageLRU(page);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		page = list_entry(pages.next, struct page, lru);
+		list_del(&page->lru);
+		list_add_tail(&page->lru, &data->trace_pages);
+		SetPageLRU(page);
+#endif
+	}
+	global_trace.entries += ENTRIES_PER_PAGE;
+
+	return 0;
+
+ free_pages:
+	list_for_each_entry_safe(page, tmp, &pages, lru) {
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	return -ENOMEM;
 }
 
 __init static int tracer_alloc_buffers(void)
 {
-	const int order = page_order(trace_nr_entries * TRACE_ENTRY_SIZE);
-	const unsigned long size = (1UL << order) << PAGE_SHIFT;
-	struct trace_entry *array;
+	struct trace_array_cpu *data;
+	void *array;
+	struct page *page;
+	int pages = 0;
 	int i;
 
+	/* Allocate the first page for all buffers */
 	for_each_possible_cpu(i) {
-		global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
 		max_tr.data[i] = &per_cpu(max_data, i);
 
-		array = (struct trace_entry *)
-			  __get_free_pages(GFP_KERNEL, order);
+		array = (void *)__get_free_page(GFP_KERNEL);
 		if (array == NULL) {
-			printk(KERN_ERR "tracer: failed to allocate"
-			       " %ld bytes for trace buffer!\n", size);
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
 			goto free_buffers;
 		}
-		global_trace.data[i]->trace = array;
+		data->trace = array;
+
+		/* set the array to the list */
+		INIT_LIST_HEAD(&data->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &data->trace_pages);
+		/* use the LRU flag to differentiate the two buffers */
+		ClearPageLRU(page);
 
 /* Only allocate if we are actually using the max trace */
 #ifdef CONFIG_TRACER_MAX_TRACE
-		array = (struct trace_entry *)
-			  __get_free_pages(GFP_KERNEL, order);
+		array = (void *)__get_free_page(GFP_KERNEL);
 		if (array == NULL) {
-			printk(KERN_ERR "wakeup tracer: failed to allocate"
-			       " %ld bytes for trace buffer!\n", size);
+			printk(KERN_ERR "tracer: failed to allocate page"
+			       "for trace buffer!\n");
 			goto free_buffers;
 		}
 		max_tr.data[i]->trace = array;
+
+		INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
+		page = virt_to_page(array);
+		list_add(&page->lru, &max_tr.data[i]->trace_pages);
+		SetPageLRU(page);
 #endif
 	}
 
@@ -1507,11 +1624,18 @@ __init static int tracer_alloc_buffers(void)
 	 * Since we allocate by orders of pages, we may be able to
 	 * round up a bit.
 	 */
-	global_trace.entries = size / TRACE_ENTRY_SIZE;
+	global_trace.entries = ENTRIES_PER_PAGE;
 	max_tr.entries = global_trace.entries;
+	pages++;
+
+	while (global_trace.entries < trace_nr_entries) {
+		if (trace_alloc_page())
+			break;
+		pages++;
+	}
 
-	pr_info("tracer: %ld bytes allocated for %ld",
-		size, trace_nr_entries);
+	pr_info("tracer: %d pages allocated for %ld",
+		pages, trace_nr_entries);
 	pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE);
 	pr_info("   actual entries %ld\n", global_trace.entries);
 
@@ -1526,17 +1650,26 @@ __init static int tracer_alloc_buffers(void)
 
  free_buffers:
 	for (i-- ; i >= 0; i--) {
+		struct page *page, *tmp;
 		struct trace_array_cpu *data = global_trace.data[i];
 
 		if (data && data->trace) {
-			free_pages((unsigned long)data->trace, order);
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del(&page->lru);
+				__free_page(page);
+			}
 			data->trace = NULL;
 		}
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 		data = max_tr.data[i];
 		if (data && data->trace) {
-			free_pages((unsigned long)data->trace, order);
+			list_for_each_entry_safe(page, tmp,
+						 &data->trace_pages, lru) {
+				list_del(&page->lru);
+				__free_page(page);
+			}
 			data->trace = NULL;
 		}
 #endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3173a93561d4..83e257e38084 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -54,9 +54,11 @@ struct trace_entry {
  */
 struct trace_array_cpu {
 	void			*trace;
+	void			*trace_current;
+	unsigned		trace_current_idx;
+	struct list_head	trace_pages;
 	unsigned long		trace_idx;
 	atomic_t		disabled;
-	atomic_t		underrun;
 	unsigned long		saved_latency;
 	unsigned long		critical_start;
 	unsigned long		critical_end;
@@ -112,8 +114,10 @@ struct trace_iterator {
 	unsigned long		iter_flags;
 	loff_t			pos;
 	unsigned long		next_idx[NR_CPUS];
+	struct list_head	*next_page[NR_CPUS];
+	unsigned		next_page_idx[NR_CPUS];
+	long			idx;
 	int			cpu;
-	int			idx;
 };
 
 void notrace tracing_reset(struct trace_array_cpu *data);
-- 
cgit v1.2.3


From 361943ad0ba3f16e66859e30a408915e008ba91e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:44 +0200
Subject: ftrace: irqs off smp_processor_id() fix

The irqsoff function tracer did a __get_cpu_var to determine
if it should trace the function or not. The problem is that
__get_cpu_var can preempt between getting the CPU and reading
the cpu variable. This means that the cpu variable that is
being read is not from the cpu being run on.

At worst, this can give a false positive, where we trace the
function when we should not.  It will never give a false negative
since we only want to trace when interrupts are disabled
and we never preempt when they are.

This fix adds a check after reading the irq flags to only
trace if the interrupts are actually disabled. It also changes
the reading of the cpu variable to use a raw_smp_processor_id
since we now don't care if we preempt. We still catch that fact.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_irqsoff.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 8b1231633dc5..bd3f88198308 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -74,12 +74,21 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	long disabled;
 	int cpu;
 
-	if (likely(!__get_cpu_var(tracing_cpu)))
+	/*
+	 * Does not matter if we preempt. We test the flags
+	 * afterward, to see if irqs are disabled or not.
+	 * If we preempt and get a false positive, the flags
+	 * test will fail.
+	 */
+	cpu = raw_smp_processor_id();
+	if (likely(!per_cpu(tracing_cpu, cpu)))
 		return;
 
 	local_save_flags(flags);
+	/* slight chance to get a false positive on tracing_cpu */
+	if (!irqs_disabled_flags(flags))
+		return;
 
-	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-- 
cgit v1.2.3


From 0764d23cf066c52de42b653144605b481d3fbdbc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:44 +0200
Subject: ftrace: lockdep notrace annotations

Add notrace annotations to lockdep to keep ftrace from causing
recursive problems with lock tracing and debugging.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/lockdep.c  | 23 ++++++++++++-----------
 kernel/spinlock.c |  2 +-
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e21924365ea3..ac46847ba0c9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -271,14 +271,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
 	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))
 
-void lockdep_off(void)
+notrace void lockdep_off(void)
 {
 	current->lockdep_recursion++;
 }
 
 EXPORT_SYMBOL(lockdep_off);
 
-void lockdep_on(void)
+notrace void lockdep_on(void)
 {
 	current->lockdep_recursion--;
 }
@@ -1041,7 +1041,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
  * Return 1 otherwise and keep <backwards_match> unchanged.
  * Return 0 on error.
  */
-static noinline int
+static noinline notrace int
 find_usage_backwards(struct lock_class *source, unsigned int depth)
 {
 	struct lock_list *entry;
@@ -1591,7 +1591,7 @@ static inline int validate_chain(struct task_struct *curr,
  * We are building curr_chain_key incrementally, so double-check
  * it from scratch, to make sure that it's done correctly:
  */
-static void check_chain_key(struct task_struct *curr)
+static notrace void check_chain_key(struct task_struct *curr)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
 	struct held_lock *hlock, *prev_hlock = NULL;
@@ -1967,7 +1967,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 /*
  * Mark all held locks with a usage bit:
  */
-static int
+static notrace int
 mark_held_locks(struct task_struct *curr, int hardirq)
 {
 	enum lock_usage_bit usage_bit;
@@ -2260,8 +2260,8 @@ static inline int separate_irq_context(struct task_struct *curr,
 /*
  * Mark a lock with a usage bit, and validate the state transition:
  */
-static int mark_lock(struct task_struct *curr, struct held_lock *this,
-		     enum lock_usage_bit new_bit)
+static notrace int mark_lock(struct task_struct *curr, struct held_lock *this,
+			     enum lock_usage_bit new_bit)
 {
 	unsigned int new_mask = 1 << new_bit, ret = 1;
 
@@ -2663,7 +2663,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 /*
  * Check whether we follow the irq-flags state precisely:
  */
-static void check_flags(unsigned long flags)
+static notrace void check_flags(unsigned long flags)
 {
 #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
 	if (!debug_locks)
@@ -2700,8 +2700,8 @@ static void check_flags(unsigned long flags)
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
  */
-void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
-		  int trylock, int read, int check, unsigned long ip)
+notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+			  int trylock, int read, int check, unsigned long ip)
 {
 	unsigned long flags;
 
@@ -2723,7 +2723,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+notrace void lock_release(struct lockdep_map *lock, int nested,
+			  unsigned long ip)
 {
 	unsigned long flags;
 
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index ae28c8245123..a1fb54c93cdd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 }
 EXPORT_SYMBOL(_spin_trylock_bh);
 
-int in_lock_functions(unsigned long addr)
+notrace int in_lock_functions(unsigned long addr)
 {
 	/* Linker adds these: start and end of __lockfunc functions */
 	extern char __lock_text_start[], __lock_text_end[];
-- 
cgit v1.2.3


From 18cef379d30f5ded20cc31d7f2d342639d39919d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:44 +0200
Subject: ftrace: don't use raw_local_irq_save/restore

Using raw_local_irq_save/restore confuses lockdep.
It's fine to use the normal ones.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c              | 4 ++--
 kernel/trace/trace_sched_switch.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7ad030a4c49..9175ce91b8f6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -269,7 +269,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	if (unlikely(!tracer_enabled))
 		return;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
@@ -278,7 +278,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 		ftrace(tr, data, ip, parent_ip, flags);
 
 	atomic_dec(&data->disabled);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 
 static struct ftrace_ops trace_ops __read_mostly =
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3e4771d3b890..2715267be469 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -29,7 +29,7 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next)
 	if (!tracer_enabled)
 		return;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
@@ -38,7 +38,7 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next)
 		tracing_sched_switch_trace(tr, data, prev, next, flags);
 
 	atomic_dec(&data->disabled);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 }
 
 void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
-- 
cgit v1.2.3


From 89b2f97819dd074297bbe3e19eaa4afcc98845ad Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:44 +0200
Subject: ftrace: fix updates to max trace

This patch fixes some bugs to the updating of the max trace that
was caused by implementing the new buffering.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c         |  6 +++++-
 kernel/trace/trace_irqsoff.c | 27 +++++++++++++++------------
 2 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9175ce91b8f6..95966561ba3d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -153,6 +153,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		memcpy(max_tr.data[i], data, sizeof(*data));
 		data->trace = save_trace;
 		data->trace_pages = save_pages;
+		tracing_reset(data);
 	}
 
 	__update_max_tr(tr, tsk, cpu);
@@ -183,6 +184,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	memcpy(max_tr.data[cpu], data, sizeof(*data));
 	data->trace = save_trace;
 	data->trace_pages = save_pages;
+	tracing_reset(data);
 
 	__update_max_tr(tr, tsk, cpu);
 	spin_unlock(&ftrace_max_lock);
@@ -877,6 +879,8 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter,
 			   entry->ctx.next_prio,
 			   comm);
 		break;
+	default:
+		seq_printf(m, "Unknown type %d\n", entry->type);
 	}
 }
 
@@ -1625,7 +1629,6 @@ __init static int tracer_alloc_buffers(void)
 	 * round up a bit.
 	 */
 	global_trace.entries = ENTRIES_PER_PAGE;
-	max_tr.entries = global_trace.entries;
 	pages++;
 
 	while (global_trace.entries < trace_nr_entries) {
@@ -1633,6 +1636,7 @@ __init static int tracer_alloc_buffers(void)
 			break;
 		pages++;
 	}
+	max_tr.entries = global_trace.entries;
 
 	pr_info("tracer: %d pages allocated for %ld",
 		pages, trace_nr_entries);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index bd3f88198308..74165f611f36 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -23,6 +23,8 @@ static int				tracer_enabled __read_mostly;
 
 static DEFINE_PER_CPU(int, tracing_cpu);
 
+static DEFINE_SPINLOCK(max_trace_lock);
+
 enum {
 	TRACER_IRQS_OFF		= (1 << 1),
 	TRACER_PREEMPT_OFF	= (1 << 2),
@@ -126,7 +128,7 @@ check_critical_timing(struct trace_array *tr,
 		      int cpu)
 {
 	unsigned long latency, t0, t1;
-	cycle_t T0, T1, T2, delta;
+	cycle_t T0, T1, delta;
 	unsigned long flags;
 
 	/*
@@ -142,20 +144,18 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(delta))
 		goto out;
 
-	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
-	/*
-	 * Update the timestamp, because the trace entry above
-	 * might change it (it can only get larger so the latency
-	 * is fair to be reported):
-	 */
-	T2 = now(cpu);
+	spin_lock(&max_trace_lock);
 
-	delta = T2-T0;
+	/* check if we are still the max latency */
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
 
 	latency = nsecs_to_usecs(delta);
 
 	if (data->critical_sequence != max_sequence)
-		goto out;
+		goto out_unlock;
 
 	tracing_max_latency = delta;
 	t0 = nsecs_to_usecs(T0);
@@ -189,6 +189,9 @@ check_critical_timing(struct trace_array *tr,
 
 	max_sequence++;
 
+out_unlock:
+	spin_unlock(&max_trace_lock);
+
 out:
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = now(cpu);
@@ -366,14 +369,14 @@ void notrace trace_preempt_off(unsigned long a0, unsigned long a1)
 
 static void start_irqsoff_tracer(struct trace_array *tr)
 {
-	tracer_enabled = 1;
 	register_ftrace_function(&trace_ops);
+	tracer_enabled = 1;
 }
 
 static void stop_irqsoff_tracer(struct trace_array *tr)
 {
-	unregister_ftrace_function(&trace_ops);
 	tracer_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
 }
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
-- 
cgit v1.2.3


From 57f50be14d57b0dbf88dd019e7bb0ff3a3dc7b81 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:44 +0200
Subject: ftrace: fix max latency

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 95966561ba3d..9bad2379115a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -69,6 +69,11 @@ static int __init set_nr_entries(char *str)
 }
 __setup("trace_entries=", set_nr_entries);
 
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+	return nsecs / 1000;
+}
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -733,7 +738,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 		 "---------------------------------\n");
 	seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
 		   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
-		   data->saved_latency,
+		   nsecs_to_usecs(data->saved_latency),
 		   entries,
 		   total,
 		   tr->cpu,
@@ -771,11 +776,6 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	seq_puts(m, "\n");
 }
 
-unsigned long nsecs_to_usecs(unsigned long nsecs)
-{
-	return nsecs / 1000;
-}
-
 static void notrace
 lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu)
 {
-- 
cgit v1.2.3


From e1c08bdd9fa73e44096e5a82c0d5928b04ab02c8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:44 +0200
Subject: ftrace: force recording

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 97d5cb7b7e75..4facf5ceeb86 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -146,6 +146,10 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+static struct task_struct *ftraced_task;
+static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters);
+static unsigned long ftraced_iteration_counter;
+
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -590,9 +594,12 @@ static int notrace ftraced(void *ignore)
 			ftraced_trigger = 0;
 			ftrace_record_suspend--;
 		}
+		ftraced_iteration_counter++;
 		mutex_unlock(&ftraced_lock);
 		mutex_unlock(&ftrace_sysctl_lock);
 
+		wake_up_interruptible(&ftraced_waiters);
+
 		ftrace_shutdown_replenish();
 
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -1050,6 +1057,49 @@ static struct file_operations ftrace_filter_fops = {
 	.release = ftrace_filter_release,
 };
 
+/**
+ * ftrace_force_update - force an update to all recording ftrace functions
+ *
+ * The ftrace dynamic update daemon only wakes up once a second.
+ * There may be cases where an update needs to be done immediately
+ * for tests or internal kernel tracing to begin. This function
+ * wakes the daemon to do an update and will not return until the
+ * update is complete.
+ */
+int ftrace_force_update(void)
+{
+	unsigned long last_counter;
+	DECLARE_WAITQUEUE(wait, current);
+	int ret = 0;
+
+	if (!ftraced_task)
+		return -ENODEV;
+
+	mutex_lock(&ftraced_lock);
+	last_counter = ftraced_iteration_counter;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&ftraced_waiters, &wait);
+
+	do {
+		mutex_unlock(&ftraced_lock);
+		wake_up_process(ftraced_task);
+		schedule();
+		mutex_lock(&ftraced_lock);
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+		set_current_state(TASK_INTERRUPTIBLE);
+	} while (last_counter == ftraced_iteration_counter);
+
+	mutex_unlock(&ftraced_lock);
+	remove_wait_queue(&ftraced_waiters, &wait);
+	set_current_state(TASK_RUNNING);
+
+	return ret;
+}
+
 static __init int ftrace_init_debugfs(void)
 {
 	struct dentry *d_tracer;
@@ -1095,6 +1145,7 @@ static int __init notrace ftrace_dynamic_init(void)
 		return -1;
 
 	last_ftrace_enabled = ftrace_enabled = 1;
+	ftraced_task = p;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 60a11774b38fef1ab90b18c5353bd1c7c4d311c8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:44 +0200
Subject: ftrace: add self-tests

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig              |  13 ++
 kernel/trace/trace.c              |  63 +++++-
 kernel/trace/trace.h              |  31 +++
 kernel/trace/trace_functions.c    |   3 +
 kernel/trace/trace_irqsoff.c      |   9 +
 kernel/trace/trace_sched_switch.c |   3 +
 kernel/trace/trace_sched_wakeup.c |   3 +
 kernel/trace/trace_selftest.c     | 415 ++++++++++++++++++++++++++++++++++++++
 8 files changed, 538 insertions(+), 2 deletions(-)
 create mode 100644 kernel/trace/trace_selftest.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index cad9db1dee02..3f73a1710242 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -105,3 +105,16 @@ config DYNAMIC_FTRACE
 	 wakes up once a second and checks to see if any ftrace calls
 	 were made. If so, it runs stop_machine (stops all CPUS)
 	 and modifies the code to jump over the call to ftrace.
+
+config FTRACE_SELFTEST
+	bool
+
+config FTRACE_STARTUP_TEST
+	bool "Perform a startup test on ftrace"
+	depends on TRACING
+	select FTRACE_SELFTEST
+	help
+	  This option performs a series of startup tests on ftrace. On bootup
+	  a series of tests are made to verify that the tracer is
+	  functioning properly. It will do tests on all the configured
+	  tracers of ftrace.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9bad2379115a..f6d026f17dbb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,8 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
+static int tracing_disabled = 1;
+
 static long notrace
 ns2usecs(cycle_t nsec)
 {
@@ -217,11 +219,48 @@ int register_tracer(struct tracer *type)
 		}
 	}
 
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	if (type->selftest) {
+		struct tracer *saved_tracer = current_trace;
+		struct trace_array_cpu *data;
+		struct trace_array *tr = &global_trace;
+		int saved_ctrl = tr->ctrl;
+		int i;
+		/*
+		 * Run a selftest on this tracer.
+		 * Here we reset the trace buffer, and set the current
+		 * tracer to be this tracer. The tracer can then run some
+		 * internal tracing to verify that everything is in order.
+		 * If we fail, we do not register this tracer.
+		 */
+		for_each_possible_cpu(i) {
+			if (!data->trace)
+				continue;
+			data = tr->data[i];
+			tracing_reset(data);
+		}
+		current_trace = type;
+		tr->ctrl = 0;
+		/* the test is responsible for initializing and enabling */
+		pr_info("Testing tracer %s: ", type->name);
+		ret = type->selftest(type, tr);
+		/* the test is responsible for resetting too */
+		current_trace = saved_tracer;
+		tr->ctrl = saved_ctrl;
+		if (ret) {
+			printk(KERN_CONT "FAILED!\n");
+			goto out;
+		}
+		printk(KERN_CONT "PASSED\n");
+	}
+#endif
+
 	type->next = trace_types;
 	trace_types = type;
 	len = strlen(type->name);
 	if (len > max_tracer_type_len)
 		max_tracer_type_len = len;
+
  out:
 	mutex_unlock(&trace_types_lock);
 
@@ -985,6 +1024,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 {
 	struct trace_iterator *iter;
 
+	if (tracing_disabled) {
+		*ret = -ENODEV;
+		return NULL;
+	}
+
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter) {
 		*ret = -ENOMEM;
@@ -1023,6 +1067,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 
 int tracing_open_generic(struct inode *inode, struct file *filp)
 {
+	if (tracing_disabled)
+		return -ENODEV;
+
 	filp->private_data = inode->i_private;
 	return 0;
 }
@@ -1128,6 +1175,9 @@ static int show_traces_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
+	if (tracing_disabled)
+		return -ENODEV;
+
 	ret = seq_open(file, &show_traces_seq_ops);
 	if (!ret) {
 		struct seq_file *m = file->private_data;
@@ -1452,6 +1502,11 @@ struct dentry *tracing_init_dentry(void)
 	return d_tracer;
 }
 
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
 static __init void tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
@@ -1585,6 +1640,7 @@ __init static int tracer_alloc_buffers(void)
 	void *array;
 	struct page *page;
 	int pages = 0;
+	int ret = -ENOMEM;
 	int i;
 
 	/* Allocate the first page for all buffers */
@@ -1650,6 +1706,9 @@ __init static int tracer_alloc_buffers(void)
 	register_tracer(&no_tracer);
 	current_trace = &no_tracer;
 
+	/* All seems OK, enable tracing */
+	tracing_disabled = 0;
+
 	return 0;
 
  free_buffers:
@@ -1678,7 +1737,7 @@ __init static int tracer_alloc_buffers(void)
 		}
 #endif
 	}
-	return -ENOMEM;
+	return ret;
 }
 
-device_initcall(tracer_alloc_buffers);
+fs_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 83e257e38084..88edbf1f6788 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -99,6 +99,10 @@ struct tracer {
 	void			(*start)(struct trace_iterator *iter);
 	void			(*stop)(struct trace_iterator *iter);
 	void			(*ctrl_update)(struct trace_array *tr);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+	int			(*selftest)(struct tracer *trace,
+					    struct trace_array *tr);
+#endif
 	struct tracer		*next;
 	int			print_max;
 };
@@ -185,4 +189,31 @@ extern int unregister_tracer_switch(struct tracer_switch_ops *ops);
 extern unsigned long ftrace_update_tot_cnt;
 #endif
 
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+#ifdef CONFIG_FTRACE
+extern int trace_selftest_startup_function(struct tracer *trace,
+					   struct trace_array *tr);
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+extern int trace_selftest_startup_irqsoff(struct tracer *trace,
+					  struct trace_array *tr);
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+extern int trace_selftest_startup_preemptoff(struct tracer *trace,
+					     struct trace_array *tr);
+#endif
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
+						 struct trace_array *tr);
+#endif
+#ifdef CONFIG_SCHED_TRACER
+extern int trace_selftest_startup_wakeup(struct tracer *trace,
+					 struct trace_array *tr);
+#endif
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern int trace_selftest_startup_sched_switch(struct tracer *trace,
+					       struct trace_array *tr);
+#endif
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 82988c5336e0..5d8ad7a09605 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -63,6 +63,9 @@ static struct tracer function_trace __read_mostly =
 	.init	     = function_trace_init,
 	.reset	     = function_trace_reset,
 	.ctrl_update = function_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_function,
+#endif
 };
 
 static __init int init_function_trace(void)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 74165f611f36..14183b8f79c5 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -432,6 +432,9 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.close		= irqsoff_tracer_close,
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_irqsoff,
+#endif
 };
 # define register_irqsoff(trace) register_tracer(&trace)
 #else
@@ -455,6 +458,9 @@ static struct tracer preemptoff_tracer __read_mostly =
 	.close		= irqsoff_tracer_close,
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptoff,
+#endif
 };
 # define register_preemptoff(trace) register_tracer(&trace)
 #else
@@ -480,6 +486,9 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 	.close		= irqsoff_tracer_close,
 	.ctrl_update	= irqsoff_tracer_ctrl_update,
 	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptirqsoff,
+#endif
 };
 
 # define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 2715267be469..6c9284103a62 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -107,6 +107,9 @@ static struct tracer sched_switch_trace __read_mostly =
 	.init		= sched_switch_trace_init,
 	.reset		= sched_switch_trace_reset,
 	.ctrl_update	= sched_switch_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_sched_switch,
+#endif
 };
 
 __init static int init_sched_switch_trace(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7c3ccefcf4c3..3d10ff01f805 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -295,6 +295,9 @@ static struct tracer wakeup_tracer __read_mostly =
 	.close		= wakeup_tracer_close,
 	.ctrl_update	= wakeup_tracer_ctrl_update,
 	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_wakeup,
+#endif
 };
 
 __init static int init_wakeup_tracer(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644
index 000000000000..ef4d3cc009f5
--- /dev/null
+++ b/kernel/trace/trace_selftest.c
@@ -0,0 +1,415 @@
+/* Include in trace.c */
+
+#include <linux/kthread.h>
+
+static inline int trace_valid_entry(struct trace_entry *entry)
+{
+	switch (entry->type) {
+	case TRACE_FN:
+	case TRACE_CTX:
+		return 1;
+	}
+	return 0;
+}
+
+static int
+trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
+{
+	struct page *page;
+	struct trace_entry *entries;
+	int idx = 0;
+	int i;
+
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	entries = page_address(page);
+
+	if (data->trace != entries)
+		goto failed;
+
+	/*
+	 * The starting trace buffer always has valid elements,
+	 * if any element exits.
+	 */
+	entries = data->trace;
+
+	for (i = 0; i < tr->entries; i++) {
+
+		if (i < data->trace_idx &&
+		    !trace_valid_entry(&entries[idx])) {
+			printk(KERN_CONT ".. invalid entry %d ", entries[idx].type);
+			goto failed;
+		}
+
+		idx++;
+		if (idx >= ENTRIES_PER_PAGE) {
+			page = virt_to_page(entries);
+			if (page->lru.next == &data->trace_pages) {
+				if (i != tr->entries - 1) {
+					printk(KERN_CONT ".. entries buffer mismatch");
+					goto failed;
+				}
+			} else {
+				page = list_entry(page->lru.next, struct page, lru);
+				entries = page_address(page);
+			}
+			idx = 0;
+		}
+	}
+
+	page = virt_to_page(entries);
+	if (page->lru.next != &data->trace_pages) {
+		printk(KERN_CONT ".. too many entries");
+		goto failed;
+	}
+
+	return 0;
+
+ failed:
+	printk(KERN_CONT ".. corrupted trace buffer .. ");
+	return -1;
+}
+
+/*
+ * Test the trace buffer to see if all the elements
+ * are still sane.
+ */
+static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+{
+	unsigned long cnt = 0;
+	int cpu;
+	int ret = 0;
+
+	for_each_possible_cpu(cpu) {
+		if (!tr->data[cpu]->trace)
+			continue;
+
+		cnt += tr->data[cpu]->trace_idx;
+		printk("%d: count = %ld\n", cpu, cnt);
+
+		ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
+		if (ret)
+			break;
+	}
+
+	if (count)
+		*count = cnt;
+
+	return ret;
+}
+
+#ifdef CONFIG_FTRACE
+/*
+ * Simple verification test of ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace
+ * buffer to see if all is in order.
+ */
+int
+trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* make sure functions have been recorded */
+	ret = ftrace_force_update();
+	if (ret) {
+		printk(KERN_CONT ".. ftraced failed .. ");
+		return ret;
+	}
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_FTRACE */
+
+#ifdef CONFIG_IRQSOFF_TRACER
+int
+trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+	/* disable interrupts for a bit */
+	local_irq_disable();
+	udelay(100);
+	local_irq_enable();
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+int
+trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+	/* disable preemption for a bit */
+	preempt_disable();
+	udelay(100);
+	preempt_enable();
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+int
+trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+
+	/* reset the max latency */
+	tracing_max_latency = 0;
+
+	/* disable preemption and interrupts for a bit */
+	preempt_disable();
+	local_irq_disable();
+	udelay(100);
+	preempt_enable();
+	/* reverse the order of preempt vs irqs */
+	local_irq_enable();
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (ret)
+		goto out;
+
+	ret = trace_test_buffer(&max_tr, &count);
+	if (ret)
+		goto out;
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	/* do the test by disabling interrupts first this time */
+	tracing_max_latency = 0;
+	tr->ctrl = 1;
+	trace->ctrl_update(tr);
+	preempt_disable();
+	local_irq_disable();
+	udelay(100);
+	preempt_enable();
+	/* reverse the order of preempt vs irqs */
+	local_irq_enable();
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (ret)
+		goto out;
+
+	ret = trace_test_buffer(&max_tr, &count);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+ out:
+	trace->reset(tr);
+	tracing_max_latency = save_max;
+
+	return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+
+#ifdef CONFIG_SCHED_TRACER
+static int trace_wakeup_test_thread(void *data)
+{
+	struct completion *x = data;
+
+	/* Make this a RT thread, doesn't need to be too high */
+
+	rt_mutex_setprio(current, MAX_RT_PRIO - 5);
+
+	/* Make it know we have a new prio */
+	complete(x);
+
+	/* now go to sleep and let the test wake us up */
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule();
+
+	/* we are awake, now wait to disappear */
+	while (!kthread_should_stop()) {
+		/*
+		 * This is an RT task, do short sleeps to let
+		 * others run.
+		 */
+		msleep(100);
+	}
+
+	return 0;
+}
+
+int
+trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long save_max = tracing_max_latency;
+	struct task_struct *p;
+	struct completion isrt;
+	unsigned long count;
+	int ret;
+
+	init_completion(&isrt);
+
+	/* create a high prio thread */
+	p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
+	if (!IS_ERR(p)) {
+		printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
+		return -1;
+	}
+
+	/* make sure the thread is running at an RT prio */
+	wait_for_completion(&isrt);
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* reset the max latency */
+	tracing_max_latency = 0;
+
+	/* sleep to let the RT thread sleep too */
+	msleep(100);
+
+	/*
+	 * Yes this is slightly racy. It is possible that for some
+	 * strange reason that the RT thread we created, did not
+	 * call schedule for 100ms after doing the completion,
+	 * and we do a wakeup on a task that already is awake.
+	 * But that is extremely unlikely, and the worst thing that
+	 * happens in such a case, is that we disable tracing.
+	 * Honestly, if this race does happen something is horrible
+	 * wrong with the system.
+	 */
+
+	wake_up_process(p);
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check both trace buffers */
+	ret = trace_test_buffer(tr, NULL);
+	if (!ret)
+		ret = trace_test_buffer(&max_tr, &count);
+
+
+	trace->reset(tr);
+
+	tracing_max_latency = save_max;
+
+	/* kill the thread */
+	kthread_stop(p);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_SCHED_TRACER */
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+int
+trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#endif /* CONFIG_DYNAMIC_FTRACE */
-- 
cgit v1.2.3


From c7aafc549766b87819285d3480648fc652a47bc4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:45 +0200
Subject: ftrace: cleanups

factor out code and clean it up.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c             |   8 +--
 kernel/trace/trace.c              | 144 ++++++++++++++++++++++++--------------
 kernel/trace/trace.h              |   8 ++-
 kernel/trace/trace_irqsoff.c      |  32 ++++-----
 kernel/trace/trace_sched_wakeup.c |  18 ++---
 kernel/trace/trace_selftest.c     |  25 ++++---
 6 files changed, 133 insertions(+), 102 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4facf5ceeb86..6d4d2e86debc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1152,10 +1152,10 @@ static int __init notrace ftrace_dynamic_init(void)
 
 core_initcall(ftrace_dynamic_init);
 #else
-# define ftrace_startup()	  do { } while (0)
-# define ftrace_shutdown()	  do { } while (0)
-# define ftrace_startup_sysctl()  do { } while (0)
-# define ftrace_shutdown_sysctl() do { } while (0)
+# define ftrace_startup()		do { } while (0)
+# define ftrace_shutdown()		do { } while (0)
+# define ftrace_startup_sysctl()	do { } while (0)
+# define ftrace_shutdown_sysctl()	do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f6d026f17dbb..61d2f0228866 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -142,12 +142,59 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tracing_record_cmdline(current);
 }
 
+void check_pages(struct trace_array_cpu *data)
+{
+	struct page *page, *tmp;
+
+	BUG_ON(data->trace_pages.next->prev != &data->trace_pages);
+	BUG_ON(data->trace_pages.prev->next != &data->trace_pages);
+
+	list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
+		BUG_ON(page->lru.next->prev != &page->lru);
+		BUG_ON(page->lru.prev->next != &page->lru);
+	}
+}
+
+void *head_page(struct trace_array_cpu *data)
+{
+	struct page *page;
+
+	check_pages(data);
+	if (list_empty(&data->trace_pages))
+		return NULL;
+
+	page = list_entry(data->trace_pages.next, struct page, lru);
+	BUG_ON(&page->lru == &data->trace_pages);
+
+	return page_address(page);
+}
+
+notrace static void
+flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
+{
+	struct list_head flip_pages;
+
+	INIT_LIST_HEAD(&flip_pages);
+
+	tr1->trace_current = NULL;
+	memcpy(&tr1->trace_current_idx, &tr2->trace_current_idx,
+		sizeof(struct trace_array_cpu) -
+		offsetof(struct trace_array_cpu, trace_current_idx));
+
+	check_pages(tr1);
+	check_pages(tr2);
+	list_splice_init(&tr1->trace_pages, &flip_pages);
+	list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
+	list_splice_init(&flip_pages, &tr2->trace_pages);
+	BUG_ON(!list_empty(&flip_pages));
+	check_pages(tr1);
+	check_pages(tr2);
+}
+
 notrace void
 update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct trace_array_cpu *data;
-	void *save_trace;
-	struct list_head save_pages;
 	int i;
 
 	WARN_ON_ONCE(!irqs_disabled());
@@ -155,11 +202,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	/* clear out all the previous traces */
 	for_each_possible_cpu(i) {
 		data = tr->data[i];
-		save_trace = max_tr.data[i]->trace;
-		save_pages = max_tr.data[i]->trace_pages;
-		memcpy(max_tr.data[i], data, sizeof(*data));
-		data->trace = save_trace;
-		data->trace_pages = save_pages;
+		flip_trace(max_tr.data[i], data);
 		tracing_reset(data);
 	}
 
@@ -177,8 +220,6 @@ notrace void
 update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct trace_array_cpu *data = tr->data[cpu];
-	void *save_trace;
-	struct list_head save_pages;
 	int i;
 
 	WARN_ON_ONCE(!irqs_disabled());
@@ -186,11 +227,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	for_each_possible_cpu(i)
 		tracing_reset(max_tr.data[i]);
 
-	save_trace = max_tr.data[cpu]->trace;
-	save_pages = max_tr.data[cpu]->trace_pages;
-	memcpy(max_tr.data[cpu], data, sizeof(*data));
-	data->trace = save_trace;
-	data->trace_pages = save_pages;
+	flip_trace(max_tr.data[cpu], data);
+
 	tracing_reset(data);
 
 	__update_max_tr(tr, tsk, cpu);
@@ -234,9 +272,9 @@ int register_tracer(struct tracer *type)
 		 * If we fail, we do not register this tracer.
 		 */
 		for_each_possible_cpu(i) {
-			if (!data->trace)
-				continue;
 			data = tr->data[i];
+			if (!head_page(data))
+				continue;
 			tracing_reset(data);
 		}
 		current_trace = type;
@@ -298,7 +336,7 @@ void unregister_tracer(struct tracer *type)
 void notrace tracing_reset(struct trace_array_cpu *data)
 {
 	data->trace_idx = 0;
-	data->trace_current = data->trace;
+	data->trace_current = head_page(data);
 	data->trace_current_idx = 0;
 }
 
@@ -425,26 +463,31 @@ notrace void tracing_record_cmdline(struct task_struct *tsk)
 }
 
 static inline notrace struct trace_entry *
-tracing_get_trace_entry(struct trace_array *tr,
-			struct trace_array_cpu *data)
+tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
 {
 	unsigned long idx, idx_next;
 	struct trace_entry *entry;
-	struct page *page;
 	struct list_head *next;
+	struct page *page;
 
 	data->trace_idx++;
 	idx = data->trace_current_idx;
 	idx_next = idx + 1;
 
+	BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
+
 	entry = data->trace_current + idx * TRACE_ENTRY_SIZE;
 
 	if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
 		page = virt_to_page(data->trace_current);
-		if (unlikely(&page->lru == data->trace_pages.prev))
-			next = data->trace_pages.next;
-		else
-			next = page->lru.next;
+		/*
+		 * Roundrobin - but skip the head (which is not a real page):
+		 */
+		next = page->lru.next;
+		if (unlikely(next == &data->trace_pages))
+			next = next->next;
+		BUG_ON(next == &data->trace_pages);
+
 		page = list_entry(next, struct page, lru);
 		data->trace_current = page_address(page);
 		idx_next = 0;
@@ -456,18 +499,17 @@ tracing_get_trace_entry(struct trace_array *tr,
 }
 
 static inline notrace void
-tracing_generic_entry_update(struct trace_entry *entry,
-			     unsigned long flags)
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
 {
 	struct task_struct *tsk = current;
 	unsigned long pc;
 
 	pc = preempt_count();
 
-	entry->idx	= atomic_inc_return(&tracer_counter);
-	entry->preempt_count = pc & 0xff;
-	entry->pid	 = tsk->pid;
-	entry->t	 = now(raw_smp_processor_id());
+	entry->idx		= atomic_inc_return(&tracer_counter);
+	entry->preempt_count	= pc & 0xff;
+	entry->pid		= tsk->pid;
+	entry->t		= now(raw_smp_processor_id());
 	entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
 		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
 		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
@@ -476,16 +518,15 @@ tracing_generic_entry_update(struct trace_entry *entry,
 
 notrace void
 ftrace(struct trace_array *tr, struct trace_array_cpu *data,
-		       unsigned long ip, unsigned long parent_ip,
-		       unsigned long flags)
+       unsigned long ip, unsigned long parent_ip, unsigned long flags)
 {
 	struct trace_entry *entry;
 
-	entry = tracing_get_trace_entry(tr, data);
+	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
-	entry->type	    = TRACE_FN;
-	entry->fn.ip	    = ip;
-	entry->fn.parent_ip = parent_ip;
+	entry->type		= TRACE_FN;
+	entry->fn.ip		= ip;
+	entry->fn.parent_ip	= parent_ip;
 }
 
 notrace void
@@ -496,7 +537,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 {
 	struct trace_entry *entry;
 
-	entry = tracing_get_trace_entry(tr, data);
+	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
 	entry->type		= TRACE_CTX;
 	entry->ctx.prev_pid	= prev->pid;
@@ -540,6 +581,8 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 	}
 
 	page = list_entry(iter->next_page[cpu], struct page, lru);
+	BUG_ON(&data->trace_pages == &page->lru);
+
 	array = page_address(page);
 
 	return &array[iter->next_page_idx[cpu]];
@@ -554,7 +597,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		if (!tr->data[cpu]->trace)
+		if (!head_page(tr->data[cpu]))
 			continue;
 		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
 		if (ent &&
@@ -762,7 +805,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 		name = type->name;
 
 	for_each_possible_cpu(cpu) {
-		if (tr->data[cpu]->trace) {
+		if (head_page(tr->data[cpu])) {
 			total += tr->data[cpu]->trace_idx;
 			if (tr->data[cpu]->trace_idx > tr->entries)
 				entries += tr->entries;
@@ -975,8 +1018,7 @@ static int trace_empty(struct trace_iterator *iter)
 	for_each_possible_cpu(cpu) {
 		data = iter->tr->data[cpu];
 
-		if (data->trace &&
-		    data->trace_idx)
+		if (head_page(data) && data->trace_idx)
 			return 0;
 	}
 	return 1;
@@ -1576,9 +1618,9 @@ static struct tracer no_tracer __read_mostly =
 static int trace_alloc_page(void)
 {
 	struct trace_array_cpu *data;
-	void *array;
 	struct page *page, *tmp;
 	LIST_HEAD(pages);
+	void *array;
 	int i;
 
 	/* first allocate a page for each CPU */
@@ -1610,14 +1652,14 @@ static int trace_alloc_page(void)
 	for_each_possible_cpu(i) {
 		data = global_trace.data[i];
 		page = list_entry(pages.next, struct page, lru);
-		list_del(&page->lru);
+		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
 		ClearPageLRU(page);
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 		data = max_tr.data[i];
 		page = list_entry(pages.next, struct page, lru);
-		list_del(&page->lru);
+		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
 		SetPageLRU(page);
 #endif
@@ -1628,7 +1670,7 @@ static int trace_alloc_page(void)
 
  free_pages:
 	list_for_each_entry_safe(page, tmp, &pages, lru) {
-		list_del(&page->lru);
+		list_del_init(&page->lru);
 		__free_page(page);
 	}
 	return -ENOMEM;
@@ -1654,7 +1696,6 @@ __init static int tracer_alloc_buffers(void)
 			       "for trace buffer!\n");
 			goto free_buffers;
 		}
-		data->trace = array;
 
 		/* set the array to the list */
 		INIT_LIST_HEAD(&data->trace_pages);
@@ -1671,7 +1712,6 @@ __init static int tracer_alloc_buffers(void)
 			       "for trace buffer!\n");
 			goto free_buffers;
 		}
-		max_tr.data[i]->trace = array;
 
 		INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
 		page = virt_to_page(array);
@@ -1716,24 +1756,22 @@ __init static int tracer_alloc_buffers(void)
 		struct page *page, *tmp;
 		struct trace_array_cpu *data = global_trace.data[i];
 
-		if (data && data->trace) {
+		if (data) {
 			list_for_each_entry_safe(page, tmp,
 						 &data->trace_pages, lru) {
-				list_del(&page->lru);
+				list_del_init(&page->lru);
 				__free_page(page);
 			}
-			data->trace = NULL;
 		}
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 		data = max_tr.data[i];
-		if (data && data->trace) {
+		if (data) {
 			list_for_each_entry_safe(page, tmp,
 						 &data->trace_pages, lru) {
-				list_del(&page->lru);
+				list_del_init(&page->lru);
 				__free_page(page);
 			}
-			data->trace = NULL;
 		}
 #endif
 	}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 88edbf1f6788..cc1d34b8b771 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -53,12 +53,12 @@ struct trace_entry {
  * the trace, etc.)
  */
 struct trace_array_cpu {
-	void			*trace;
 	void			*trace_current;
-	unsigned		trace_current_idx;
 	struct list_head	trace_pages;
-	unsigned long		trace_idx;
 	atomic_t		disabled;
+	/* these fields get copied into max-trace: */
+	unsigned		trace_current_idx;
+	unsigned long		trace_idx;
 	unsigned long		saved_latency;
 	unsigned long		critical_start;
 	unsigned long		critical_end;
@@ -216,4 +216,6 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 #endif
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
+extern void *head_page(struct trace_array_cpu *data);
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 14183b8f79c5..2dfebb67fdfb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -144,7 +144,7 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(delta))
 		goto out;
 
-	spin_lock(&max_trace_lock);
+	spin_lock_irqsave(&max_trace_lock, flags);
 
 	/* check if we are still the max latency */
 	if (!report_latency(delta))
@@ -165,32 +165,24 @@ check_critical_timing(struct trace_array *tr,
 
 	update_max_tr_single(tr, current, cpu);
 
-	if (tracing_thresh)
-		printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section "
-		       "violates %lu us threshold.\n"
-		       " => started at timestamp %lu: ",
+	if (tracing_thresh) {
+		printk(KERN_INFO "(%16s-%-5d|#%d):"
+			" %lu us critical section violates %lu us threshold.\n",
 				current->comm, current->pid,
 				raw_smp_processor_id(),
-				latency, nsecs_to_usecs(tracing_thresh), t0);
-	else
+				latency, nsecs_to_usecs(tracing_thresh));
+	} else {
 		printk(KERN_INFO "(%16s-%-5d|#%d):"
-		       " new %lu us maximum-latency "
-		       "critical section.\n => started at timestamp %lu: ",
+		       " new %lu us maximum-latency critical section.\n",
 				current->comm, current->pid,
 				raw_smp_processor_id(),
-				latency, t0);
-
-	print_symbol(KERN_CONT "<%s>\n", data->critical_start);
-	printk(KERN_CONT " =>   ended at timestamp %lu: ", t1);
-	print_symbol(KERN_CONT "<%s>\n", data->critical_end);
-	dump_stack();
-	t1 = nsecs_to_usecs(now(cpu));
-	printk(KERN_CONT " =>   dump-end timestamp %lu\n\n", t1);
+				latency);
+	}
 
 	max_sequence++;
 
 out_unlock:
-	spin_unlock(&max_trace_lock);
+	spin_unlock_irqrestore(&max_trace_lock, flags);
 
 out:
 	data->critical_sequence = max_sequence;
@@ -216,7 +208,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(!data) || unlikely(!data->trace) ||
+	if (unlikely(!data) || unlikely(!head_page(data)) ||
 	    atomic_read(&data->disabled))
 		return;
 
@@ -256,7 +248,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(!data) || unlikely(!data->trace) ||
+	if (unlikely(!data) || unlikely(!head_page(data)) ||
 	    !data->critical_start || atomic_read(&data->disabled))
 		return;
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3d10ff01f805..688df965f3f2 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -107,24 +107,18 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
 	update_max_tr(tr, wakeup_task, wakeup_cpu);
 
 	if (tracing_thresh) {
-		printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency "
-		       "violates %lu us threshold.\n"
-		       " => started at timestamp %lu: ",
+		printk(KERN_INFO "(%16s-%-5d|#%d):"
+			" %lu us wakeup latency violates %lu us threshold.\n",
 				wakeup_task->comm, wakeup_task->pid,
 				raw_smp_processor_id(),
-				latency, nsecs_to_usecs(tracing_thresh), t0);
+				latency, nsecs_to_usecs(tracing_thresh));
 	} else {
-		printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum "
-		       "wakeup latency.\n => started at timestamp %lu: ",
+		printk(KERN_INFO "(%16s-%-5d|#%d):"
+			" new %lu us maximum wakeup latency.\n",
 				wakeup_task->comm, wakeup_task->pid,
-				cpu, latency, t0);
+				cpu, latency);
 	}
 
-	printk(KERN_CONT "   ended at timestamp %lu: ", t1);
-	dump_stack();
-	t1 = nsecs_to_usecs(now(cpu));
-	printk(KERN_CONT "   dump-end timestamp %lu\n\n", t1);
-
 out_unlock:
 	__wakeup_reset(tr);
 	spin_unlock_irqrestore(&wakeup_lock, flags);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index ef4d3cc009f5..c01874c3b1f9 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,6 +1,7 @@
 /* Include in trace.c */
 
 #include <linux/kthread.h>
+#include <linux/delay.h>
 
 static inline int trace_valid_entry(struct trace_entry *entry)
 {
@@ -15,28 +16,29 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 static int
 trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
 {
-	struct page *page;
 	struct trace_entry *entries;
+	struct page *page;
 	int idx = 0;
 	int i;
 
+	BUG_ON(list_empty(&data->trace_pages));
 	page = list_entry(data->trace_pages.next, struct page, lru);
 	entries = page_address(page);
 
-	if (data->trace != entries)
+	if (head_page(data) != entries)
 		goto failed;
 
 	/*
 	 * The starting trace buffer always has valid elements,
-	 * if any element exits.
+	 * if any element exists.
 	 */
-	entries = data->trace;
+	entries = head_page(data);
 
 	for (i = 0; i < tr->entries; i++) {
 
-		if (i < data->trace_idx &&
-		    !trace_valid_entry(&entries[idx])) {
-			printk(KERN_CONT ".. invalid entry %d ", entries[idx].type);
+		if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
+			printk(KERN_CONT ".. invalid entry %d ",
+				entries[idx].type);
 			goto failed;
 		}
 
@@ -80,11 +82,10 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 	int ret = 0;
 
 	for_each_possible_cpu(cpu) {
-		if (!tr->data[cpu]->trace)
+		if (!head_page(tr->data[cpu]))
 			continue;
 
 		cnt += tr->data[cpu]->trace_idx;
-		printk("%d: count = %ld\n", cpu, cnt);
 
 		ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
 		if (ret)
@@ -117,6 +118,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	}
 
 	/* start the tracing */
+	ftrace_enabled = 1;
+
 	tr->ctrl = 1;
 	trace->init(tr);
 	/* Sleep for a 1/10 of a second */
@@ -124,6 +127,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	/* stop the tracing. */
 	tr->ctrl = 0;
 	trace->ctrl_update(tr);
+	ftrace_enabled = 0;
+
 	/* check the trace buffer */
 	ret = trace_test_buffer(tr, &count);
 	trace->reset(tr);
@@ -328,7 +333,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 
 	/* create a high prio thread */
 	p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
-	if (!IS_ERR(p)) {
+	if (IS_ERR(p)) {
 		printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
 		return -1;
 	}
-- 
cgit v1.2.3


From 7bd2f24c2f769e3f8f1d4fc8b9fddf689825f6a7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:45 +0200
Subject: ftrace: add README

make it easier for newbies to find their way around.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 61d2f0228866..c736dd2e068d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -995,6 +995,7 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter)
 			seq_printf(m, " <-");
 			seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags);
 		}
+		seq_printf(m, "\n");
 		break;
 	case TRACE_CTX:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
@@ -1007,7 +1008,6 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter)
 			   entry->ctx.next_prio);
 		break;
 	}
-	seq_printf(m, "\n");
 }
 
 static int trace_empty(struct trace_iterator *iter)
@@ -1332,6 +1332,39 @@ static struct file_operations tracing_iter_fops = {
 	.write = tracing_iter_ctrl_write,
 };
 
+static const char readme_msg[] =
+	"tracing mini-HOWTO:\n\n"
+	"# mkdir /debug\n"
+	"# mount -t debugfs nodev /debug\n\n"
+	"# cat /debug/tracing/available_tracers\n"
+	"wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+	"# cat /debug/tracing/current_tracer\n"
+	"none\n"
+	"# echo sched_switch > /debug/tracing/current_tracer\n"
+	"# cat /debug/tracing/current_tracer\n"
+	"sched_switch\n"
+	"# cat /debug/tracing/iter_ctrl\n"
+	"noprint-parent nosym-offset nosym-addr noverbose\n"
+	"# echo print-parent > /debug/tracing/iter_ctrl\n"
+	"# echo 1 > /debug/tracing/tracing_enabled\n"
+	"# cat /debug/tracing/trace > /tmp/trace.txt\n"
+	"echo 0 > /debug/tracing/tracing_enabled\n"
+;
+
+static ssize_t
+tracing_readme_read(struct file *filp, char __user *ubuf,
+		       size_t cnt, loff_t *ppos)
+{
+	return simple_read_from_buffer(ubuf, cnt, ppos,
+					readme_msg, strlen(readme_msg));
+}
+
+static struct file_operations tracing_readme_fops = {
+	.open = tracing_open_generic,
+	.read = tracing_readme_read,
+};
+
+
 static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
@@ -1598,6 +1631,11 @@ static __init void tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'tracing_threash' entry\n");
+	entry = debugfs_create_file("README", 0644, d_tracer,
+				    NULL, &tracing_readme_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'README' entry\n");
+
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
-- 
cgit v1.2.3


From 77a2b37d227483fe52aead242652aee406c25bf0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:45 +0200
Subject: ftrace: startup tester on dynamic tracing.

This patch adds a startup self test on dynamic code modification
and filters. The test filters on a specific function, makes sure that
no other function is traced, exectutes the function, then makes sure that
the function is traced.

This patch also fixes a slight bug with the ftrace selftest, where
tracer_enabled was not being set.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c         |  19 +++++++
 kernel/trace/trace_selftest.c | 113 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 128 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d4d2e86debc..5e9389faaf75 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1010,6 +1010,25 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
 	return ret;
 }
 
+/**
+ * ftrace_set_filter - set a function to filter on in ftrace
+ * @buf - the string that holds the function filter text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ */
+notrace void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+	mutex_lock(&ftrace_filter_lock);
+	if (reset)
+		ftrace_filter_reset();
+	if (buf)
+		ftrace_match(buf, len);
+	mutex_unlock(&ftrace_filter_lock);
+}
+
 static int notrace
 ftrace_filter_release(struct inode *inode, struct file *file)
 {
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c01874c3b1f9..4c8a1b2d8231 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -99,6 +99,100 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 }
 
 #ifdef CONFIG_FTRACE
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+#define __STR(x) #x
+#define STR(x) __STR(x)
+static int DYN_FTRACE_TEST_NAME(void)
+{
+	/* used to call mcount */
+	return 0;
+}
+
+/* Test dynamic code modification and ftrace filters */
+int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+					   struct trace_array *tr,
+					   int (*func)(void))
+{
+	unsigned long count;
+	int ret;
+	int save_ftrace_enabled = ftrace_enabled;
+	int save_tracer_enabled = tracer_enabled;
+
+	/* The ftrace test PASSED */
+	printk(KERN_CONT "PASSED\n");
+	pr_info("Testing dynamic ftrace: ");
+
+	/* enable tracing, and record the filter function */
+	ftrace_enabled = 1;
+	tracer_enabled = 1;
+
+	/* passed in by parameter to fool gcc from optimizing */
+	func();
+
+	/* update the records */
+	ret = ftrace_force_update();
+	if (ret) {
+		printk(KERN_CONT ".. ftraced failed .. ");
+		return ret;
+	}
+
+	/* filter only on our function */
+	ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME),
+			  sizeof(STR(DYN_FTRACE_TEST_NAME)), 1);
+
+	/* enable tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+
+	/* we should have nothing in the buffer */
+	ret = trace_test_buffer(tr, &count);
+	if (ret)
+		goto out;
+
+	if (count) {
+		ret = -1;
+		printk(KERN_CONT ".. filter did not filter .. ");
+		goto out;
+	}
+
+	/* call our function again */
+	func();
+
+	/* sleep again */
+	msleep(100);
+
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	ftrace_enabled = 0;
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	/* we should only have one item */
+	if (!ret && count != 1) {
+		printk(KERN_CONT ".. filter failed ..");
+		ret = -1;
+		goto out;
+	}
+ out:
+	ftrace_enabled = save_ftrace_enabled;
+	tracer_enabled = save_tracer_enabled;
+
+	/* Enable tracing on all functions again */
+	ftrace_set_filter(NULL, 0, 1);
+
+	return ret;
+}
+#else
+# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+#endif /* CONFIG_DYNAMIC_FTRACE */
 /*
  * Simple verification test of ftrace function tracer.
  * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -109,8 +203,13 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 {
 	unsigned long count;
 	int ret;
+	int save_ftrace_enabled = ftrace_enabled;
+	int save_tracer_enabled = tracer_enabled;
 
-	/* make sure functions have been recorded */
+	/* make sure msleep has been recorded */
+	msleep(1);
+
+	/* force the recorded functions to be traced */
 	ret = ftrace_force_update();
 	if (ret) {
 		printk(KERN_CONT ".. ftraced failed .. ");
@@ -119,6 +218,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 
 	/* start the tracing */
 	ftrace_enabled = 1;
+	tracer_enabled = 1;
 
 	tr->ctrl = 1;
 	trace->init(tr);
@@ -136,8 +236,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
 		ret = -1;
+		goto out;
 	}
 
+	ret = trace_selftest_startup_dynamic_tracing(trace, tr,
+						     DYN_FTRACE_TEST_NAME);
+
+ out:
+	ftrace_enabled = save_ftrace_enabled;
+	tracer_enabled = save_tracer_enabled;
+
 	return ret;
 }
 #endif /* CONFIG_FTRACE */
@@ -415,6 +523,3 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 	return ret;
 }
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-#endif /* CONFIG_DYNAMIC_FTRACE */
-- 
cgit v1.2.3


From 4e3c3333f3bd7eedfd21b1155b3c7cd24fc7f754 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:45 +0200
Subject: ftrace: fix time offset

fix time offset calculations and ordering, plus make code more consistent.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 75 ++++++++++++++++++++++++++++++++++++++--------------
 kernel/trace/trace.h |  9 ++++++-
 2 files changed, 63 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c736dd2e068d..8755a4370484 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -120,7 +120,7 @@ static DEFINE_SPINLOCK(ftrace_max_lock);
  * structure. (this way the maximum trace is permanently saved,
  * for later retrieval via /debugfs/tracing/latency_trace)
  */
-static void notrace
+static notrace void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct trace_array_cpu *data = tr->data[cpu];
@@ -333,15 +333,16 @@ void unregister_tracer(struct tracer *type)
 	mutex_unlock(&trace_types_lock);
 }
 
-void notrace tracing_reset(struct trace_array_cpu *data)
+notrace void tracing_reset(struct trace_array_cpu *data)
 {
 	data->trace_idx = 0;
 	data->trace_current = head_page(data);
 	data->trace_current_idx = 0;
+	data->time_offset = 0;
 }
 
 #ifdef CONFIG_FTRACE
-static void notrace
+static notrace void
 function_trace_call(unsigned long ip, unsigned long parent_ip)
 {
 	struct trace_array *tr = &global_trace;
@@ -398,7 +399,7 @@ static void trace_init_cmdlines(void)
 
 notrace void trace_stop_cmdline_recording(void);
 
-static void notrace trace_save_cmdline(struct task_struct *tsk)
+static notrace void trace_save_cmdline(struct task_struct *tsk)
 {
 	unsigned map;
 	unsigned idx;
@@ -624,6 +625,7 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
 		iter->idx++;
 		iter->next_idx[next_cpu]++;
 		iter->next_page_idx[next_cpu]++;
+
 		if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) {
 			struct trace_array_cpu *data = iter->tr->data[next_cpu];
 
@@ -635,19 +637,21 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
 					data->trace_pages.next;
 		}
 	}
+	iter->prev_ent = iter->ent;
+	iter->prev_cpu = iter->cpu;
+
 	iter->ent = next;
 	iter->cpu = next_cpu;
 
 	return next ? iter : NULL;
 }
 
-static void notrace *
-s_next(struct seq_file *m, void *v, loff_t *pos)
+static notrace void *s_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct trace_iterator *iter = m->private;
-	void *ent;
 	void *last_ent = iter->ent;
 	int i = (int)*pos;
+	void *ent;
 
 	(*pos)++;
 
@@ -693,6 +697,8 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		iter->ent = NULL;
 		iter->cpu = 0;
 		iter->idx = -1;
+		iter->prev_ent = NULL;
+		iter->prev_cpu = -1;
 
 		for_each_possible_cpu(i) {
 			iter->next_idx[i] = 0;
@@ -752,7 +758,7 @@ seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address)
 # define IP_FMT "%016lx"
 #endif
 
-static void notrace
+static notrace void
 seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags)
 {
 	if (!ip) {
@@ -769,7 +775,7 @@ seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags)
 		seq_printf(m, " <" IP_FMT ">", ip);
 }
 
-static void notrace print_lat_help_header(struct seq_file *m)
+static notrace void print_lat_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#                _------=> CPU#            \n");
 	seq_puts(m, "#               / _-----=> irqs-off        \n");
@@ -782,14 +788,14 @@ static void notrace print_lat_help_header(struct seq_file *m)
 	seq_puts(m, "#     \\   /    |||||   \\   |   /           \n");
 }
 
-static void notrace print_func_help_header(struct seq_file *m)
+static notrace void print_func_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n");
 	seq_puts(m, "#              | |      |          |         |\n");
 }
 
 
-static void notrace
+static notrace void
 print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 {
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -858,7 +864,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	seq_puts(m, "\n");
 }
 
-static void notrace
+static notrace void
 lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
@@ -895,7 +901,7 @@ lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu)
 
 unsigned long preempt_mark_thresh = 100;
 
-static void notrace
+static notrace void
 lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs,
 		    unsigned long rel_usecs)
 {
@@ -910,7 +916,7 @@ lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs,
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
-static void notrace
+static notrace void
 print_lat_fmt(struct seq_file *m, struct trace_iterator *iter,
 	      unsigned int trace_idx, int cpu)
 {
@@ -966,20 +972,50 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter,
 	}
 }
 
-static void notrace
+static notrace void sync_time_offset(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *prev_array, *array;
+	struct trace_entry *prev_entry, *entry;
+	cycle_t prev_t, t;
+
+	entry = iter->ent;
+	prev_entry = iter->prev_ent;
+	if (!prev_entry)
+		return;
+
+	prev_array = iter->tr->data[iter->prev_cpu];
+	array = iter->tr->data[iter->cpu];
+
+	prev_t = prev_entry->t + prev_array->time_offset;
+	t = entry->t + array->time_offset;
+
+	/*
+	 * If time goes backwards we increase the offset of
+	 * the current array, to not have observable time warps.
+	 * This will quickly synchronize the time offsets of
+	 * multiple CPUs:
+	 */
+	if (t < prev_t)
+		array->time_offset += prev_t - t;
+}
+
+static notrace void
 print_trace_fmt(struct seq_file *m, struct trace_iterator *iter)
 {
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
-	struct trace_entry *entry = iter->ent;
+	struct trace_entry *entry;
 	unsigned long usec_rem;
 	unsigned long long t;
 	unsigned long secs;
 	char *comm;
 	int S;
 
+	sync_time_offset(iter);
+	entry = iter->ent;
+
 	comm = trace_find_cmdline(iter->ent->pid);
 
-	t = ns2usecs(entry->t);
+	t = ns2usecs(entry->t + iter->tr->data[iter->cpu]->time_offset);
 	usec_rem = do_div(t, 1000000ULL);
 	secs = (unsigned long)t;
 
@@ -1158,7 +1194,7 @@ static int tracing_lt_open(struct inode *inode, struct file *file)
 }
 
 
-static void notrace *
+static notrace void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct tracer *t = m->private;
@@ -1374,8 +1410,7 @@ tracing_ctrl_read(struct file *filp, char __user *ubuf,
 	int r;
 
 	r = sprintf(buf, "%ld\n", tr->ctrl);
-	return simple_read_from_buffer(ubuf, cnt, ppos,
-				       buf, r);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc1d34b8b771..5df8ff2b84a7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -56,6 +56,8 @@ struct trace_array_cpu {
 	void			*trace_current;
 	struct list_head	trace_pages;
 	atomic_t		disabled;
+	cycle_t			time_offset;
+
 	/* these fields get copied into max-trace: */
 	unsigned		trace_current_idx;
 	unsigned long		trace_idx;
@@ -114,14 +116,19 @@ struct tracer {
 struct trace_iterator {
 	struct trace_array	*tr;
 	struct tracer		*trace;
+
 	struct trace_entry	*ent;
+	int			cpu;
+
+	struct trace_entry	*prev_ent;
+	int			prev_cpu;
+
 	unsigned long		iter_flags;
 	loff_t			pos;
 	unsigned long		next_idx[NR_CPUS];
 	struct list_head	*next_page[NR_CPUS];
 	unsigned		next_page_idx[NR_CPUS];
 	long			idx;
-	int			cpu;
 };
 
 void notrace tracing_reset(struct trace_array_cpu *data);
-- 
cgit v1.2.3


From 08bafa0efcf29fe18ec39c2147077b597368b018 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:45 +0200
Subject: ftrace: disable all tracers on corrupted buffer

If the trace buffer is detected to be corrupted, then we
disable all tracers.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_selftest.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 4c8a1b2d8231..a6f1ed75f836 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -67,6 +67,8 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
 	return 0;
 
  failed:
+	/* disable tracing */
+	tracing_disabled = 1;
 	printk(KERN_CONT ".. corrupted trace buffer .. ");
 	return -1;
 }
-- 
cgit v1.2.3


From 1d4db00a5e30c7b8f8dc2a1b19e886fd942be143 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:45 +0200
Subject: ftrace: reset selftests

The tests may leave stuff in the buffers. This resets the buffers
after each test is run. If a test fails, it does not reset the
buffer to avoid touching a buffer that is corrupted.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8755a4370484..6580e7ed04be 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -289,6 +289,13 @@ int register_tracer(struct tracer *type)
 			printk(KERN_CONT "FAILED!\n");
 			goto out;
 		}
+		/* Only reset on passing, to avoid touching corrupted buffers */
+		for_each_possible_cpu(i) {
+			data = tr->data[i];
+			if (!head_page(data))
+				continue;
+			tracing_reset(data);
+		}
 		printk(KERN_CONT "PASSED\n");
 	}
 #endif
-- 
cgit v1.2.3


From 93a588f459da134be6ab17c4104e28441beb0d22 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:45 +0200
Subject: ftrace: change buffers to producer consumer

This patch changes the way the CPU trace buffers are handled.
Instead of always starting from the trace page head, the logic
is changed to a producer consumer logic. This allows for the
buffers to be drained while they are alive.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 103 ++++++++++++++++++++++++++++++---------------------
 kernel/trace/trace.h |   6 ++-
 2 files changed, 65 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6580e7ed04be..777b859e1c2e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -176,10 +176,9 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
 
 	INIT_LIST_HEAD(&flip_pages);
 
-	tr1->trace_current = NULL;
-	memcpy(&tr1->trace_current_idx, &tr2->trace_current_idx,
+	memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
 		sizeof(struct trace_array_cpu) -
-		offsetof(struct trace_array_cpu, trace_current_idx));
+		offsetof(struct trace_array_cpu, trace_head_idx));
 
 	check_pages(tr1);
 	check_pages(tr2);
@@ -228,7 +227,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		tracing_reset(max_tr.data[i]);
 
 	flip_trace(max_tr.data[cpu], data);
-
 	tracing_reset(data);
 
 	__update_max_tr(tr, tsk, cpu);
@@ -343,9 +341,9 @@ void unregister_tracer(struct tracer *type)
 notrace void tracing_reset(struct trace_array_cpu *data)
 {
 	data->trace_idx = 0;
-	data->trace_current = head_page(data);
-	data->trace_current_idx = 0;
-	data->time_offset = 0;
+	data->trace_head = data->trace_tail = head_page(data);
+	data->trace_head_idx = 0;
+	data->trace_tail_idx = 0;
 }
 
 #ifdef CONFIG_FTRACE
@@ -470,38 +468,65 @@ notrace void tracing_record_cmdline(struct task_struct *tsk)
 	trace_save_cmdline(tsk);
 }
 
+static inline notrace struct list_head *
+trace_next_list(struct trace_array_cpu *data, struct list_head *next)
+{
+	/*
+	 * Roundrobin - but skip the head (which is not a real page):
+	 */
+	next = next->next;
+	if (unlikely(next == &data->trace_pages))
+		next = next->next;
+	BUG_ON(next == &data->trace_pages);
+
+	return next;
+}
+
+static inline notrace void *
+trace_next_page(struct trace_array_cpu *data, void *addr)
+{
+	struct list_head *next;
+	struct page *page;
+
+	page = virt_to_page(addr);
+
+	next = trace_next_list(data, &page->lru);
+	page = list_entry(next, struct page, lru);
+
+	return page_address(page);
+}
+
 static inline notrace struct trace_entry *
 tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
 {
 	unsigned long idx, idx_next;
 	struct trace_entry *entry;
-	struct list_head *next;
-	struct page *page;
 
 	data->trace_idx++;
-	idx = data->trace_current_idx;
+	idx = data->trace_head_idx;
 	idx_next = idx + 1;
 
 	BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
 
-	entry = data->trace_current + idx * TRACE_ENTRY_SIZE;
+	entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
 
 	if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
-		page = virt_to_page(data->trace_current);
-		/*
-		 * Roundrobin - but skip the head (which is not a real page):
-		 */
-		next = page->lru.next;
-		if (unlikely(next == &data->trace_pages))
-			next = next->next;
-		BUG_ON(next == &data->trace_pages);
-
-		page = list_entry(next, struct page, lru);
-		data->trace_current = page_address(page);
+		data->trace_head = trace_next_page(data, data->trace_head);
 		idx_next = 0;
 	}
 
-	data->trace_current_idx = idx_next;
+	if (data->trace_head == data->trace_tail &&
+	    idx_next == data->trace_tail_idx) {
+		/* overrun */
+		data->trace_tail_idx++;
+		if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+			data->trace_tail =
+				trace_next_page(data, data->trace_tail);
+			data->trace_tail_idx = 0;
+		}
+	}
+
+	data->trace_head_idx = idx_next;
 
 	return entry;
 }
@@ -571,21 +596,11 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 		return NULL;
 
 	if (!iter->next_page[cpu]) {
-		/*
-		 * Initialize. If the count of elements in
-		 * this buffer is greater than the max entries
-		 * we had an underrun. Which means we looped around.
-		 * We can simply use the current pointer as our
-		 * starting point.
-		 */
-		if (data->trace_idx >= tr->entries) {
-			page = virt_to_page(data->trace_current);
-			iter->next_page[cpu] = &page->lru;
-			iter->next_page_idx[cpu] = data->trace_current_idx;
-		} else {
-			iter->next_page[cpu] = data->trace_pages.next;
-			iter->next_page_idx[cpu] = 0;
-		}
+		/* Initialize the iterator for this cpu trace buffer */
+		WARN_ON(!data->trace_tail);
+		page = virt_to_page(data->trace_tail);
+		iter->next_page[cpu] = &page->lru;
+		iter->next_page_idx[cpu] = data->trace_tail_idx;
 	}
 
 	page = list_entry(iter->next_page[cpu], struct page, lru);
@@ -593,6 +608,12 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 
 	array = page_address(page);
 
+	/* Still possible to catch up to the tail */
+	if (iter->next_idx[cpu] && array == data->trace_tail &&
+	    iter->next_page_idx[cpu] == data->trace_tail_idx)
+		return NULL;
+
+	WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
 	return &array[iter->next_page_idx[cpu]];
 }
 
@@ -638,10 +659,8 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
 
 			iter->next_page_idx[next_cpu] = 0;
 			iter->next_page[next_cpu] =
-				iter->next_page[next_cpu]->next;
-			if (iter->next_page[next_cpu] == &data->trace_pages)
-				iter->next_page[next_cpu] =
-					data->trace_pages.next;
+			     trace_next_list(data, iter->next_page[next_cpu]);
+
 		}
 	}
 	iter->prev_ent = iter->ent;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5df8ff2b84a7..0ce127455b4b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -53,13 +53,15 @@ struct trace_entry {
  * the trace, etc.)
  */
 struct trace_array_cpu {
-	void			*trace_current;
 	struct list_head	trace_pages;
 	atomic_t		disabled;
 	cycle_t			time_offset;
 
 	/* these fields get copied into max-trace: */
-	unsigned		trace_current_idx;
+	unsigned		trace_head_idx;
+	unsigned		trace_tail_idx;
+	void			*trace_head; /* producer */
+	void			*trace_tail; /* consumer */
 	unsigned long		trace_idx;
 	unsigned long		saved_latency;
 	unsigned long		critical_start;
-- 
cgit v1.2.3


From 214023c3d13a71525e463b5e54e360b926b4dc90 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:46 +0200
Subject: ftrace: add a buffer for output

Later patches will need to print the same things as the seq output
does. But those outputs will not use the seq utility. This patch
adds a buffer to the iterator, that can be used by either the
seq utility or other output.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 202 ++++++++++++++++++++++++++++++++++-----------------
 kernel/trace/trace.h |   6 ++
 2 files changed, 140 insertions(+), 68 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 777b859e1c2e..d39f4faec7c3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -169,6 +169,66 @@ void *head_page(struct trace_array_cpu *data)
 	return page_address(page);
 }
 
+static notrace int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	va_list ap;
+
+	if (!len)
+		return 0;
+
+	va_start(ap, fmt);
+	len = vsnprintf(s->buffer + s->len, len, fmt, ap);
+	va_end(ap);
+
+	s->len += len;
+
+	return len;
+}
+
+static notrace int
+trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	int len = strlen(str);
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		len = (PAGE_SIZE - 1) - s->len;
+
+	memcpy(s->buffer + s->len, str, len);
+	s->len += len;
+
+	return len;
+}
+
+static notrace int
+trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+
+	s->buffer[s->len++] = c;
+
+	return 1;
+}
+
+static notrace void
+trace_seq_reset(struct trace_seq *s)
+{
+	s->len = 0;
+}
+
+static notrace void
+trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+	s->buffer[len] = 0;
+	seq_puts(m, s->buffer);
+
+	trace_seq_reset(s);
+}
+
 notrace static void
 flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
 {
@@ -756,25 +816,26 @@ static void s_stop(struct seq_file *m, void *p)
 }
 
 static void
-seq_print_sym_short(struct seq_file *m, const char *fmt, unsigned long address)
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
 {
 #ifdef CONFIG_KALLSYMS
 	char str[KSYM_SYMBOL_LEN];
 
 	kallsyms_lookup(address, NULL, NULL, NULL, str);
 
-	seq_printf(m, fmt, str);
+	trace_seq_printf(s, fmt, str);
 #endif
 }
 
 static void
-seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address)
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+		     unsigned long address)
 {
 #ifdef CONFIG_KALLSYMS
 	char str[KSYM_SYMBOL_LEN];
 
 	sprint_symbol(str, address);
-	seq_printf(m, fmt, str);
+	trace_seq_printf(s, fmt, str);
 #endif
 }
 
@@ -785,20 +846,20 @@ seq_print_sym_offset(struct seq_file *m, const char *fmt, unsigned long address)
 #endif
 
 static notrace void
-seq_print_ip_sym(struct seq_file *m, unsigned long ip, unsigned long sym_flags)
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 {
 	if (!ip) {
-		seq_printf(m, "0");
+		trace_seq_printf(s, "0");
 		return;
 	}
 
 	if (sym_flags & TRACE_ITER_SYM_OFFSET)
-		seq_print_sym_offset(m, "%s", ip);
+		seq_print_sym_offset(s, "%s", ip);
 	else
-		seq_print_sym_short(m, "%s", ip);
+		seq_print_sym_short(s, "%s", ip);
 
 	if (sym_flags & TRACE_ITER_SYM_ADDR)
-		seq_printf(m, " <" IP_FMT ">", ip);
+		trace_seq_printf(s, " <" IP_FMT ">", ip);
 }
 
 static notrace void print_lat_help_header(struct seq_file *m)
@@ -881,9 +942,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 
 	if (data->critical_start) {
 		seq_puts(m, " => started at: ");
-		seq_print_ip_sym(m, data->critical_start, sym_flags);
+		seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
+		trace_print_seq(m, &iter->seq);
 		seq_puts(m, "\n => ended at:   ");
-		seq_print_ip_sym(m, data->critical_end, sym_flags);
+		seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
+		trace_print_seq(m, &iter->seq);
 		seq_puts(m, "\n");
 	}
 
@@ -891,61 +954,61 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 }
 
 static notrace void
-lat_print_generic(struct seq_file *m, struct trace_entry *entry, int cpu)
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
 	char *comm;
 
 	comm = trace_find_cmdline(entry->pid);
 
-	seq_printf(m, "%8.8s-%-5d ", comm, entry->pid);
-	seq_printf(m, "%d", cpu);
-	seq_printf(m, "%c%c",
-		   (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
-		   ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+	trace_seq_printf(s, "%d", cpu);
+	trace_seq_printf(s, "%c%c",
+			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
 
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
 	if (hardirq && softirq)
-		seq_putc(m, 'H');
+		trace_seq_putc(s, 'H');
 	else {
 		if (hardirq)
-			seq_putc(m, 'h');
+			trace_seq_putc(s, 'h');
 		else {
 			if (softirq)
-				seq_putc(m, 's');
+				trace_seq_putc(s, 's');
 			else
-				seq_putc(m, '.');
+				trace_seq_putc(s, '.');
 		}
 	}
 
 	if (entry->preempt_count)
-		seq_printf(m, "%x", entry->preempt_count);
+		trace_seq_printf(s, "%x", entry->preempt_count);
 	else
-		seq_puts(m, ".");
+		trace_seq_puts(s, ".");
 }
 
 unsigned long preempt_mark_thresh = 100;
 
 static notrace void
-lat_print_timestamp(struct seq_file *m, unsigned long long abs_usecs,
+lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
 		    unsigned long rel_usecs)
 {
-	seq_printf(m, " %4lldus", abs_usecs);
+	trace_seq_printf(s, " %4lldus", abs_usecs);
 	if (rel_usecs > preempt_mark_thresh)
-		seq_puts(m, "!: ");
+		trace_seq_puts(s, "!: ");
 	else if (rel_usecs > 1)
-		seq_puts(m, "+: ");
+		trace_seq_puts(s, "+: ");
 	else
-		seq_puts(m, " : ");
+		trace_seq_puts(s, " : ");
 }
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
 static notrace void
-print_lat_fmt(struct seq_file *m, struct trace_iterator *iter,
-	      unsigned int trace_idx, int cpu)
+print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 {
+	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *next_entry = find_next_entry(iter, NULL);
 	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
@@ -962,39 +1025,40 @@ print_lat_fmt(struct seq_file *m, struct trace_iterator *iter,
 
 	if (verbose) {
 		comm = trace_find_cmdline(entry->pid);
-		seq_printf(m, "%16s %5d %d %d %08x %08x [%08lx]"
-			   " %ld.%03ldms (+%ld.%03ldms): ",
-			   comm,
-			   entry->pid, cpu, entry->flags,
-			   entry->preempt_count, trace_idx,
-			   ns2usecs(entry->t),
-			   abs_usecs/1000,
-			   abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000);
+		trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
+				 " %ld.%03ldms (+%ld.%03ldms): ",
+				 comm,
+				 entry->pid, cpu, entry->flags,
+				 entry->preempt_count, trace_idx,
+				 ns2usecs(entry->t),
+				 abs_usecs/1000,
+				 abs_usecs % 1000, rel_usecs/1000,
+				 rel_usecs % 1000);
 	} else {
-		lat_print_generic(m, entry, cpu);
-		lat_print_timestamp(m, abs_usecs, rel_usecs);
+		lat_print_generic(s, entry, cpu);
+		lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}
 	switch (entry->type) {
 	case TRACE_FN:
-		seq_print_ip_sym(m, entry->fn.ip, sym_flags);
-		seq_puts(m, " (");
-		seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags);
-		seq_puts(m, ")\n");
+		seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		trace_seq_puts(s, " (");
+		seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+		trace_seq_puts(s, ")\n");
 		break;
 	case TRACE_CTX:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
 		comm = trace_find_cmdline(entry->ctx.next_pid);
-		seq_printf(m, " %d:%d:%c --> %d:%d %s\n",
-			   entry->ctx.prev_pid,
-			   entry->ctx.prev_prio,
-			   S,
-			   entry->ctx.next_pid,
-			   entry->ctx.next_prio,
-			   comm);
+		trace_seq_printf(s, " %d:%d:%c --> %d:%d %s\n",
+				 entry->ctx.prev_pid,
+				 entry->ctx.prev_prio,
+				 S,
+				 entry->ctx.next_pid,
+				 entry->ctx.next_prio,
+				 comm);
 		break;
 	default:
-		seq_printf(m, "Unknown type %d\n", entry->type);
+		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
 }
 
@@ -1026,8 +1090,9 @@ static notrace void sync_time_offset(struct trace_iterator *iter)
 }
 
 static notrace void
-print_trace_fmt(struct seq_file *m, struct trace_iterator *iter)
+print_trace_fmt(struct trace_iterator *iter)
 {
+	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *entry;
 	unsigned long usec_rem;
@@ -1045,29 +1110,29 @@ print_trace_fmt(struct seq_file *m, struct trace_iterator *iter)
 	usec_rem = do_div(t, 1000000ULL);
 	secs = (unsigned long)t;
 
-	seq_printf(m, "%16s-%-5d ", comm, entry->pid);
-	seq_printf(m, "[%02d] ", iter->cpu);
-	seq_printf(m, "%5lu.%06lu: ", secs, usec_rem);
+	trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+	trace_seq_printf(s, "[%02d] ", iter->cpu);
+	trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
 
 	switch (entry->type) {
 	case TRACE_FN:
-		seq_print_ip_sym(m, entry->fn.ip, sym_flags);
+		seq_print_ip_sym(s, entry->fn.ip, sym_flags);
 		if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
 						entry->fn.parent_ip) {
-			seq_printf(m, " <-");
-			seq_print_ip_sym(m, entry->fn.parent_ip, sym_flags);
+			trace_seq_printf(s, " <-");
+			seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
 		}
-		seq_printf(m, "\n");
+		trace_seq_printf(s, "\n");
 		break;
 	case TRACE_CTX:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
-		seq_printf(m, " %d:%d:%c ==> %d:%d\n",
-			   entry->ctx.prev_pid,
-			   entry->ctx.prev_prio,
-			   S,
-			   entry->ctx.next_pid,
-			   entry->ctx.next_prio);
+		trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n",
+				 entry->ctx.prev_pid,
+				 entry->ctx.prev_prio,
+				 S,
+				 entry->ctx.next_pid,
+				 entry->ctx.next_prio);
 		break;
 	}
 }
@@ -1108,9 +1173,10 @@ static int s_show(struct seq_file *m, void *v)
 		}
 	} else {
 		if (iter->iter_flags & TRACE_FILE_LAT_FMT)
-			print_lat_fmt(m, iter, iter->idx, iter->cpu);
+			print_lat_fmt(iter, iter->idx, iter->cpu);
 		else
-			print_trace_fmt(m, iter);
+			print_trace_fmt(iter);
+		trace_print_seq(m, &iter->seq);
 	}
 
 	return 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0ce127455b4b..f5b32ca0b457 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -111,11 +111,17 @@ struct tracer {
 	int			print_max;
 };
 
+struct trace_seq {
+	unsigned char		buffer[PAGE_SIZE];
+	unsigned int		len;
+};
+
 /*
  * Trace iterator - used by printout routines who present trace
  * results to users and which routines might sleep, etc:
  */
 struct trace_iterator {
+	struct trace_seq	seq;
 	struct trace_array	*tr;
 	struct tracer		*trace;
 
-- 
cgit v1.2.3


From b3806b4316306dc9c542eff6c23d7d42918f3504 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:46 +0200
Subject: ftrace: user run time file reading

This patch creates a file called trace_pipe in the tracing
debug directory. This file is a consumer of the trace buffers.
This means that reads of this file consumes the entries from
the trace buffers so that they will not be read a second time,
as contrast to the static buffers latency_trace and trace.

Reading from the trace_pipe will remove the entries from trace
and latency_trace too.

The advantage that trace_pipe has is that it can record live
traces. It will block when there is nothing in the buffer,
and read the entries as they are entered.  An EOF happens when
tracing is disabled (tracing_enabled = 0).

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 353 ++++++++++++++++++++++++++++++++++++++++++++-------
 kernel/trace/trace.h |   2 +
 2 files changed, 309 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d39f4faec7c3..a40687a4413a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -174,15 +174,20 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
 	va_list ap;
+	int ret;
 
 	if (!len)
 		return 0;
 
 	va_start(ap, fmt);
-	len = vsnprintf(s->buffer + s->len, len, fmt, ap);
+	ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
 	va_end(ap);
 
-	s->len += len;
+	/* If we can't write it all, don't bother writing anything */
+	if (ret > len)
+		return 0;
+
+	s->len += ret;
 
 	return len;
 }
@@ -193,7 +198,7 @@ trace_seq_puts(struct trace_seq *s, const char *str)
 	int len = strlen(str);
 
 	if (len > ((PAGE_SIZE - 1) - s->len))
-		len = (PAGE_SIZE - 1) - s->len;
+		return 0;
 
 	memcpy(s->buffer + s->len, str, len);
 	s->len += len;
@@ -615,11 +620,13 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 {
 	struct trace_entry *entry;
 
+	spin_lock(&data->lock);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
 	entry->type		= TRACE_FN;
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
+	spin_unlock(&data->lock);
 }
 
 notrace void
@@ -630,6 +637,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 {
 	struct trace_entry *entry;
 
+	spin_lock(&data->lock);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
 	entry->type		= TRACE_CTX;
@@ -638,6 +646,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.prev_state	= prev->state;
 	entry->ctx.next_pid	= next->pid;
 	entry->ctx.next_prio	= next->prio;
+	spin_unlock(&data->lock);
 }
 
 enum trace_file_type {
@@ -652,7 +661,9 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 	struct trace_entry *array;
 
 	if (iter->next_idx[cpu] >= tr->entries ||
-	    iter->next_idx[cpu] >= data->trace_idx)
+	    iter->next_idx[cpu] >= data->trace_idx ||
+	    (data->trace_head == data->trace_tail &&
+	     data->trace_head_idx == data->trace_tail_idx))
 		return NULL;
 
 	if (!iter->next_page[cpu]) {
@@ -702,33 +713,57 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 	return next;
 }
 
-static void *find_next_entry_inc(struct trace_iterator *iter)
+static notrace void
+trace_iterator_increment(struct trace_iterator *iter)
 {
-	struct trace_entry *next;
-	int next_cpu = -1;
+	iter->idx++;
+	iter->next_idx[iter->cpu]++;
+	iter->next_page_idx[iter->cpu]++;
+	if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
+		struct trace_array_cpu *data = iter->tr->data[iter->cpu];
 
-	next = find_next_entry(iter, &next_cpu);
+		iter->next_page_idx[iter->cpu] = 0;
+		iter->next_page[iter->cpu] =
+			trace_next_list(data, iter->next_page[iter->cpu]);
+	}
+}
 
-	if (next) {
-		iter->idx++;
-		iter->next_idx[next_cpu]++;
-		iter->next_page_idx[next_cpu]++;
+static notrace void
+trace_consume(struct trace_iterator *iter)
+{
+	struct trace_array_cpu *data = iter->tr->data[iter->cpu];
+
+	data->trace_tail_idx++;
+	if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+		data->trace_tail = trace_next_page(data, data->trace_tail);
+		data->trace_tail_idx = 0;
+	}
 
-		if (iter->next_page_idx[next_cpu] >= ENTRIES_PER_PAGE) {
-			struct trace_array_cpu *data = iter->tr->data[next_cpu];
+	/* Check if we empty it, then reset the index */
+	if (data->trace_head == data->trace_tail &&
+	    data->trace_head_idx == data->trace_tail_idx)
+		data->trace_idx = 0;
 
-			iter->next_page_idx[next_cpu] = 0;
-			iter->next_page[next_cpu] =
-			     trace_next_list(data, iter->next_page[next_cpu]);
+	trace_iterator_increment(iter);
+}
+
+static notrace void *
+find_next_entry_inc(struct trace_iterator *iter)
+{
+	struct trace_entry *next;
+	int next_cpu = -1;
+
+	next = find_next_entry(iter, &next_cpu);
 
-		}
-	}
 	iter->prev_ent = iter->ent;
 	iter->prev_cpu = iter->cpu;
 
 	iter->ent = next;
 	iter->cpu = next_cpu;
 
+	if (next)
+		trace_iterator_increment(iter);
+
 	return next ? iter : NULL;
 }
 
@@ -815,7 +850,7 @@ static void s_stop(struct seq_file *m, void *p)
 	mutex_unlock(&trace_types_lock);
 }
 
-static void
+static int
 seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
 {
 #ifdef CONFIG_KALLSYMS
@@ -823,11 +858,12 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
 
 	kallsyms_lookup(address, NULL, NULL, NULL, str);
 
-	trace_seq_printf(s, fmt, str);
+	return trace_seq_printf(s, fmt, str);
 #endif
+	return 1;
 }
 
-static void
+static int
 seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 		     unsigned long address)
 {
@@ -835,8 +871,9 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 	char str[KSYM_SYMBOL_LEN];
 
 	sprint_symbol(str, address);
-	trace_seq_printf(s, fmt, str);
+	return trace_seq_printf(s, fmt, str);
 #endif
+	return 1;
 }
 
 #ifndef CONFIG_64BIT
@@ -845,21 +882,25 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 # define IP_FMT "%016lx"
 #endif
 
-static notrace void
+static notrace int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 {
-	if (!ip) {
-		trace_seq_printf(s, "0");
-		return;
-	}
+	int ret;
+
+	if (!ip)
+		return trace_seq_printf(s, "0");
 
 	if (sym_flags & TRACE_ITER_SYM_OFFSET)
-		seq_print_sym_offset(s, "%s", ip);
+		ret = seq_print_sym_offset(s, "%s", ip);
 	else
-		seq_print_sym_short(s, "%s", ip);
+		ret = seq_print_sym_short(s, "%s", ip);
+
+	if (!ret)
+		return 0;
 
 	if (sym_flags & TRACE_ITER_SYM_ADDR)
-		trace_seq_printf(s, " <" IP_FMT ">", ip);
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
 }
 
 static notrace void print_lat_help_header(struct seq_file *m)
@@ -1089,7 +1130,7 @@ static notrace void sync_time_offset(struct trace_iterator *iter)
 		array->time_offset += prev_t - t;
 }
 
-static notrace void
+static notrace int
 print_trace_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1100,6 +1141,7 @@ print_trace_fmt(struct trace_iterator *iter)
 	unsigned long secs;
 	char *comm;
 	int S;
+	int ret;
 
 	sync_time_offset(iter);
 	entry = iter->ent;
@@ -1110,31 +1152,49 @@ print_trace_fmt(struct trace_iterator *iter)
 	usec_rem = do_div(t, 1000000ULL);
 	secs = (unsigned long)t;
 
-	trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
-	trace_seq_printf(s, "[%02d] ", iter->cpu);
-	trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+	if (!ret)
+		return 0;
 
 	switch (entry->type) {
 	case TRACE_FN:
-		seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+		if (!ret)
+			return 0;
 		if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
 						entry->fn.parent_ip) {
-			trace_seq_printf(s, " <-");
-			seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+			ret = trace_seq_printf(s, " <-");
+			if (!ret)
+				return 0;
+			ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+					       sym_flags);
+			if (!ret)
+				return 0;
 		}
-		trace_seq_printf(s, "\n");
+		ret = trace_seq_printf(s, "\n");
+		if (!ret)
+			return 0;
 		break;
 	case TRACE_CTX:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
-		trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n",
-				 entry->ctx.prev_pid,
-				 entry->ctx.prev_prio,
-				 S,
-				 entry->ctx.next_pid,
-				 entry->ctx.next_prio);
+		ret = trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n",
+				       entry->ctx.prev_pid,
+				       entry->ctx.prev_prio,
+				       S,
+				       entry->ctx.next_pid,
+				       entry->ctx.next_prio);
+		if (!ret)
+			return 0;
 		break;
 	}
+	return 1;
 }
 
 static int trace_empty(struct trace_iterator *iter)
@@ -1145,7 +1205,9 @@ static int trace_empty(struct trace_iterator *iter)
 	for_each_possible_cpu(cpu) {
 		data = iter->tr->data[cpu];
 
-		if (head_page(data) && data->trace_idx)
+		if (head_page(data) && data->trace_idx &&
+		    (data->trace_tail != data->trace_head ||
+		     data->trace_tail_idx != data->trace_head_idx))
 			return 0;
 	}
 	return 1;
@@ -1645,6 +1707,192 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static atomic_t tracing_reader;
+
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+	struct trace_iterator *iter;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	/* We only allow for reader of the pipe */
+	if (atomic_inc_return(&tracing_reader) != 1) {
+		atomic_dec(&tracing_reader);
+		return -EBUSY;
+	}
+
+	/* create a buffer to store the information to pass to userspace */
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
+	iter->tr = &global_trace;
+
+	filp->private_data = iter;
+
+	return 0;
+}
+
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+	struct trace_iterator *iter = file->private_data;
+
+	kfree(iter);
+	atomic_dec(&tracing_reader);
+
+	return 0;
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_iterator *iter = filp->private_data;
+	struct trace_array_cpu *data;
+	static cpumask_t mask;
+	struct trace_entry *entry;
+	static int start;
+	unsigned long flags;
+	int read = 0;
+	int cpu;
+	int len;
+	int ret;
+
+	/* return any leftover data */
+	if (iter->seq.len > start) {
+		len = iter->seq.len - start;
+		if (cnt > len)
+			cnt = len;
+		ret = copy_to_user(ubuf, iter->seq.buffer + start, cnt);
+		if (ret)
+			cnt = -EFAULT;
+
+		start += len;
+
+		return cnt;
+	}
+
+	trace_seq_reset(&iter->seq);
+	start = 0;
+
+	while (trace_empty(iter)) {
+		/*
+		 * This is a make-shift waitqueue. The reason we don't use
+		 * an actual wait queue is because:
+		 *  1) we only ever have one waiter
+		 *  2) the tracing, traces all functions, we don't want
+		 *     the overhead of calling wake_up and friends
+		 *     (and tracing them too)
+		 *     Anyway, this is really very primitive wakeup.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		iter->tr->waiter = current;
+
+		/* sleep for one second, and try again. */
+		schedule_timeout(HZ);
+
+		iter->tr->waiter = NULL;
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		/*
+		 * We block until we read something and tracing is disabled.
+		 * We still block if tracing is disabled, but we have never
+		 * read anything. This allows a user to cat this file, and
+		 * then enable tracing. But after we have read something,
+		 * we give an EOF when tracing is again disabled.
+		 *
+		 * iter->pos will be 0 if we haven't read anything.
+		 */
+		if (!tracer_enabled && iter->pos)
+			break;
+
+		continue;
+	}
+
+	/* stop when tracing is finished */
+	if (trace_empty(iter))
+		return 0;
+
+	if (cnt >= PAGE_SIZE)
+		cnt = PAGE_SIZE - 1;
+
+	memset(iter, 0, sizeof(*iter));
+	iter->tr = &global_trace;
+	iter->pos = -1;
+
+	/*
+	 * We need to stop all tracing on all CPUS to read the
+	 * the next buffer. This is a bit expensive, but is
+	 * not done often. We fill all what we can read,
+	 * and then release the locks again.
+	 */
+
+	cpus_clear(mask);
+	local_irq_save(flags);
+	for_each_possible_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (!head_page(data) || !data->trace_idx)
+			continue;
+
+		atomic_inc(&data->disabled);
+		spin_lock(&data->lock);
+		cpu_set(cpu, mask);
+	}
+
+	while ((entry = find_next_entry(iter, &cpu))) {
+
+		if (!entry)
+			break;
+
+		iter->ent = entry;
+		iter->cpu = cpu;
+
+		ret = print_trace_fmt(iter);
+		if (!ret)
+			break;
+
+		trace_consume(iter);
+
+		if (iter->seq.len >= cnt)
+			break;
+
+	}
+
+	for_each_possible_cpu(cpu) {
+		data = iter->tr->data[cpu];
+
+		if (!cpu_isset(cpu, mask))
+			continue;
+		spin_unlock(&data->lock);
+		atomic_dec(&data->disabled);
+	}
+	local_irq_restore(flags);
+
+	/* Now copy what we have to the user */
+	read = iter->seq.len;
+	if (read > cnt)
+		read = cnt;
+
+	ret = copy_to_user(ubuf, iter->seq.buffer, read);
+
+	if (read < iter->seq.len)
+		start = read;
+	else
+		trace_seq_reset(&iter->seq);
+
+	if (ret)
+		read = -EFAULT;
+
+	return read;
+}
+
 static struct file_operations tracing_max_lat_fops = {
 	.open = tracing_open_generic,
 	.read = tracing_max_lat_read,
@@ -1663,6 +1911,12 @@ static struct file_operations set_tracer_fops = {
 	.write = tracing_set_trace_write,
 };
 
+static struct file_operations tracing_pipe_fops = {
+	.open = tracing_open_pipe,
+	.read = tracing_read_pipe,
+	.release = tracing_release_pipe,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 static ssize_t
@@ -1763,6 +2017,11 @@ static __init void tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'README' entry\n");
 
+	entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
+				    NULL, &tracing_pipe_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
@@ -1816,6 +2075,7 @@ static int trace_alloc_page(void)
 	/* Now that we successfully allocate a page per CPU, add them */
 	for_each_possible_cpu(i) {
 		data = global_trace.data[i];
+		spin_lock_init(&data->lock);
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
@@ -1823,6 +2083,7 @@ static int trace_alloc_page(void)
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 		data = max_tr.data[i];
+		spin_lock_init(&data->lock);
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f5b32ca0b457..29a7ea59de50 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -55,6 +55,7 @@ struct trace_entry {
 struct trace_array_cpu {
 	struct list_head	trace_pages;
 	atomic_t		disabled;
+	spinlock_t		lock;
 	cycle_t			time_offset;
 
 	/* these fields get copied into max-trace: */
@@ -88,6 +89,7 @@ struct trace_array {
 	long			ctrl;
 	int			cpu;
 	cycle_t			time_start;
+	struct task_struct	*waiter;
 	struct trace_array_cpu	*data[NR_CPUS];
 };
 
-- 
cgit v1.2.3


From d4c5a2f5870939d837293de87b41dda0012a4572 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:46 +0200
Subject: ftrace: fix locking

we can hold all cpu trace buffer locks at once - put each into a
separate lock class.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 8 +++-----
 kernel/trace/trace.h | 1 +
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a40687a4413a..b3811ca74071 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1865,11 +1865,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	}
 
-	for_each_possible_cpu(cpu) {
+	for_each_cpu_mask(cpu, mask) {
 		data = iter->tr->data[cpu];
-
-		if (!cpu_isset(cpu, mask))
-			continue;
 		spin_unlock(&data->lock);
 		atomic_dec(&data->disabled);
 	}
@@ -2076,6 +2073,7 @@ static int trace_alloc_page(void)
 	for_each_possible_cpu(i) {
 		data = global_trace.data[i];
 		spin_lock_init(&data->lock);
+		lockdep_set_class(&data->lock, &data->lock_key);
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
@@ -2084,6 +2082,7 @@ static int trace_alloc_page(void)
 #ifdef CONFIG_TRACER_MAX_TRACE
 		data = max_tr.data[i];
 		spin_lock_init(&data->lock);
+		lockdep_set_class(&data->lock, &data->lock_key);
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
@@ -2203,5 +2202,4 @@ __init static int tracer_alloc_buffers(void)
 	}
 	return ret;
 }
-
 fs_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 29a7ea59de50..b0408356f0e0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -56,6 +56,7 @@ struct trace_array_cpu {
 	struct list_head	trace_pages;
 	atomic_t		disabled;
 	spinlock_t		lock;
+	struct lock_class_key	lock_key;
 	cycle_t			time_offset;
 
 	/* these fields get copied into max-trace: */
-- 
cgit v1.2.3


From 4bf39a9411a4ce8712954e03a9bd1592ee345919 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:46 +0200
Subject: ftrace: cleanups

no code changed.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c |  5 +++-
 kernel/trace/trace.c  | 72 +++++++++++++++++++++++++--------------------------
 2 files changed, 39 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5e9389faaf75..97c40865a93e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -756,9 +756,11 @@ ftrace_avail_open(struct inode *inode, struct file *file)
 	ret = seq_open(file, &show_ftrace_seq_ops);
 	if (!ret) {
 		struct seq_file *m = file->private_data;
+
 		m->private = iter;
-	} else
+	} else {
 		kfree(iter);
+	}
 
 	return ret;
 }
@@ -770,6 +772,7 @@ int ftrace_avail_release(struct inode *inode, struct file *file)
 
 	seq_release(inode, file);
 	kfree(iter);
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b3811ca74071..4550afda9607 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1245,10 +1245,10 @@ static int s_show(struct seq_file *m, void *v)
 }
 
 static struct seq_operations tracer_seq_ops = {
-	.start = s_start,
-	.next = s_next,
-	.stop = s_stop,
-	.show = s_show,
+	.start		= s_start,
+	.next		= s_next,
+	.stop		= s_stop,
+	.show		= s_show,
 };
 
 static struct trace_iterator notrace *
@@ -1397,10 +1397,10 @@ static int t_show(struct seq_file *m, void *v)
 }
 
 static struct seq_operations show_traces_seq_ops = {
-	.start = t_start,
-	.next = t_next,
-	.stop = t_stop,
-	.show = t_show,
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
 };
 
 static int show_traces_open(struct inode *inode, struct file *file)
@@ -1420,17 +1420,17 @@ static int show_traces_open(struct inode *inode, struct file *file)
 }
 
 static struct file_operations tracing_fops = {
-	.open = tracing_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = tracing_release,
+	.open		= tracing_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_release,
 };
 
 static struct file_operations tracing_lt_fops = {
-	.open = tracing_lt_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = tracing_release,
+	.open		= tracing_lt_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_release,
 };
 
 static struct file_operations show_traces_fops = {
@@ -1620,8 +1620,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
 		r = sprintf(buf, "\n");
 	mutex_unlock(&trace_types_lock);
 
-	return simple_read_from_buffer(ubuf, cnt, ppos,
-				       buf, r);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
 static ssize_t
@@ -1680,8 +1679,7 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
 		     *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
 	if (r > 64)
 		r = 64;
-	return simple_read_from_buffer(ubuf, cnt, ppos,
-				       buf, r);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
 static ssize_t
@@ -1891,27 +1889,27 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 }
 
 static struct file_operations tracing_max_lat_fops = {
-	.open = tracing_open_generic,
-	.read = tracing_max_lat_read,
-	.write = tracing_max_lat_write,
+	.open		= tracing_open_generic,
+	.read		= tracing_max_lat_read,
+	.write		= tracing_max_lat_write,
 };
 
 static struct file_operations tracing_ctrl_fops = {
-	.open = tracing_open_generic,
-	.read = tracing_ctrl_read,
-	.write = tracing_ctrl_write,
+	.open		= tracing_open_generic,
+	.read		= tracing_ctrl_read,
+	.write		= tracing_ctrl_write,
 };
 
 static struct file_operations set_tracer_fops = {
-	.open = tracing_open_generic,
-	.read = tracing_set_trace_read,
-	.write = tracing_set_trace_write,
+	.open		= tracing_open_generic,
+	.read		= tracing_set_trace_read,
+	.write		= tracing_set_trace_write,
 };
 
 static struct file_operations tracing_pipe_fops = {
-	.open = tracing_open_pipe,
-	.read = tracing_read_pipe,
-	.release = tracing_release_pipe,
+	.open		= tracing_open_pipe,
+	.read		= tracing_read_pipe,
+	.release	= tracing_release_pipe,
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -1925,13 +1923,13 @@ tracing_read_long(struct file *filp, char __user *ubuf,
 	int r;
 
 	r = sprintf(buf, "%ld\n", *p);
-	return simple_read_from_buffer(ubuf, cnt, ppos,
-				       buf, r);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
 static struct file_operations tracing_read_long_fops = {
-	.open = tracing_open_generic,
-	.read = tracing_read_long,
+	.open		= tracing_open_generic,
+	.read		= tracing_read_long,
 };
 #endif
 
@@ -2033,7 +2031,7 @@ static __init void tracer_init_debugfs(void)
 /* dummy trace to disable tracing */
 static struct tracer no_tracer __read_mostly =
 {
-	.name = "none",
+	.name		= "none",
 };
 
 static int trace_alloc_page(void)
-- 
cgit v1.2.3


From 750ed1a40783432d0dcb0e6c2e813a12615d7664 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:46 +0200
Subject: ftrace: timestamp syncing, prepare

rename and uninline now() to ftrace_now().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c             | 4 ++--
 kernel/trace/trace.c              | 7 ++++++-
 kernel/trace/trace.h              | 5 +----
 kernel/trace/trace_functions.c    | 2 +-
 kernel/trace/trace_irqsoff.c      | 6 +++---
 kernel/trace/trace_sched_switch.c | 2 +-
 kernel/trace/trace_sched_wakeup.c | 4 ++--
 7 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 97c40865a93e..a15e068535f8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -531,7 +531,7 @@ static int notrace __ftrace_update_code(void *ignore)
 	save_ftrace_enabled = ftrace_enabled;
 	ftrace_enabled = 0;
 
-	start = now(raw_smp_processor_id());
+	start = ftrace_now(raw_smp_processor_id());
 	ftrace_update_cnt = 0;
 
 	/* No locks needed, the machine is stopped! */
@@ -550,7 +550,7 @@ static int notrace __ftrace_update_code(void *ignore)
 
 	}
 
-	stop = now(raw_smp_processor_id());
+	stop = ftrace_now(raw_smp_processor_id());
 	ftrace_update_time = stop - start;
 	ftrace_update_tot_cnt += ftrace_update_cnt;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4550afda9607..e3778ab0d3f7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -42,6 +42,11 @@ ns2usecs(cycle_t nsec)
 	return nsec;
 }
 
+notrace cycle_t ftrace_now(int cpu)
+{
+	return cpu_clock(cpu);
+}
+
 static atomic_t			tracer_counter;
 static struct trace_array	global_trace;
 
@@ -607,7 +612,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
 	entry->idx		= atomic_inc_return(&tracer_counter);
 	entry->preempt_count	= pc & 0xff;
 	entry->pid		= tsk->pid;
-	entry->t		= now(raw_smp_processor_id());
+	entry->t		= ftrace_now(raw_smp_processor_id());
 	entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
 		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
 		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b0408356f0e0..30cad677e9d0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -171,10 +171,7 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 
-static inline notrace cycle_t now(int cpu)
-{
-	return cpu_clock(cpu);
-}
+extern notrace cycle_t ftrace_now(int cpu);
 
 #ifdef CONFIG_SCHED_TRACER
 extern void notrace
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5d8ad7a09605..e5d34b78fc99 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -20,7 +20,7 @@ static notrace void function_reset(struct trace_array *tr)
 {
 	int cpu;
 
-	tr->time_start = now(tr->cpu);
+	tr->time_start = ftrace_now(tr->cpu);
 
 	for_each_online_cpu(cpu)
 		tracing_reset(tr->data[cpu]);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2dfebb67fdfb..d2a6e6f1ad2d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -136,7 +136,7 @@ check_critical_timing(struct trace_array *tr,
 	 * as long as possible:
 	 */
 	T0 = data->preempt_timestamp;
-	T1 = now(cpu);
+	T1 = ftrace_now(cpu);
 	delta = T1-T0;
 
 	local_save_flags(flags);
@@ -186,7 +186,7 @@ out_unlock:
 
 out:
 	data->critical_sequence = max_sequence;
-	data->preempt_timestamp = now(cpu);
+	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_reset(data);
 	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
 }
@@ -215,7 +215,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	atomic_inc(&data->disabled);
 
 	data->critical_sequence = max_sequence;
-	data->preempt_timestamp = now(cpu);
+	data->preempt_timestamp = ftrace_now(cpu);
 	data->critical_start = parent_ip ? : ip;
 	tracing_reset(data);
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 6c9284103a62..8d656672da93 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -61,7 +61,7 @@ static notrace void sched_switch_reset(struct trace_array *tr)
 {
 	int cpu;
 
-	tr->time_start = now(tr->cpu);
+	tr->time_start = ftrace_now(tr->cpu);
 
 	for_each_online_cpu(cpu)
 		tracing_reset(tr->data[cpu]);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 688df965f3f2..b7df825c3af9 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -92,7 +92,7 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
 	 * as long as possible:
 	 */
 	T0 = data->preempt_timestamp;
-	T1 = now(cpu);
+	T1 = ftrace_now(cpu);
 	delta = T1-T0;
 
 	if (!report_latency(delta))
@@ -191,7 +191,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
 
 	local_save_flags(flags);
 
-	tr->data[wakeup_cpu]->preempt_timestamp = now(cpu);
+	tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
 	ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags);
 
 out_locked:
-- 
cgit v1.2.3


From 53c37c17aafcf50f7c6fddaf01dda8f9d7e31ddf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:46 +0200
Subject: ftrace: fast, scalable, synchronized timestamps

implement globally synchronized, fast and scalable time source for tracing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e3778ab0d3f7..9a931c7c2da3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -42,9 +42,61 @@ ns2usecs(cycle_t nsec)
 	return nsec;
 }
 
+static const int time_sync_freq_max = 128;
+static const cycle_t time_sync_thresh = 100000;
+
+static DEFINE_PER_CPU(cycle_t, time_offset);
+static DEFINE_PER_CPU(cycle_t, prev_cpu_time);
+static DEFINE_PER_CPU(int, time_sync_count);
+static DEFINE_PER_CPU(int, time_sync_freq);
+
+/*
+ * Global lock which we take every now and then to synchronize
+ * the CPUs time. This method is not warp-safe, but it's good
+ * enough to synchronize slowly diverging time sources and thus
+ * it's good enough for tracing:
+ */
+static DEFINE_SPINLOCK(time_sync_lock);
+static cycle_t prev_global_time;
+
+static notrace cycle_t __ftrace_now_sync(cycles_t time, int cpu)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&time_sync_lock, flags);
+
+	/*
+	 * Update the synchronization frequency:
+	 */
+	if (per_cpu(time_sync_freq, cpu) < time_sync_freq_max)
+		per_cpu(time_sync_freq, cpu) *= 2;
+	per_cpu(time_sync_count, cpu) = per_cpu(time_sync_freq, cpu);
+
+	if (time < prev_global_time) {
+		per_cpu(time_offset, cpu) += prev_global_time - time;
+		time = prev_global_time;
+	} else {
+		prev_global_time = time;
+	}
+
+	spin_unlock_irqrestore(&time_sync_lock, flags);
+
+	return time;
+}
+
 notrace cycle_t ftrace_now(int cpu)
 {
-	return cpu_clock(cpu);
+	cycle_t prev_cpu_time, time, delta_time;
+
+	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
+	time = sched_clock() + per_cpu(time_offset, cpu);
+	delta_time = time-prev_cpu_time;
+
+	if (unlikely(delta_time > time_sync_thresh ||
+				--per_cpu(time_sync_count, cpu) <= 0))
+		time = __ftrace_now_sync(time, cpu);
+
+	return time;
 }
 
 static atomic_t			tracer_counter;
-- 
cgit v1.2.3


From cdd31cd2d7a0dcbec2cce3974f7129dd4fc8c879 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:46 +0200
Subject: ftrace: remove-idx-sync

remove idx syncing - it's expensive on SMP.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 40 +++++-----------------------------------
 kernel/trace/trace.h |  2 --
 2 files changed, 5 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9a931c7c2da3..ce8ceb8aea6a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -99,7 +99,6 @@ notrace cycle_t ftrace_now(int cpu)
 	return time;
 }
 
-static atomic_t			tracer_counter;
 static struct trace_array	global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
@@ -661,7 +660,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
 
 	pc = preempt_count();
 
-	entry->idx		= atomic_inc_return(&tracer_counter);
 	entry->preempt_count	= pc & 0xff;
 	entry->pid		= tsk->pid;
 	entry->t		= ftrace_now(raw_smp_processor_id());
@@ -757,8 +755,10 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 		if (!head_page(tr->data[cpu]))
 			continue;
 		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
-		if (ent &&
-		    (!next || (long)(next->idx - ent->idx) > 0)) {
+		/*
+		 * Pick the entry with the smallest timestamp:
+		 */
+		if (ent && (!next || ent->t < next->t)) {
 			next = ent;
 			next_cpu = cpu;
 		}
@@ -800,8 +800,6 @@ trace_consume(struct trace_iterator *iter)
 	if (data->trace_head == data->trace_tail &&
 	    data->trace_head_idx == data->trace_tail_idx)
 		data->trace_idx = 0;
-
-	trace_iterator_increment(iter);
 }
 
 static notrace void *
@@ -1160,33 +1158,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	}
 }
 
-static notrace void sync_time_offset(struct trace_iterator *iter)
-{
-	struct trace_array_cpu *prev_array, *array;
-	struct trace_entry *prev_entry, *entry;
-	cycle_t prev_t, t;
-
-	entry = iter->ent;
-	prev_entry = iter->prev_ent;
-	if (!prev_entry)
-		return;
-
-	prev_array = iter->tr->data[iter->prev_cpu];
-	array = iter->tr->data[iter->cpu];
-
-	prev_t = prev_entry->t + prev_array->time_offset;
-	t = entry->t + array->time_offset;
-
-	/*
-	 * If time goes backwards we increase the offset of
-	 * the current array, to not have observable time warps.
-	 * This will quickly synchronize the time offsets of
-	 * multiple CPUs:
-	 */
-	if (t < prev_t)
-		array->time_offset += prev_t - t;
-}
-
 static notrace int
 print_trace_fmt(struct trace_iterator *iter)
 {
@@ -1200,12 +1171,11 @@ print_trace_fmt(struct trace_iterator *iter)
 	int S;
 	int ret;
 
-	sync_time_offset(iter);
 	entry = iter->ent;
 
 	comm = trace_find_cmdline(iter->ent->pid);
 
-	t = ns2usecs(entry->t + iter->tr->data[iter->cpu]->time_offset);
+	t = ns2usecs(entry->t);
 	usec_rem = do_div(t, 1000000ULL);
 	secs = (unsigned long)t;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 30cad677e9d0..27fa2d06f499 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -38,7 +38,6 @@ struct trace_entry {
 	char			preempt_count;
 	int			pid;
 	cycle_t			t;
-	unsigned long		idx;
 	union {
 		struct ftrace_entry		fn;
 		struct ctx_switch_entry		ctx;
@@ -57,7 +56,6 @@ struct trace_array_cpu {
 	atomic_t		disabled;
 	spinlock_t		lock;
 	struct lock_class_key	lock_key;
-	cycle_t			time_offset;
 
 	/* these fields get copied into max-trace: */
 	unsigned		trace_head_idx;
-- 
cgit v1.2.3


From 8c523a9d82dbc4f3f7d972df8c0f1eacd83d0d55 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:46 +0200
Subject: ftrace: clean-up-pipe-iteration

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ce8ceb8aea6a..42f1926acf73 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -770,12 +770,12 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 	return next;
 }
 
-static notrace void
-trace_iterator_increment(struct trace_iterator *iter)
+static notrace void trace_iterator_increment(struct trace_iterator *iter)
 {
 	iter->idx++;
 	iter->next_idx[iter->cpu]++;
 	iter->next_page_idx[iter->cpu]++;
+
 	if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
 		struct trace_array_cpu *data = iter->tr->data[iter->cpu];
 
@@ -785,8 +785,7 @@ trace_iterator_increment(struct trace_iterator *iter)
 	}
 }
 
-static notrace void
-trace_consume(struct trace_iterator *iter)
+static notrace void trace_consume(struct trace_iterator *iter)
 {
 	struct trace_array_cpu *data = iter->tr->data[iter->cpu];
 
@@ -802,8 +801,7 @@ trace_consume(struct trace_iterator *iter)
 		data->trace_idx = 0;
 }
 
-static notrace void *
-find_next_entry_inc(struct trace_iterator *iter)
+static notrace void *find_next_entry_inc(struct trace_iterator *iter)
 {
 	struct trace_entry *next;
 	int next_cpu = -1;
@@ -1871,14 +1869,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		cpu_set(cpu, mask);
 	}
 
-	while ((entry = find_next_entry(iter, &cpu))) {
-
-		if (!entry)
-			break;
-
-		iter->ent = entry;
-		iter->cpu = cpu;
-
+	while ((entry = find_next_entry_inc(iter)) != NULL) {
 		ret = print_trace_fmt(iter);
 		if (!ret)
 			break;
@@ -1887,7 +1878,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 		if (iter->seq.len >= cnt)
 			break;
-
 	}
 
 	for_each_cpu_mask(cpu, mask) {
-- 
cgit v1.2.3


From f9896bf30928922a3913a3810a4ab7908da6cfe7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:47 +0200
Subject: ftrace: add raw output

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 63 +++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 42f1926acf73..bebd263f582f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -153,6 +153,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_SYM_OFFSET		= 0x02,
 	TRACE_ITER_SYM_ADDR		= 0x04,
 	TRACE_ITER_VERBOSE		= 0x08,
+	TRACE_ITER_RAW			= 0x10,
 };
 
 #define TRACE_ITER_SYM_MASK \
@@ -164,6 +165,7 @@ static const char *trace_options[] = {
 	"sym-offset",
 	"sym-addr",
 	"verbose",
+	"raw",
 	NULL
 };
 
@@ -1099,7 +1101,7 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
-static notrace void
+static notrace int
 print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1154,10 +1156,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
+	return 1;
 }
 
-static notrace int
-print_trace_fmt(struct trace_iterator *iter)
+static notrace int print_trace_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1222,6 +1224,43 @@ print_trace_fmt(struct trace_iterator *iter)
 	return 1;
 }
 
+static notrace int print_raw_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+	int ret;
+	int S;
+
+	entry = iter->ent;
+
+	ret = trace_seq_printf(s, "%d %d %llu ",
+		entry->pid, iter->cpu, entry->t);
+	if (!ret)
+		return 0;
+
+	switch (entry->type) {
+	case TRACE_FN:
+		ret = trace_seq_printf(s, "%x %x\n",
+					entry->fn.ip, entry->fn.parent_ip);
+		if (!ret)
+			return 0;
+		break;
+	case TRACE_CTX:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		ret = trace_seq_printf(s, "%d %d %c %d %d\n",
+				       entry->ctx.prev_pid,
+				       entry->ctx.prev_prio,
+				       S,
+				       entry->ctx.next_pid,
+				       entry->ctx.next_prio);
+		if (!ret)
+			return 0;
+		break;
+	}
+	return 1;
+}
+
 static int trace_empty(struct trace_iterator *iter)
 {
 	struct trace_array_cpu *data;
@@ -1238,6 +1277,17 @@ static int trace_empty(struct trace_iterator *iter)
 	return 1;
 }
 
+static int print_trace_line(struct trace_iterator *iter)
+{
+	if (trace_flags & TRACE_ITER_RAW)
+		return print_raw_fmt(iter);
+
+	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+		return print_lat_fmt(iter, iter->idx, iter->cpu);
+
+	return print_trace_fmt(iter);
+}
+
 static int s_show(struct seq_file *m, void *v)
 {
 	struct trace_iterator *iter = v;
@@ -1259,10 +1309,7 @@ static int s_show(struct seq_file *m, void *v)
 				print_func_help_header(m);
 		}
 	} else {
-		if (iter->iter_flags & TRACE_FILE_LAT_FMT)
-			print_lat_fmt(iter, iter->idx, iter->cpu);
-		else
-			print_trace_fmt(iter);
+		print_trace_line(iter);
 		trace_print_seq(m, &iter->seq);
 	}
 
@@ -1870,7 +1917,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	}
 
 	while ((entry = find_next_entry_inc(iter)) != NULL) {
-		ret = print_trace_fmt(iter);
+		ret = print_trace_line(iter);
 		if (!ret)
 			break;
 
-- 
cgit v1.2.3


From cb0f12aae8d085140d37ada351aa5a8e76c3f9b0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:47 +0200
Subject: ftrace: bin-output

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bebd263f582f..d78cbc4fc519 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -154,6 +154,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_SYM_ADDR		= 0x04,
 	TRACE_ITER_VERBOSE		= 0x08,
 	TRACE_ITER_RAW			= 0x10,
+	TRACE_ITER_BIN			= 0x20,
 };
 
 #define TRACE_ITER_SYM_MASK \
@@ -166,6 +167,7 @@ static const char *trace_options[] = {
 	"sym-addr",
 	"verbose",
 	"raw",
+	"bin",
 	NULL
 };
 
@@ -275,6 +277,18 @@ trace_seq_putc(struct trace_seq *s, unsigned char c)
 	return 1;
 }
 
+static notrace int
+trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, mem, len);
+	s->len += len;
+
+	return len;
+}
+
 static notrace void
 trace_seq_reset(struct trace_seq *s)
 {
@@ -1261,6 +1275,39 @@ static notrace int print_raw_fmt(struct trace_iterator *iter)
 	return 1;
 }
 
+#define SEQ_PUT_FIELD_RET(s, x)				\
+do {							\
+	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+static notrace int print_bin_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry;
+
+	entry = iter->ent;
+
+	SEQ_PUT_FIELD_RET(s, entry->pid);
+	SEQ_PUT_FIELD_RET(s, entry->cpu);
+	SEQ_PUT_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
+		break;
+	}
+	return 1;
+}
+
 static int trace_empty(struct trace_iterator *iter)
 {
 	struct trace_array_cpu *data;
@@ -1279,6 +1326,9 @@ static int trace_empty(struct trace_iterator *iter)
 
 static int print_trace_line(struct trace_iterator *iter)
 {
+	if (trace_flags & TRACE_ITER_BIN)
+		return print_bin_fmt(iter);
+
 	if (trace_flags & TRACE_ITER_RAW)
 		return print_raw_fmt(iter);
 
-- 
cgit v1.2.3


From f0a920d5752e1788c0cba2add103076bcc0f7a49 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:47 +0200
Subject: ftrace: add trace_special()

for ad-hoc tracing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h | 15 +++++++++++++++
 2 files changed, 59 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d78cbc4fc519..fa13059eb462 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -137,6 +137,7 @@ enum trace_type {
 
 	TRACE_FN,
 	TRACE_CTX,
+	TRACE_SPECIAL,
 
 	__TRACE_LAST_TYPE
 };
@@ -700,6 +701,22 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 	spin_unlock(&data->lock);
 }
 
+notrace void
+trace_special(struct trace_array *tr, struct trace_array_cpu *data,
+	      unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_entry *entry;
+
+	spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_SPECIAL;
+	entry->special.arg1	= arg1;
+	entry->special.arg2	= arg2;
+	entry->special.arg3	= arg3;
+	spin_unlock(&data->lock);
+}
+
 notrace void
 tracing_sched_switch_trace(struct trace_array *tr,
 			   struct trace_array_cpu *data,
@@ -1167,6 +1184,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 entry->ctx.next_prio,
 				 comm);
 		break;
+	case TRACE_SPECIAL:
+		trace_seq_printf(s, " %lx %lx %lx\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		break;
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
@@ -1234,6 +1257,14 @@ static notrace int print_trace_fmt(struct trace_iterator *iter)
 		if (!ret)
 			return 0;
 		break;
+	case TRACE_SPECIAL:
+		ret = trace_seq_printf(s, " %lx %lx %lx\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
 	}
 	return 1;
 }
@@ -1271,6 +1302,14 @@ static notrace int print_raw_fmt(struct trace_iterator *iter)
 		if (!ret)
 			return 0;
 		break;
+	case TRACE_SPECIAL:
+		ret = trace_seq_printf(s, " %lx %lx %lx\n",
+				 entry->special.arg1,
+				 entry->special.arg2,
+				 entry->special.arg3);
+		if (!ret)
+			return 0;
+		break;
 	}
 	return 1;
 }
@@ -1304,6 +1343,11 @@ static notrace int print_bin_fmt(struct trace_iterator *iter)
 		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
 		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
 		break;
+	case TRACE_SPECIAL:
+		SEQ_PUT_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_FIELD_RET(s, entry->special.arg3);
+		break;
 	}
 	return 1;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 27fa2d06f499..7bdfef35c05a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -25,6 +25,15 @@ struct ctx_switch_entry {
 	unsigned char		next_prio;
 };
 
+/*
+ * Special (free-form) trace entry:
+ */
+struct special_entry {
+	unsigned long		arg1;
+	unsigned long		arg2;
+	unsigned long		arg3;
+};
+
 /*
  * The trace entry - the most basic unit of tracing. This is what
  * is printed in the end as a single line in the trace output, such as:
@@ -41,6 +50,7 @@ struct trace_entry {
 	union {
 		struct ftrace_entry		fn;
 		struct ctx_switch_entry		ctx;
+		struct special_entry		special;
 	};
 };
 
@@ -154,6 +164,11 @@ void tracing_sched_switch_trace(struct trace_array *tr,
 				struct task_struct *next,
 				unsigned long flags);
 void tracing_record_cmdline(struct task_struct *tsk);
+void trace_special(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long arg1,
+		   unsigned long arg2,
+		   unsigned long arg3);
 
 void tracing_start_function_trace(void);
 void tracing_stop_function_trace(void);
-- 
cgit v1.2.3


From dcb6308f2b56720599f6b9d5a01c33e67e69bde4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:48 +0200
Subject: ftrace, locking fix

should be an irq-safe lock ...

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fa13059eb462..70f94fa92c10 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -691,14 +691,15 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
        unsigned long ip, unsigned long parent_ip, unsigned long flags)
 {
 	struct trace_entry *entry;
+	unsigned long irq_flags;
 
-	spin_lock(&data->lock);
+	spin_lock_irqsave(&data->lock, irq_flags);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
 	entry->type		= TRACE_FN;
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
-	spin_unlock(&data->lock);
+	spin_unlock_irqrestore(&data->lock, irq_flags);
 }
 
 notrace void
@@ -706,15 +707,16 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data,
 	      unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
 	struct trace_entry *entry;
+	unsigned long irq_flags;
 
-	spin_lock(&data->lock);
+	spin_lock_irqsave(&data->lock, irq_flags);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, 0);
 	entry->type		= TRACE_SPECIAL;
 	entry->special.arg1	= arg1;
 	entry->special.arg2	= arg2;
 	entry->special.arg3	= arg3;
-	spin_unlock(&data->lock);
+	spin_unlock_irqrestore(&data->lock, irq_flags);
 }
 
 notrace void
@@ -724,8 +726,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   unsigned long flags)
 {
 	struct trace_entry *entry;
+	unsigned long irq_flags;
 
-	spin_lock(&data->lock);
+	spin_lock_irqsave(&data->lock, irq_flags);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
 	entry->type		= TRACE_CTX;
@@ -734,7 +737,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.prev_state	= prev->state;
 	entry->ctx.next_pid	= next->pid;
 	entry->ctx.next_prio	= next->prio;
-	spin_unlock(&data->lock);
+	spin_unlock_irqrestore(&data->lock, irq_flags);
 }
 
 enum trace_file_type {
-- 
cgit v1.2.3


From 088b1e427dbba2af93cb6a7d39258c10ff58dd27 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:48 +0200
Subject: ftrace: pipe fixes

Some fixes for better output with the trace pipe.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 70f94fa92c10..c56fc5e60133 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -770,11 +770,6 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 
 	array = page_address(page);
 
-	/* Still possible to catch up to the tail */
-	if (iter->next_idx[cpu] && array == data->trace_tail &&
-	    iter->next_page_idx[cpu] == data->trace_tail_idx)
-		return NULL;
-
 	WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
 	return &array[iter->next_page_idx[cpu]];
 }
@@ -1921,7 +1916,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	struct trace_iterator *iter = filp->private_data;
 	struct trace_array_cpu *data;
 	static cpumask_t mask;
-	struct trace_entry *entry;
 	static int start;
 	unsigned long flags;
 	int read = 0;
@@ -2013,10 +2007,15 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		cpu_set(cpu, mask);
 	}
 
-	while ((entry = find_next_entry_inc(iter)) != NULL) {
+	while (find_next_entry_inc(iter) != NULL) {
+		int len = iter->seq.len;
+
 		ret = print_trace_line(iter);
-		if (!ret)
+		if (!ret) {
+			/* don't print partial lines */
+			iter->seq.len = len;
 			break;
+		}
 
 		trace_consume(iter);
 
-- 
cgit v1.2.3


From 37ad508419f0fdfda7b378756eb1f35cfd26d96d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:48 +0200
Subject: ftrace - fix dynamic ftrace memory leak

The ftrace dynamic function update allocates a record to store the
instruction pointers that are being modified. If the modified
instruction pointer fails to update, then the record is marked as
failed and nothing more is done.

Worse, if the modification fails, but the record ip function is still
called, it will allocate a new record and try again. In just a matter
of time, will this cause a serious memory leak and crash the system.

This patch plugs this memory leak. When a record fails, it is
included back into the pool of records to be used. Now a record may
fail over and over again, but the number of allocated records will
not increase.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a15e068535f8..8e02aa690b2b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -188,6 +188,8 @@ static int ftraced_suspend;
 
 static int ftrace_record_suspend;
 
+static struct dyn_ftrace *ftrace_free_records;
+
 static inline int
 notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key)
 {
@@ -211,8 +213,35 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
 	hlist_add_head(&node->node, &ftrace_hash[key]);
 }
 
+static notrace void ftrace_free_rec(struct dyn_ftrace *rec)
+{
+	/* no locking, only called from kstop_machine */
+
+	rec->ip = (unsigned long)ftrace_free_records;
+	ftrace_free_records = rec;
+	rec->flags |= FTRACE_FL_FREE;
+}
+
 static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
+	struct dyn_ftrace *rec;
+
+	/* First check for freed records */
+	if (ftrace_free_records) {
+		rec = ftrace_free_records;
+
+		/* todo, disable tracing altogether on this warning */
+		if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
+			WARN_ON_ONCE(1);
+			ftrace_free_records = NULL;
+			return NULL;
+		}
+
+		ftrace_free_records = (void *)rec->ip;
+		memset(rec, 0, sizeof(*rec));
+		return rec;
+	}
+
 	if (ftrace_pages->index == ENTRIES_PER_PAGE) {
 		if (!ftrace_pages->next)
 			return NULL;
@@ -356,8 +385,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 	}
 
 	failed = ftrace_modify_code(ip, old, new);
-	if (failed)
-		rec->flags |= FTRACE_FL_FAILED;
+	if (failed) {
+		unsigned long key;
+		/* It is possible that the function hasn't been converted yet */
+		key = hash_long(ip, FTRACE_HASHBITS);
+		if (!ftrace_ip_in_hash(ip, key)) {
+			rec->flags |= FTRACE_FL_FAILED;
+			ftrace_free_rec(rec);
+		}
+
+	}
 }
 
 static void notrace ftrace_replace_code(int enable)
@@ -407,8 +444,10 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 	call = ftrace_call_replace(ip, MCOUNT_ADDR);
 
 	failed = ftrace_modify_code(ip, call, nop);
-	if (failed)
+	if (failed) {
 		rec->flags |= FTRACE_FL_FAILED;
+		ftrace_free_rec(rec);
+	}
 }
 
 static int notrace __ftrace_modify_code(void *data)
-- 
cgit v1.2.3


From 4eebcc81a33fbc45e28542b50197ed7b3c486d90 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:48 +0200
Subject: ftrace: disable tracing on failure

Since ftrace touches practically every function. If we detect any
anomaly, we want to fully disable ftrace. This patch adds code
to try shutdown ftrace as much as possible without doing any more
harm is something is detected not quite correct.

This only kills ftrace, this patch does have checks for other parts of
the tracer (irqsoff, wakeup, etc.).

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c         | 112 ++++++++++++++++++++++++++++++++++++++----
 kernel/trace/trace_selftest.c |   4 ++
 2 files changed, 107 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8e02aa690b2b..ff42345dd78e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,9 +29,16 @@
 
 #include "trace.h"
 
-int ftrace_enabled;
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */
+static int ftrace_disabled __read_mostly;
+
 static DEFINE_SPINLOCK(ftrace_lock);
 static DEFINE_MUTEX(ftrace_sysctl_lock);
 
@@ -230,10 +237,11 @@ static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 	if (ftrace_free_records) {
 		rec = ftrace_free_records;
 
-		/* todo, disable tracing altogether on this warning */
 		if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
 			WARN_ON_ONCE(1);
 			ftrace_free_records = NULL;
+			ftrace_disabled = 1;
+			ftrace_enabled = 0;
 			return NULL;
 		}
 
@@ -260,7 +268,7 @@ ftrace_record_ip(unsigned long ip)
 	int resched;
 	int atomic;
 
-	if (!ftrace_enabled)
+	if (!ftrace_enabled || ftrace_disabled)
 		return;
 
 	resched = need_resched();
@@ -485,6 +493,9 @@ static void notrace ftrace_startup(void)
 {
 	int command = 0;
 
+	if (unlikely(ftrace_disabled))
+		return;
+
 	mutex_lock(&ftraced_lock);
 	ftraced_suspend++;
 	if (ftraced_suspend == 1)
@@ -507,6 +518,9 @@ static void notrace ftrace_shutdown(void)
 {
 	int command = 0;
 
+	if (unlikely(ftrace_disabled))
+		return;
+
 	mutex_lock(&ftraced_lock);
 	ftraced_suspend--;
 	if (!ftraced_suspend)
@@ -529,6 +543,9 @@ static void notrace ftrace_startup_sysctl(void)
 {
 	int command = FTRACE_ENABLE_MCOUNT;
 
+	if (unlikely(ftrace_disabled))
+		return;
+
 	mutex_lock(&ftraced_lock);
 	/* Force update next time */
 	saved_ftrace_func = NULL;
@@ -544,6 +561,9 @@ static void notrace ftrace_shutdown_sysctl(void)
 {
 	int command = FTRACE_DISABLE_MCOUNT;
 
+	if (unlikely(ftrace_disabled))
+		return;
+
 	mutex_lock(&ftraced_lock);
 	/* ftraced_suspend is true if ftrace is running */
 	if (ftraced_suspend)
@@ -600,6 +620,9 @@ static int notrace __ftrace_update_code(void *ignore)
 
 static void notrace ftrace_update_code(void)
 {
+	if (unlikely(ftrace_disabled))
+		return;
+
 	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
 }
 
@@ -614,6 +637,9 @@ static int notrace ftraced(void *ignore)
 		/* check once a second */
 		schedule_timeout(HZ);
 
+		if (unlikely(ftrace_disabled))
+			continue;
+
 		mutex_lock(&ftrace_sysctl_lock);
 		mutex_lock(&ftraced_lock);
 		if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) {
@@ -628,6 +654,7 @@ static int notrace ftraced(void *ignore)
 					ftrace_update_cnt != 1 ? "s" : "",
 					ftrace_update_tot_cnt,
 					usecs, usecs != 1 ? "s" : "");
+				ftrace_disabled = 1;
 				WARN_ON_ONCE(1);
 			}
 			ftraced_trigger = 0;
@@ -785,6 +812,9 @@ ftrace_avail_open(struct inode *inode, struct file *file)
 	struct ftrace_iterator *iter;
 	int ret;
 
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
 		return -ENOMEM;
@@ -843,6 +873,9 @@ ftrace_filter_open(struct inode *inode, struct file *file)
 	struct ftrace_iterator *iter;
 	int ret = 0;
 
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
 		return -ENOMEM;
@@ -1063,6 +1096,9 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
  */
 notrace void ftrace_set_filter(unsigned char *buf, int len, int reset)
 {
+	if (unlikely(ftrace_disabled))
+		return;
+
 	mutex_lock(&ftrace_filter_lock);
 	if (reset)
 		ftrace_filter_reset();
@@ -1133,7 +1169,7 @@ int ftrace_force_update(void)
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
 
-	if (!ftraced_task)
+	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
 	mutex_lock(&ftraced_lock);
@@ -1142,6 +1178,11 @@ int ftrace_force_update(void)
 	set_current_state(TASK_INTERRUPTIBLE);
 	add_wait_queue(&ftraced_waiters, &wait);
 
+	if (unlikely(!ftraced_task)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
 	do {
 		mutex_unlock(&ftraced_lock);
 		wake_up_process(ftraced_task);
@@ -1154,6 +1195,7 @@ int ftrace_force_update(void)
 		set_current_state(TASK_INTERRUPTIBLE);
 	} while (last_counter == ftraced_iteration_counter);
 
+ out:
 	mutex_unlock(&ftraced_lock);
 	remove_wait_queue(&ftraced_waiters, &wait);
 	set_current_state(TASK_RUNNING);
@@ -1161,6 +1203,22 @@ int ftrace_force_update(void)
 	return ret;
 }
 
+static void ftrace_force_shutdown(void)
+{
+	struct task_struct *task;
+	int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
+
+	mutex_lock(&ftraced_lock);
+	task = ftraced_task;
+	ftraced_task = NULL;
+	ftraced_suspend = -1;
+	ftrace_run_update_code(command);
+	mutex_unlock(&ftraced_lock);
+
+	if (task)
+		kthread_stop(task);
+}
+
 static __init int ftrace_init_debugfs(void)
 {
 	struct dentry *d_tracer;
@@ -1194,21 +1252,29 @@ static int __init notrace ftrace_dynamic_init(void)
 	stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
 
 	/* ftrace_dyn_arch_init places the return code in addr */
-	if (addr)
-		return addr;
+	if (addr) {
+		ret = (int)addr;
+		goto failed;
+	}
 
 	ret = ftrace_dyn_table_alloc();
 	if (ret)
-		return ret;
+		goto failed;
 
 	p = kthread_run(ftraced, NULL, "ftraced");
-	if (IS_ERR(p))
-		return -1;
+	if (IS_ERR(p)) {
+		ret = -1;
+		goto failed;
+	}
 
 	last_ftrace_enabled = ftrace_enabled = 1;
 	ftraced_task = p;
 
 	return 0;
+
+ failed:
+	ftrace_disabled = 1;
+	return ret;
 }
 
 core_initcall(ftrace_dynamic_init);
@@ -1217,8 +1283,30 @@ core_initcall(ftrace_dynamic_init);
 # define ftrace_shutdown()		do { } while (0)
 # define ftrace_startup_sysctl()	do { } while (0)
 # define ftrace_shutdown_sysctl()	do { } while (0)
+# define ftrace_force_shutdown()	do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
+/**
+ * ftrace_kill - totally shutdown ftrace
+ *
+ * This is a safety measure. If something was detected that seems
+ * wrong, calling this function will keep ftrace from doing
+ * any more modifications, and updates.
+ * used when something went wrong.
+ */
+void ftrace_kill(void)
+{
+	mutex_lock(&ftrace_sysctl_lock);
+	ftrace_disabled = 1;
+	ftrace_enabled = 0;
+
+	clear_ftrace_function();
+	mutex_unlock(&ftrace_sysctl_lock);
+
+	/* Try to totally disable ftrace */
+	ftrace_force_shutdown();
+}
+
 /**
  * register_ftrace_function - register a function for profiling
  * @ops - ops structure that holds the function for profiling.
@@ -1234,6 +1322,9 @@ int register_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
+	if (unlikely(ftrace_disabled))
+		return -1;
+
 	mutex_lock(&ftrace_sysctl_lock);
 	ret = __register_ftrace_function(ops);
 	ftrace_startup();
@@ -1267,6 +1358,9 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 {
 	int ret;
 
+	if (unlikely(ftrace_disabled))
+		return -ENODEV;
+
 	mutex_lock(&ftrace_sysctl_lock);
 
 	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a6f1ed75f836..85715b86a342 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -248,6 +248,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	ftrace_enabled = save_ftrace_enabled;
 	tracer_enabled = save_tracer_enabled;
 
+	/* kill ftrace totally if we failed */
+	if (ret)
+		ftrace_kill();
+
 	return ret;
 }
 #endif /* CONFIG_FTRACE */
-- 
cgit v1.2.3


From 26994ead1fc8cced63f17e9848edc1771036664e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:48 +0200
Subject: ftrace: enabled tracing by default

This patch is the correct way to have tracing enabled by default.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c56fc5e60133..3dc6eac85b11 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -107,7 +107,7 @@ static struct trace_array	max_tr;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
 
-static int			tracer_enabled;
+static int			tracer_enabled = 1;
 static unsigned long		trace_nr_entries = 16384UL;
 
 static struct tracer		*trace_types __read_mostly;
@@ -2268,6 +2268,8 @@ __init static int tracer_alloc_buffers(void)
 	int ret = -ENOMEM;
 	int i;
 
+	global_trace.ctrl = tracer_enabled;
+
 	/* Allocate the first page for all buffers */
 	for_each_possible_cpu(i) {
 		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
-- 
cgit v1.2.3


From 0fd9e0dac9026df09986a4b201518ae015814aef Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:48 +0200
Subject: ftrace: use cpu clock again

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 54 +---------------------------------------------------
 1 file changed, 1 insertion(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3dc6eac85b11..d74c039305ad 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -42,61 +42,9 @@ ns2usecs(cycle_t nsec)
 	return nsec;
 }
 
-static const int time_sync_freq_max = 128;
-static const cycle_t time_sync_thresh = 100000;
-
-static DEFINE_PER_CPU(cycle_t, time_offset);
-static DEFINE_PER_CPU(cycle_t, prev_cpu_time);
-static DEFINE_PER_CPU(int, time_sync_count);
-static DEFINE_PER_CPU(int, time_sync_freq);
-
-/*
- * Global lock which we take every now and then to synchronize
- * the CPUs time. This method is not warp-safe, but it's good
- * enough to synchronize slowly diverging time sources and thus
- * it's good enough for tracing:
- */
-static DEFINE_SPINLOCK(time_sync_lock);
-static cycle_t prev_global_time;
-
-static notrace cycle_t __ftrace_now_sync(cycles_t time, int cpu)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&time_sync_lock, flags);
-
-	/*
-	 * Update the synchronization frequency:
-	 */
-	if (per_cpu(time_sync_freq, cpu) < time_sync_freq_max)
-		per_cpu(time_sync_freq, cpu) *= 2;
-	per_cpu(time_sync_count, cpu) = per_cpu(time_sync_freq, cpu);
-
-	if (time < prev_global_time) {
-		per_cpu(time_offset, cpu) += prev_global_time - time;
-		time = prev_global_time;
-	} else {
-		prev_global_time = time;
-	}
-
-	spin_unlock_irqrestore(&time_sync_lock, flags);
-
-	return time;
-}
-
 notrace cycle_t ftrace_now(int cpu)
 {
-	cycle_t prev_cpu_time, time, delta_time;
-
-	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
-	time = sched_clock() + per_cpu(time_offset, cpu);
-	delta_time = time-prev_cpu_time;
-
-	if (unlikely(delta_time > time_sync_thresh ||
-				--per_cpu(time_sync_count, cpu) <= 0))
-		time = __ftrace_now_sync(time, cpu);
-
-	return time;
+	return cpu_clock(cpu);
 }
 
 static struct trace_array	global_trace;
-- 
cgit v1.2.3


From 2e0f57618529a2739a5e1570e6c445c9c966b595 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:49 +0200
Subject: ftrace: build fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c           | 111 ++++++++++++++++++++++++-----------------
 kernel/trace/trace_functions.c |   2 +-
 2 files changed, 67 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d74c039305ad..71b25b79b3de 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -432,47 +432,6 @@ notrace void tracing_reset(struct trace_array_cpu *data)
 	data->trace_tail_idx = 0;
 }
 
-#ifdef CONFIG_FTRACE
-static notrace void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
-{
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	long disabled;
-	int cpu;
-
-	if (unlikely(!tracer_enabled))
-		return;
-
-	local_irq_save(flags);
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1))
-		ftrace(tr, data, ip, parent_ip, flags);
-
-	atomic_dec(&data->disabled);
-	local_irq_restore(flags);
-}
-
-static struct ftrace_ops trace_ops __read_mostly =
-{
-	.func = function_trace_call,
-};
-#endif
-
-notrace void tracing_start_function_trace(void)
-{
-	register_ftrace_function(&trace_ops);
-}
-
-notrace void tracing_stop_function_trace(void)
-{
-	unregister_ftrace_function(&trace_ops);
-}
-
 #define SAVED_CMDLINES 128
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
 static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
@@ -635,8 +594,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
 }
 
 notrace void
-ftrace(struct trace_array *tr, struct trace_array_cpu *data,
-       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+__ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+	 unsigned long ip, unsigned long parent_ip, unsigned long flags)
 {
 	struct trace_entry *entry;
 	unsigned long irq_flags;
@@ -650,6 +609,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 }
 
+notrace void
+ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+	if (likely(!atomic_read(&data->disabled)))
+		__ftrace(tr, data, ip, parent_ip, flags);
+}
+
 notrace void
 trace_special(struct trace_array *tr, struct trace_array_cpu *data,
 	      unsigned long arg1, unsigned long arg2, unsigned long arg3)
@@ -688,6 +655,47 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 }
 
+#ifdef CONFIG_FTRACE
+static notrace void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (unlikely(!tracer_enabled))
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		__ftrace(tr, data, ip, parent_ip, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = function_trace_call,
+};
+
+notrace void tracing_start_function_trace(void)
+{
+	register_ftrace_function(&trace_ops);
+}
+
+notrace void tracing_stop_function_trace(void)
+{
+	unregister_ftrace_function(&trace_ops);
+}
+#endif
+
 enum trace_file_type {
 	TRACE_FILE_LAT_FMT	= 1,
 };
@@ -722,7 +730,7 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 	return &array[iter->next_page_idx[cpu]];
 }
 
-static struct notrace trace_entry *
+static struct trace_entry * notrace
 find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 {
 	struct trace_array *tr = iter->tr;
@@ -1866,6 +1874,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	static cpumask_t mask;
 	static int start;
 	unsigned long flags;
+	int ftrace_save;
 	int read = 0;
 	int cpu;
 	int len;
@@ -1944,6 +1953,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	cpus_clear(mask);
 	local_irq_save(flags);
+	ftrace_save = ftrace_enabled;
+	ftrace_enabled = 0;
+	smp_wmb();
 	for_each_possible_cpu(cpu) {
 		data = iter->tr->data[cpu];
 
@@ -1951,10 +1963,14 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 			continue;
 
 		atomic_inc(&data->disabled);
-		spin_lock(&data->lock);
 		cpu_set(cpu, mask);
 	}
 
+	for_each_cpu_mask(cpu, mask) {
+		data = iter->tr->data[cpu];
+		spin_lock(&data->lock);
+	}
+
 	while (find_next_entry_inc(iter) != NULL) {
 		int len = iter->seq.len;
 
@@ -1974,8 +1990,13 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	for_each_cpu_mask(cpu, mask) {
 		data = iter->tr->data[cpu];
 		spin_unlock(&data->lock);
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		data = iter->tr->data[cpu];
 		atomic_dec(&data->disabled);
 	}
+	ftrace_enabled = ftrace_save;
 	local_irq_restore(flags);
 
 	/* Now copy what we have to the user */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index e5d34b78fc99..69a0eb00a0a5 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -9,10 +9,10 @@
  *  Copyright (C) 2004-2006 Ingo Molnar
  *  Copyright (C) 2004 William Lee Irwin III
  */
-#include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
+#include <linux/fs.h>
 
 #include "trace.h"
 
-- 
cgit v1.2.3


From 5e3ca0ec76fce92daa4eed0d02de9c79b1fe3920 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:49 +0200
Subject: ftrace: introduce the "hex" output method

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 92 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 71b25b79b3de..6974b212e938 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -103,7 +103,8 @@ enum trace_iterator_flags {
 	TRACE_ITER_SYM_ADDR		= 0x04,
 	TRACE_ITER_VERBOSE		= 0x08,
 	TRACE_ITER_RAW			= 0x10,
-	TRACE_ITER_BIN			= 0x20,
+	TRACE_ITER_HEX			= 0x20,
+	TRACE_ITER_BIN			= 0x40,
 };
 
 #define TRACE_ITER_SYM_MASK \
@@ -116,6 +117,7 @@ static const char *trace_options[] = {
 	"sym-addr",
 	"verbose",
 	"raw",
+	"hex",
 	"bin",
 	NULL
 };
@@ -238,6 +240,47 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
 	return len;
 }
 
+#define HEX_CHARS 17
+
+static notrace int
+trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+	unsigned char hex[HEX_CHARS];
+	unsigned char *data;
+	unsigned char byte;
+	int i, j;
+
+	BUG_ON(len >= HEX_CHARS);
+
+	data = mem;
+
+#ifdef __BIG_ENDIAN
+	for (i = 0, j = 0; i < len; i++) {
+#else
+	for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+		byte = data[i];
+
+		hex[j]   = byte & 0x0f;
+		if (hex[j] >= 10)
+			hex[j] += 'a' - 10;
+		else
+			hex[j] += '0';
+		j++;
+
+		hex[j] = byte >> 4;
+		if (hex[j] >= 10)
+			hex[j] += 'a' - 10;
+		else
+			hex[j] += '0';
+		j++;
+	}
+	hex[j] = ' ';
+	j++;
+
+	return trace_seq_putmem(s, hex, j);
+}
+
 static notrace void
 trace_seq_reset(struct trace_seq *s)
 {
@@ -1274,6 +1317,51 @@ do {							\
 		return 0;				\
 } while (0)
 
+#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
+do {							\
+	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+static notrace int print_hex_fmt(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	unsigned char newline = '\n';
+	struct trace_entry *entry;
+	int S;
+
+	entry = iter->ent;
+
+	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+	SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+
+	switch (entry->type) {
+	case TRACE_FN:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_CTX:
+		S = entry->ctx.prev_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.prev_state] : 'X';
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, S);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		break;
+	case TRACE_SPECIAL:
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
+		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
+		break;
+	}
+	SEQ_PUT_FIELD_RET(s, newline);
+
+	return 1;
+}
+
 static notrace int print_bin_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1327,6 +1415,9 @@ static int print_trace_line(struct trace_iterator *iter)
 	if (trace_flags & TRACE_ITER_BIN)
 		return print_bin_fmt(iter);
 
+	if (trace_flags & TRACE_ITER_HEX)
+		return print_hex_fmt(iter);
+
 	if (trace_flags & TRACE_ITER_RAW)
 		return print_raw_fmt(iter);
 
-- 
cgit v1.2.3


From 2577046740fe6d77864128c6187c11125c2449ea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:49 +0200
Subject: ftrace: build fix

no need to backmerge, only affects ftrace-enabled kernels. (which is
not the default)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6974b212e938..958c4d77a67b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1965,7 +1965,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	static cpumask_t mask;
 	static int start;
 	unsigned long flags;
+#ifdef CONFIG_FTRACE
 	int ftrace_save;
+#endif
 	int read = 0;
 	int cpu;
 	int len;
@@ -2044,8 +2046,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	cpus_clear(mask);
 	local_irq_save(flags);
+#ifdef CONFIG_FTRACE
 	ftrace_save = ftrace_enabled;
 	ftrace_enabled = 0;
+#endif
 	smp_wmb();
 	for_each_possible_cpu(cpu) {
 		data = iter->tr->data[cpu];
@@ -2087,7 +2091,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		data = iter->tr->data[cpu];
 		atomic_dec(&data->disabled);
 	}
+#ifdef CONFIG_FTRACE
 	ftrace_enabled = ftrace_save;
+#endif
 	local_irq_restore(flags);
 
 	/* Now copy what we have to the user */
-- 
cgit v1.2.3


From 2a2cc8f7c4d0dfd75720867f7dc58d24f075edfc Mon Sep 17 00:00:00 2001
From: Soeren Sandmann Pedersen <sandmann@redhat.com>
Date: Mon, 12 May 2008 21:20:49 +0200
Subject: ftrace: allow the event pipe to be polled

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 958c4d77a67b..d041578affd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -24,6 +24,7 @@
 #include <linux/percpu.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
+#include <linux/poll.h>
 #include <linux/gfp.h>
 #include <linux/fs.h>
 
@@ -63,6 +64,7 @@ static struct tracer		*current_trace __read_mostly;
 static int			max_tracer_type_len;
 
 static DEFINE_MUTEX(trace_types_lock);
+static DECLARE_WAIT_QUEUE_HEAD (trace_wait);
 
 #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
 
@@ -105,6 +107,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_RAW			= 0x10,
 	TRACE_ITER_HEX			= 0x20,
 	TRACE_ITER_BIN			= 0x40,
+	TRACE_ITER_BLOCK		= 0x80,
 };
 
 #define TRACE_ITER_SYM_MASK \
@@ -119,6 +122,7 @@ static const char *trace_options[] = {
 	"raw",
 	"hex",
 	"bin",
+	"block",
 	NULL
 };
 
@@ -650,6 +654,9 @@ __ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	if (!(trace_flags & TRACE_ITER_BLOCK))
+		wake_up (&trace_wait);
 }
 
 notrace void
@@ -675,6 +682,9 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->special.arg2	= arg2;
 	entry->special.arg3	= arg3;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	if (!(trace_flags & TRACE_ITER_BLOCK))
+		wake_up (&trace_wait);
 }
 
 notrace void
@@ -696,6 +706,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.next_pid	= next->pid;
 	entry->ctx.next_prio	= next->prio;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	if (!(trace_flags & TRACE_ITER_BLOCK))
+		wake_up (&trace_wait);
 }
 
 #ifdef CONFIG_FTRACE
@@ -1765,7 +1778,6 @@ static struct file_operations tracing_readme_fops = {
 	.read = tracing_readme_read,
 };
 
-
 static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
@@ -1953,6 +1965,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+	struct trace_iterator *iter = filp->private_data;
+
+	if (trace_flags & TRACE_ITER_BLOCK) {
+		/*
+		 * Always select as readable when in blocking mode
+		 */
+		return POLLIN | POLLRDNORM;
+	}
+	else {
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+		poll_wait(filp, &trace_wait, poll_table);
+		if (!trace_empty(iter))
+			return POLLIN | POLLRDNORM;
+
+		return 0;
+	}
+}
+
 /*
  * Consumer reader.
  */
@@ -1991,6 +2025,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	start = 0;
 
 	while (trace_empty(iter)) {
+		if (!(trace_flags & TRACE_ITER_BLOCK))
+			return -EWOULDBLOCK;
 		/*
 		 * This is a make-shift waitqueue. The reason we don't use
 		 * an actual wait queue is because:
@@ -2134,6 +2170,7 @@ static struct file_operations set_tracer_fops = {
 
 static struct file_operations tracing_pipe_fops = {
 	.open		= tracing_open_pipe,
+	.poll		= tracing_poll_pipe,
 	.read		= tracing_read_pipe,
 	.release	= tracing_release_pipe,
 };
-- 
cgit v1.2.3


From 6fb44b717c10ecf37beaaebd312f3afa93fed714 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:49 +0200
Subject: ftrace: add trace_function api for other tracers to use

A new check was added in the ftrace function that wont trace if the CPU
trace buffer is disabled.  Unfortunately, other tracers used ftrace() to
write to the buffer after they disabled it. The new disable check makes
these calls into a nop.

This patch changes the __ftrace that is called without the check into a
new api for the other tracers to use, called "trace_function". The other
tracers use this interface instead when the trace CPU buffer is already
disabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c              |  8 ++++----
 kernel/trace/trace.h              |  5 +++++
 kernel/trace/trace_irqsoff.c      | 10 +++++-----
 kernel/trace/trace_sched_wakeup.c |  5 +++--
 4 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d041578affd0..9022c357032a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -641,8 +641,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
 }
 
 notrace void
-__ftrace(struct trace_array *tr, struct trace_array_cpu *data,
-	 unsigned long ip, unsigned long parent_ip, unsigned long flags)
+trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+	       unsigned long ip, unsigned long parent_ip, unsigned long flags)
 {
 	struct trace_entry *entry;
 	unsigned long irq_flags;
@@ -664,7 +664,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
        unsigned long ip, unsigned long parent_ip, unsigned long flags)
 {
 	if (likely(!atomic_read(&data->disabled)))
-		__ftrace(tr, data, ip, parent_ip, flags);
+		trace_function(tr, data, ip, parent_ip, flags);
 }
 
 notrace void
@@ -730,7 +730,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1))
-		__ftrace(tr, data, ip, parent_ip, flags);
+		trace_function(tr, data, ip, parent_ip, flags);
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7bdfef35c05a..faf9f67246ac 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -169,6 +169,11 @@ void trace_special(struct trace_array *tr,
 		   unsigned long arg1,
 		   unsigned long arg2,
 		   unsigned long arg3);
+void trace_function(struct trace_array *tr,
+		    struct trace_array_cpu *data,
+		    unsigned long ip,
+		    unsigned long parent_ip,
+		    unsigned long flags);
 
 void tracing_start_function_trace(void);
 void tracing_stop_function_trace(void);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d2a6e6f1ad2d..3269f4ff5172 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1))
-		ftrace(tr, data, ip, parent_ip, flags);
+		trace_function(tr, data, ip, parent_ip, flags);
 
 	atomic_dec(&data->disabled);
 }
@@ -150,7 +150,7 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(delta))
 		goto out_unlock;
 
-	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
+	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
 
 	latency = nsecs_to_usecs(delta);
 
@@ -188,7 +188,7 @@ out:
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_reset(data);
-	ftrace(tr, data, CALLER_ADDR0, parent_ip, flags);
+	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
 }
 
 static inline void notrace
@@ -221,7 +221,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 
 	local_save_flags(flags);
 
-	ftrace(tr, data, ip, parent_ip, flags);
+	trace_function(tr, data, ip, parent_ip, flags);
 
 	__get_cpu_var(tracing_cpu) = 1;
 
@@ -254,7 +254,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 
 	atomic_inc(&data->disabled);
 	local_save_flags(flags);
-	ftrace(tr, data, ip, parent_ip, flags);
+	trace_function(tr, data, ip, parent_ip, flags);
 	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index b7df825c3af9..3549e4154f1f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -85,7 +85,7 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
-	ftrace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+	trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
 
 	/*
 	 * usecs conversion is slow so we try to delay the conversion
@@ -192,7 +192,8 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
 	local_save_flags(flags);
 
 	tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
-	ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags);
+	trace_function(tr, tr->data[wakeup_cpu],
+		       CALLER_ADDR1, CALLER_ADDR2, flags);
 
 out_locked:
 	spin_unlock(&wakeup_lock);
-- 
cgit v1.2.3


From 9ff9cdb2d3b0971f89e899b3420aadd91bddc215 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:50 +0200
Subject: ftrace: cleanups

clean up recent code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c         | 5 +++--
 kernel/trace/trace_irqsoff.c  | 2 +-
 kernel/trace/trace_selftest.c | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ff42345dd78e..425b1fec3d83 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -197,8 +197,8 @@ static int ftrace_record_suspend;
 
 static struct dyn_ftrace *ftrace_free_records;
 
-static inline int
-notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key)
+static inline int notrace
+ftrace_ip_in_hash(unsigned long ip, unsigned long key)
 {
 	struct dyn_ftrace *p;
 	struct hlist_node *t;
@@ -1249,6 +1249,7 @@ static int __init notrace ftrace_dynamic_init(void)
 	int ret;
 
 	addr = (unsigned long)ftrace_record_ip;
+
 	stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
 
 	/* ftrace_dyn_arch_init places the return code in addr */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3269f4ff5172..2ac0d09db6fb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -309,7 +309,7 @@ void trace_softirqs_off(unsigned long ip)
 {
 }
 
-inline void print_irqtrace_events(struct task_struct *curr)
+inline notrace void print_irqtrace_events(struct task_struct *curr)
 {
 }
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 85715b86a342..546307de6e3d 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,7 +3,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 
-static inline int trace_valid_entry(struct trace_entry *entry)
+static notrace inline int trace_valid_entry(struct trace_entry *entry)
 {
 	switch (entry->type) {
 	case TRACE_FN:
-- 
cgit v1.2.3


From caf8cdebfb6c1cff50ea8077f1a07c2333d6d1fd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:50 +0200
Subject: ftrace: remove address of function names

PowerPC is very fragile when it comes to use of function names
and function addresses.  ftrace needs to either use all function
addresses or function names (i.e. my_func as suppose to &my_func).

This patch chooses to use the names and not the addresses, and
makes ftrace consistent.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 425b1fec3d83..57350cbd1f61 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -326,8 +326,8 @@ ftrace_record_ip(unsigned long ip)
 		preempt_enable_notrace();
 }
 
-#define FTRACE_ADDR ((long)(&ftrace_caller))
-#define MCOUNT_ADDR ((long)(&mcount))
+#define FTRACE_ADDR ((long)(ftrace_caller))
+#define MCOUNT_ADDR ((long)(mcount))
 
 static void notrace
 __ftrace_replace_code(struct dyn_ftrace *rec,
-- 
cgit v1.2.3


From b53dde9d34f2df396540988ebc65c33400f57b04 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:51 +0200
Subject: ftrace: disable -pg for the tracer itself

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 3fec653d6533..c25a6cd6a529 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,3 +1,11 @@
+
+# Do not instrument the tracer itself:
+
+ifdef CONFIG_FTRACE
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+endif
+
 obj-$(CONFIG_FTRACE) += libftrace.o
 
 obj-$(CONFIG_TRACING) += trace.o
-- 
cgit v1.2.3


From e309b41dd65aa953f86765eeeecc941d8e1e8b8f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:51 +0200
Subject: ftrace: remove notrace

now that we have a kbuild method for notrace, no need to pollute the
C code with the annotations.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c             | 66 +++++++++++++--------------
 kernel/trace/trace.c              | 94 +++++++++++++++++++--------------------
 kernel/trace/trace.h              |  6 +--
 kernel/trace/trace_functions.c    | 12 ++---
 kernel/trace/trace_irqsoff.c      | 40 ++++++++---------
 kernel/trace/trace_sched_switch.c | 12 ++---
 kernel/trace/trace_sched_wakeup.c | 28 ++++++------
 kernel/trace/trace_selftest.c     |  2 +-
 8 files changed, 130 insertions(+), 130 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 57350cbd1f61..281d97a3208c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -53,7 +53,7 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 /* mcount is defined per arch in assembly */
 EXPORT_SYMBOL(mcount);
 
-notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_ops *op = ftrace_list;
 
@@ -79,7 +79,7 @@ void clear_ftrace_function(void)
 	ftrace_trace_function = ftrace_stub;
 }
 
-static int notrace __register_ftrace_function(struct ftrace_ops *ops)
+static int __register_ftrace_function(struct ftrace_ops *ops)
 {
 	/* Should never be called by interrupts */
 	spin_lock(&ftrace_lock);
@@ -110,7 +110,7 @@ static int notrace __register_ftrace_function(struct ftrace_ops *ops)
 	return 0;
 }
 
-static int notrace __unregister_ftrace_function(struct ftrace_ops *ops)
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	struct ftrace_ops **p;
 	int ret = 0;
@@ -197,7 +197,7 @@ static int ftrace_record_suspend;
 
 static struct dyn_ftrace *ftrace_free_records;
 
-static inline int notrace
+static inline int
 ftrace_ip_in_hash(unsigned long ip, unsigned long key)
 {
 	struct dyn_ftrace *p;
@@ -214,13 +214,13 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key)
 	return found;
 }
 
-static inline void notrace
+static inline void
 ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
 {
 	hlist_add_head(&node->node, &ftrace_hash[key]);
 }
 
-static notrace void ftrace_free_rec(struct dyn_ftrace *rec)
+static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
 	/* no locking, only called from kstop_machine */
 
@@ -229,7 +229,7 @@ static notrace void ftrace_free_rec(struct dyn_ftrace *rec)
 	rec->flags |= FTRACE_FL_FREE;
 }
 
-static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
+static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
 	struct dyn_ftrace *rec;
 
@@ -259,7 +259,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 	return &ftrace_pages->records[ftrace_pages->index++];
 }
 
-static void notrace
+static void
 ftrace_record_ip(unsigned long ip)
 {
 	struct dyn_ftrace *node;
@@ -329,7 +329,7 @@ ftrace_record_ip(unsigned long ip)
 #define FTRACE_ADDR ((long)(ftrace_caller))
 #define MCOUNT_ADDR ((long)(mcount))
 
-static void notrace
+static void
 __ftrace_replace_code(struct dyn_ftrace *rec,
 		      unsigned char *old, unsigned char *new, int enable)
 {
@@ -405,7 +405,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 	}
 }
 
-static void notrace ftrace_replace_code(int enable)
+static void ftrace_replace_code(int enable)
 {
 	unsigned char *new = NULL, *old = NULL;
 	struct dyn_ftrace *rec;
@@ -430,7 +430,7 @@ static void notrace ftrace_replace_code(int enable)
 	}
 }
 
-static notrace void ftrace_shutdown_replenish(void)
+static void ftrace_shutdown_replenish(void)
 {
 	if (ftrace_pages->next)
 		return;
@@ -439,7 +439,7 @@ static notrace void ftrace_shutdown_replenish(void)
 	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
 }
 
-static notrace void
+static void
 ftrace_code_disable(struct dyn_ftrace *rec)
 {
 	unsigned long ip;
@@ -458,7 +458,7 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 	}
 }
 
-static int notrace __ftrace_modify_code(void *data)
+static int __ftrace_modify_code(void *data)
 {
 	unsigned long addr;
 	int *command = data;
@@ -482,14 +482,14 @@ static int notrace __ftrace_modify_code(void *data)
 	return 0;
 }
 
-static void notrace ftrace_run_update_code(int command)
+static void ftrace_run_update_code(int command)
 {
 	stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
 }
 
 static ftrace_func_t saved_ftrace_func;
 
-static void notrace ftrace_startup(void)
+static void ftrace_startup(void)
 {
 	int command = 0;
 
@@ -514,7 +514,7 @@ static void notrace ftrace_startup(void)
 	mutex_unlock(&ftraced_lock);
 }
 
-static void notrace ftrace_shutdown(void)
+static void ftrace_shutdown(void)
 {
 	int command = 0;
 
@@ -539,7 +539,7 @@ static void notrace ftrace_shutdown(void)
 	mutex_unlock(&ftraced_lock);
 }
 
-static void notrace ftrace_startup_sysctl(void)
+static void ftrace_startup_sysctl(void)
 {
 	int command = FTRACE_ENABLE_MCOUNT;
 
@@ -557,7 +557,7 @@ static void notrace ftrace_startup_sysctl(void)
 	mutex_unlock(&ftraced_lock);
 }
 
-static void notrace ftrace_shutdown_sysctl(void)
+static void ftrace_shutdown_sysctl(void)
 {
 	int command = FTRACE_DISABLE_MCOUNT;
 
@@ -577,7 +577,7 @@ static cycle_t		ftrace_update_time;
 static unsigned long	ftrace_update_cnt;
 unsigned long		ftrace_update_tot_cnt;
 
-static int notrace __ftrace_update_code(void *ignore)
+static int __ftrace_update_code(void *ignore)
 {
 	struct dyn_ftrace *p;
 	struct hlist_head head;
@@ -618,7 +618,7 @@ static int notrace __ftrace_update_code(void *ignore)
 	return 0;
 }
 
-static void notrace ftrace_update_code(void)
+static void ftrace_update_code(void)
 {
 	if (unlikely(ftrace_disabled))
 		return;
@@ -626,7 +626,7 @@ static void notrace ftrace_update_code(void)
 	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
 }
 
-static int notrace ftraced(void *ignore)
+static int ftraced(void *ignore)
 {
 	unsigned long usecs;
 
@@ -733,7 +733,7 @@ struct ftrace_iterator {
 	unsigned		filtered;
 };
 
-static void notrace *
+static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
@@ -806,7 +806,7 @@ static struct seq_operations show_ftrace_seq_ops = {
 	.show = t_show,
 };
 
-static int notrace
+static int
 ftrace_avail_open(struct inode *inode, struct file *file)
 {
 	struct ftrace_iterator *iter;
@@ -845,7 +845,7 @@ int ftrace_avail_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static void notrace ftrace_filter_reset(void)
+static void ftrace_filter_reset(void)
 {
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
@@ -867,7 +867,7 @@ static void notrace ftrace_filter_reset(void)
 	preempt_enable();
 }
 
-static int notrace
+static int
 ftrace_filter_open(struct inode *inode, struct file *file)
 {
 	struct ftrace_iterator *iter;
@@ -903,7 +903,7 @@ ftrace_filter_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static ssize_t notrace
+static ssize_t
 ftrace_filter_read(struct file *file, char __user *ubuf,
 		       size_t cnt, loff_t *ppos)
 {
@@ -913,7 +913,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf,
 		return -EPERM;
 }
 
-static loff_t notrace
+static loff_t
 ftrace_filter_lseek(struct file *file, loff_t offset, int origin)
 {
 	loff_t ret;
@@ -933,7 +933,7 @@ enum {
 	MATCH_END_ONLY,
 };
 
-static void notrace
+static void
 ftrace_match(unsigned char *buff, int len)
 {
 	char str[KSYM_SYMBOL_LEN];
@@ -1002,7 +1002,7 @@ ftrace_match(unsigned char *buff, int len)
 	preempt_enable();
 }
 
-static ssize_t notrace
+static ssize_t
 ftrace_filter_write(struct file *file, const char __user *ubuf,
 		    size_t cnt, loff_t *ppos)
 {
@@ -1094,7 +1094,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
  * Filters denote which functions should be enabled when tracing is enabled.
  * If @buf is NULL and reset is set, all functions will be enabled for tracing.
  */
-notrace void ftrace_set_filter(unsigned char *buf, int len, int reset)
+void ftrace_set_filter(unsigned char *buf, int len, int reset)
 {
 	if (unlikely(ftrace_disabled))
 		return;
@@ -1107,7 +1107,7 @@ notrace void ftrace_set_filter(unsigned char *buf, int len, int reset)
 	mutex_unlock(&ftrace_filter_lock);
 }
 
-static int notrace
+static int
 ftrace_filter_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
@@ -1242,7 +1242,7 @@ static __init int ftrace_init_debugfs(void)
 
 fs_initcall(ftrace_init_debugfs);
 
-static int __init notrace ftrace_dynamic_init(void)
+static int __init ftrace_dynamic_init(void)
 {
 	struct task_struct *p;
 	unsigned long addr;
@@ -1352,7 +1352,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 	return ret;
 }
 
-notrace int
+int
 ftrace_enable_sysctl(struct ctl_table *table, int write,
 		     struct file *file, void __user *buffer, size_t *lenp,
 		     loff_t *ppos)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9022c357032a..f5898051fdd9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -35,7 +35,7 @@ unsigned long __read_mostly	tracing_thresh;
 
 static int tracing_disabled = 1;
 
-static long notrace
+static long
 ns2usecs(cycle_t nsec)
 {
 	nsec += 500;
@@ -43,7 +43,7 @@ ns2usecs(cycle_t nsec)
 	return nsec;
 }
 
-notrace cycle_t ftrace_now(int cpu)
+cycle_t ftrace_now(int cpu)
 {
 	return cpu_clock(cpu);
 }
@@ -135,7 +135,7 @@ static DEFINE_SPINLOCK(ftrace_max_lock);
  * structure. (this way the maximum trace is permanently saved,
  * for later retrieval via /debugfs/tracing/latency_trace)
  */
-static notrace void
+static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct trace_array_cpu *data = tr->data[cpu];
@@ -184,7 +184,7 @@ void *head_page(struct trace_array_cpu *data)
 	return page_address(page);
 }
 
-static notrace int
+static int
 trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
@@ -207,7 +207,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	return len;
 }
 
-static notrace int
+static int
 trace_seq_puts(struct trace_seq *s, const char *str)
 {
 	int len = strlen(str);
@@ -221,7 +221,7 @@ trace_seq_puts(struct trace_seq *s, const char *str)
 	return len;
 }
 
-static notrace int
+static int
 trace_seq_putc(struct trace_seq *s, unsigned char c)
 {
 	if (s->len >= (PAGE_SIZE - 1))
@@ -232,7 +232,7 @@ trace_seq_putc(struct trace_seq *s, unsigned char c)
 	return 1;
 }
 
-static notrace int
+static int
 trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
 {
 	if (len > ((PAGE_SIZE - 1) - s->len))
@@ -246,7 +246,7 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
 
 #define HEX_CHARS 17
 
-static notrace int
+static int
 trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
 {
 	unsigned char hex[HEX_CHARS];
@@ -285,13 +285,13 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
 	return trace_seq_putmem(s, hex, j);
 }
 
-static notrace void
+static void
 trace_seq_reset(struct trace_seq *s)
 {
 	s->len = 0;
 }
 
-static notrace void
+static void
 trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
 	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
@@ -302,7 +302,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
 	trace_seq_reset(s);
 }
 
-notrace static void
+static void
 flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
 {
 	struct list_head flip_pages;
@@ -323,7 +323,7 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
 	check_pages(tr2);
 }
 
-notrace void
+void
 update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct trace_array_cpu *data;
@@ -348,7 +348,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
  * @tsk - task with the latency
  * @cpu - the cpu of the buffer to copy.
  */
-notrace void
+void
 update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
 	struct trace_array_cpu *data = tr->data[cpu];
@@ -471,7 +471,7 @@ void unregister_tracer(struct tracer *type)
 	mutex_unlock(&trace_types_lock);
 }
 
-notrace void tracing_reset(struct trace_array_cpu *data)
+void tracing_reset(struct trace_array_cpu *data)
 {
 	data->trace_idx = 0;
 	data->trace_head = data->trace_tail = head_page(data);
@@ -494,9 +494,9 @@ static void trace_init_cmdlines(void)
 	cmdline_idx = 0;
 }
 
-notrace void trace_stop_cmdline_recording(void);
+void trace_stop_cmdline_recording(void);
 
-static notrace void trace_save_cmdline(struct task_struct *tsk)
+static void trace_save_cmdline(struct task_struct *tsk)
 {
 	unsigned map;
 	unsigned idx;
@@ -531,7 +531,7 @@ static notrace void trace_save_cmdline(struct task_struct *tsk)
 	spin_unlock(&trace_cmdline_lock);
 }
 
-static notrace char *trace_find_cmdline(int pid)
+static char *trace_find_cmdline(int pid)
 {
 	char *cmdline = "<...>";
 	unsigned map;
@@ -552,7 +552,7 @@ static notrace char *trace_find_cmdline(int pid)
 	return cmdline;
 }
 
-notrace void tracing_record_cmdline(struct task_struct *tsk)
+void tracing_record_cmdline(struct task_struct *tsk)
 {
 	if (atomic_read(&trace_record_cmdline_disabled))
 		return;
@@ -560,7 +560,7 @@ notrace void tracing_record_cmdline(struct task_struct *tsk)
 	trace_save_cmdline(tsk);
 }
 
-static inline notrace struct list_head *
+static inline struct list_head *
 trace_next_list(struct trace_array_cpu *data, struct list_head *next)
 {
 	/*
@@ -574,7 +574,7 @@ trace_next_list(struct trace_array_cpu *data, struct list_head *next)
 	return next;
 }
 
-static inline notrace void *
+static inline void *
 trace_next_page(struct trace_array_cpu *data, void *addr)
 {
 	struct list_head *next;
@@ -588,7 +588,7 @@ trace_next_page(struct trace_array_cpu *data, void *addr)
 	return page_address(page);
 }
 
-static inline notrace struct trace_entry *
+static inline struct trace_entry *
 tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
 {
 	unsigned long idx, idx_next;
@@ -623,7 +623,7 @@ tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
 	return entry;
 }
 
-static inline notrace void
+static inline void
 tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
 {
 	struct task_struct *tsk = current;
@@ -640,7 +640,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
 		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
 }
 
-notrace void
+void
 trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags)
 {
@@ -659,7 +659,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 		wake_up (&trace_wait);
 }
 
-notrace void
+void
 ftrace(struct trace_array *tr, struct trace_array_cpu *data,
        unsigned long ip, unsigned long parent_ip, unsigned long flags)
 {
@@ -667,7 +667,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 		trace_function(tr, data, ip, parent_ip, flags);
 }
 
-notrace void
+void
 trace_special(struct trace_array *tr, struct trace_array_cpu *data,
 	      unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
@@ -687,7 +687,7 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data,
 		wake_up (&trace_wait);
 }
 
-notrace void
+void
 tracing_sched_switch_trace(struct trace_array *tr,
 			   struct trace_array_cpu *data,
 			   struct task_struct *prev, struct task_struct *next,
@@ -712,7 +712,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 }
 
 #ifdef CONFIG_FTRACE
-static notrace void
+static void
 function_trace_call(unsigned long ip, unsigned long parent_ip)
 {
 	struct trace_array *tr = &global_trace;
@@ -741,12 +741,12 @@ static struct ftrace_ops trace_ops __read_mostly =
 	.func = function_trace_call,
 };
 
-notrace void tracing_start_function_trace(void)
+void tracing_start_function_trace(void)
 {
 	register_ftrace_function(&trace_ops);
 }
 
-notrace void tracing_stop_function_trace(void)
+void tracing_stop_function_trace(void)
 {
 	unregister_ftrace_function(&trace_ops);
 }
@@ -786,7 +786,7 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
 	return &array[iter->next_page_idx[cpu]];
 }
 
-static struct trace_entry * notrace
+static struct trace_entry *
 find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 {
 	struct trace_array *tr = iter->tr;
@@ -813,7 +813,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 	return next;
 }
 
-static notrace void trace_iterator_increment(struct trace_iterator *iter)
+static void trace_iterator_increment(struct trace_iterator *iter)
 {
 	iter->idx++;
 	iter->next_idx[iter->cpu]++;
@@ -828,7 +828,7 @@ static notrace void trace_iterator_increment(struct trace_iterator *iter)
 	}
 }
 
-static notrace void trace_consume(struct trace_iterator *iter)
+static void trace_consume(struct trace_iterator *iter)
 {
 	struct trace_array_cpu *data = iter->tr->data[iter->cpu];
 
@@ -844,7 +844,7 @@ static notrace void trace_consume(struct trace_iterator *iter)
 		data->trace_idx = 0;
 }
 
-static notrace void *find_next_entry_inc(struct trace_iterator *iter)
+static void *find_next_entry_inc(struct trace_iterator *iter)
 {
 	struct trace_entry *next;
 	int next_cpu = -1;
@@ -863,7 +863,7 @@ static notrace void *find_next_entry_inc(struct trace_iterator *iter)
 	return next ? iter : NULL;
 }
 
-static notrace void *s_next(struct seq_file *m, void *v, loff_t *pos)
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct trace_iterator *iter = m->private;
 	void *last_ent = iter->ent;
@@ -978,7 +978,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 # define IP_FMT "%016lx"
 #endif
 
-static notrace int
+static int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 {
 	int ret;
@@ -999,7 +999,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
-static notrace void print_lat_help_header(struct seq_file *m)
+static void print_lat_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#                _------=> CPU#            \n");
 	seq_puts(m, "#               / _-----=> irqs-off        \n");
@@ -1012,14 +1012,14 @@ static notrace void print_lat_help_header(struct seq_file *m)
 	seq_puts(m, "#     \\   /    |||||   \\   |   /           \n");
 }
 
-static notrace void print_func_help_header(struct seq_file *m)
+static void print_func_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n");
 	seq_puts(m, "#              | |      |          |         |\n");
 }
 
 
-static notrace void
+static void
 print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 {
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1090,7 +1090,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	seq_puts(m, "\n");
 }
 
-static notrace void
+static void
 lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
@@ -1127,7 +1127,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 
 unsigned long preempt_mark_thresh = 100;
 
-static notrace void
+static void
 lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
 		    unsigned long rel_usecs)
 {
@@ -1142,7 +1142,7 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
-static notrace int
+static int
 print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1206,7 +1206,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	return 1;
 }
 
-static notrace int print_trace_fmt(struct trace_iterator *iter)
+static int print_trace_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1279,7 +1279,7 @@ static notrace int print_trace_fmt(struct trace_iterator *iter)
 	return 1;
 }
 
-static notrace int print_raw_fmt(struct trace_iterator *iter)
+static int print_raw_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
@@ -1336,7 +1336,7 @@ do {							\
 		return 0;				\
 } while (0)
 
-static notrace int print_hex_fmt(struct trace_iterator *iter)
+static int print_hex_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	unsigned char newline = '\n';
@@ -1375,7 +1375,7 @@ static notrace int print_hex_fmt(struct trace_iterator *iter)
 	return 1;
 }
 
-static notrace int print_bin_fmt(struct trace_iterator *iter)
+static int print_bin_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
@@ -1475,7 +1475,7 @@ static struct seq_operations tracer_seq_ops = {
 	.show		= s_show,
 };
 
-static struct trace_iterator notrace *
+static struct trace_iterator *
 __tracing_open(struct inode *inode, struct file *file, int *ret)
 {
 	struct trace_iterator *iter;
@@ -1572,7 +1572,7 @@ static int tracing_lt_open(struct inode *inode, struct file *file)
 }
 
 
-static notrace void *
+static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct tracer *t = m->private;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index faf9f67246ac..2b7352bf1ce6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -150,7 +150,7 @@ struct trace_iterator {
 	long			idx;
 };
 
-void notrace tracing_reset(struct trace_array_cpu *data);
+void tracing_reset(struct trace_array_cpu *data);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *tracing_init_dentry(void);
 void ftrace(struct trace_array *tr,
@@ -189,10 +189,10 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 
-extern notrace cycle_t ftrace_now(int cpu);
+extern cycle_t ftrace_now(int cpu);
 
 #ifdef CONFIG_SCHED_TRACER
-extern void notrace
+extern void
 wakeup_sched_switch(struct task_struct *prev, struct task_struct *next);
 #else
 static inline void
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 69a0eb00a0a5..4165d34bd28a 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,7 +16,7 @@
 
 #include "trace.h"
 
-static notrace void function_reset(struct trace_array *tr)
+static void function_reset(struct trace_array *tr)
 {
 	int cpu;
 
@@ -26,30 +26,30 @@ static notrace void function_reset(struct trace_array *tr)
 		tracing_reset(tr->data[cpu]);
 }
 
-static notrace void start_function_trace(struct trace_array *tr)
+static void start_function_trace(struct trace_array *tr)
 {
 	function_reset(tr);
 	tracing_start_function_trace();
 }
 
-static notrace void stop_function_trace(struct trace_array *tr)
+static void stop_function_trace(struct trace_array *tr)
 {
 	tracing_stop_function_trace();
 }
 
-static notrace void function_trace_init(struct trace_array *tr)
+static void function_trace_init(struct trace_array *tr)
 {
 	if (tr->ctrl)
 		start_function_trace(tr);
 }
 
-static notrace void function_trace_reset(struct trace_array *tr)
+static void function_trace_reset(struct trace_array *tr)
 {
 	if (tr->ctrl)
 		stop_function_trace(tr);
 }
 
-static notrace void function_trace_ctrl_update(struct trace_array *tr)
+static void function_trace_ctrl_update(struct trace_array *tr)
 {
 	if (tr->ctrl)
 		start_function_trace(tr);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2ac0d09db6fb..7a4dc014b8ab 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -33,7 +33,7 @@ enum {
 static int trace_type __read_mostly;
 
 #ifdef CONFIG_PREEMPT_TRACER
-static inline int notrace
+static inline int
 preempt_trace(void)
 {
 	return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
@@ -43,7 +43,7 @@ preempt_trace(void)
 #endif
 
 #ifdef CONFIG_IRQSOFF_TRACER
-static inline int notrace
+static inline int
 irq_trace(void)
 {
 	return ((trace_type & TRACER_IRQS_OFF) &&
@@ -67,7 +67,7 @@ static __cacheline_aligned_in_smp	unsigned long max_sequence;
 /*
  * irqsoff uses its own tracer function to keep the overhead down:
  */
-static void notrace
+static void
 irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 {
 	struct trace_array *tr = irqsoff_trace;
@@ -109,7 +109,7 @@ static struct ftrace_ops trace_ops __read_mostly =
 /*
  * Should this new latency be reported/recorded?
  */
-static int notrace report_latency(cycle_t delta)
+static int report_latency(cycle_t delta)
 {
 	if (tracing_thresh) {
 		if (delta < tracing_thresh)
@@ -121,7 +121,7 @@ static int notrace report_latency(cycle_t delta)
 	return 1;
 }
 
-static void notrace
+static void
 check_critical_timing(struct trace_array *tr,
 		      struct trace_array_cpu *data,
 		      unsigned long parent_ip,
@@ -191,7 +191,7 @@ out:
 	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
 }
 
-static inline void notrace
+static inline void
 start_critical_timing(unsigned long ip, unsigned long parent_ip)
 {
 	int cpu;
@@ -228,7 +228,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	atomic_dec(&data->disabled);
 }
 
-static inline void notrace
+static inline void
 stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 {
 	int cpu;
@@ -261,13 +261,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 }
 
 /* start and stop critical timings used to for stoppage (in idle) */
-void notrace start_critical_timings(void)
+void start_critical_timings(void)
 {
 	if (preempt_trace() || irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 
-void notrace stop_critical_timings(void)
+void stop_critical_timings(void)
 {
 	if (preempt_trace() || irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
@@ -275,13 +275,13 @@ void notrace stop_critical_timings(void)
 
 #ifdef CONFIG_IRQSOFF_TRACER
 #ifdef CONFIG_PROVE_LOCKING
-void notrace time_hardirqs_on(unsigned long a0, unsigned long a1)
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
-void notrace time_hardirqs_off(unsigned long a0, unsigned long a1)
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(a0, a1);
@@ -309,35 +309,35 @@ void trace_softirqs_off(unsigned long ip)
 {
 }
 
-inline notrace void print_irqtrace_events(struct task_struct *curr)
+inline void print_irqtrace_events(struct task_struct *curr)
 {
 }
 
 /*
  * We are only interested in hardirq on/off events:
  */
-void notrace trace_hardirqs_on(void)
+void trace_hardirqs_on(void)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL(trace_hardirqs_on);
 
-void notrace trace_hardirqs_off(void)
+void trace_hardirqs_off(void)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
 
-void notrace trace_hardirqs_on_caller(unsigned long caller_addr)
+void trace_hardirqs_on_caller(unsigned long caller_addr)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
-void notrace trace_hardirqs_off_caller(unsigned long caller_addr)
+void trace_hardirqs_off_caller(unsigned long caller_addr)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, caller_addr);
@@ -348,12 +348,12 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
 #endif /*  CONFIG_IRQSOFF_TRACER */
 
 #ifdef CONFIG_PREEMPT_TRACER
-void notrace trace_preempt_on(unsigned long a0, unsigned long a1)
+void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
 	stop_critical_timing(a0, a1);
 }
 
-void notrace trace_preempt_off(unsigned long a0, unsigned long a1)
+void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
 	start_critical_timing(a0, a1);
 }
@@ -395,14 +395,14 @@ static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
 		stop_irqsoff_tracer(tr);
 }
 
-static void notrace irqsoff_tracer_open(struct trace_iterator *iter)
+static void irqsoff_tracer_open(struct trace_iterator *iter)
 {
 	/* stop the trace while dumping */
 	if (iter->tr->ctrl)
 		stop_irqsoff_tracer(iter->tr);
 }
 
-static void notrace irqsoff_tracer_close(struct trace_iterator *iter)
+static void irqsoff_tracer_close(struct trace_iterator *iter)
 {
 	if (iter->tr->ctrl)
 		start_irqsoff_tracer(iter->tr);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8d656672da93..b738eaca1dbe 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -17,7 +17,7 @@
 static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
 
-static void notrace
+static void
 ctx_switch_func(struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array *tr = ctx_trace;
@@ -57,7 +57,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
 	wakeup_sched_switch(prev, next);
 }
 
-static notrace void sched_switch_reset(struct trace_array *tr)
+static void sched_switch_reset(struct trace_array *tr)
 {
 	int cpu;
 
@@ -67,18 +67,18 @@ static notrace void sched_switch_reset(struct trace_array *tr)
 		tracing_reset(tr->data[cpu]);
 }
 
-static notrace void start_sched_trace(struct trace_array *tr)
+static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
 	tracer_enabled = 1;
 }
 
-static notrace void stop_sched_trace(struct trace_array *tr)
+static void stop_sched_trace(struct trace_array *tr)
 {
 	tracer_enabled = 0;
 }
 
-static notrace void sched_switch_trace_init(struct trace_array *tr)
+static void sched_switch_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
 
@@ -86,7 +86,7 @@ static notrace void sched_switch_trace_init(struct trace_array *tr)
 		start_sched_trace(tr);
 }
 
-static notrace void sched_switch_trace_reset(struct trace_array *tr)
+static void sched_switch_trace_reset(struct trace_array *tr)
 {
 	if (tr->ctrl)
 		stop_sched_trace(tr);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3549e4154f1f..662679c78b66 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -27,12 +27,12 @@ static unsigned			wakeup_prio = -1;
 
 static DEFINE_SPINLOCK(wakeup_lock);
 
-static void notrace __wakeup_reset(struct trace_array *tr);
+static void __wakeup_reset(struct trace_array *tr);
 
 /*
  * Should this new latency be reported/recorded?
  */
-static int notrace report_latency(cycle_t delta)
+static int report_latency(cycle_t delta)
 {
 	if (tracing_thresh) {
 		if (delta < tracing_thresh)
@@ -44,7 +44,7 @@ static int notrace report_latency(cycle_t delta)
 	return 1;
 }
 
-void notrace
+void
 wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
 {
 	unsigned long latency = 0, t0 = 0, t1 = 0;
@@ -126,7 +126,7 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
-static void notrace __wakeup_reset(struct trace_array *tr)
+static void __wakeup_reset(struct trace_array *tr)
 {
 	struct trace_array_cpu *data;
 	int cpu;
@@ -147,7 +147,7 @@ static void notrace __wakeup_reset(struct trace_array *tr)
 	wakeup_task = NULL;
 }
 
-static void notrace wakeup_reset(struct trace_array *tr)
+static void wakeup_reset(struct trace_array *tr)
 {
 	unsigned long flags;
 
@@ -156,7 +156,7 @@ static void notrace wakeup_reset(struct trace_array *tr)
 	spin_unlock_irqrestore(&wakeup_lock, flags);
 }
 
-static notrace void
+static void
 wakeup_check_start(struct trace_array *tr, struct task_struct *p,
 		   struct task_struct *curr)
 {
@@ -201,7 +201,7 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
-notrace void
+void
 ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
 {
 	if (likely(!tracer_enabled))
@@ -210,7 +210,7 @@ ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
 	wakeup_check_start(wakeup_trace, wakee, curr);
 }
 
-notrace void
+void
 ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
 {
 	if (likely(!tracer_enabled))
@@ -219,7 +219,7 @@ ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
 	wakeup_check_start(wakeup_trace, wakee, curr);
 }
 
-static notrace void start_wakeup_tracer(struct trace_array *tr)
+static void start_wakeup_tracer(struct trace_array *tr)
 {
 	wakeup_reset(tr);
 
@@ -237,12 +237,12 @@ static notrace void start_wakeup_tracer(struct trace_array *tr)
 	return;
 }
 
-static notrace void stop_wakeup_tracer(struct trace_array *tr)
+static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
 }
 
-static notrace void wakeup_tracer_init(struct trace_array *tr)
+static void wakeup_tracer_init(struct trace_array *tr)
 {
 	wakeup_trace = tr;
 
@@ -250,7 +250,7 @@ static notrace void wakeup_tracer_init(struct trace_array *tr)
 		start_wakeup_tracer(tr);
 }
 
-static notrace void wakeup_tracer_reset(struct trace_array *tr)
+static void wakeup_tracer_reset(struct trace_array *tr)
 {
 	if (tr->ctrl) {
 		stop_wakeup_tracer(tr);
@@ -267,14 +267,14 @@ static void wakeup_tracer_ctrl_update(struct trace_array *tr)
 		stop_wakeup_tracer(tr);
 }
 
-static void notrace wakeup_tracer_open(struct trace_iterator *iter)
+static void wakeup_tracer_open(struct trace_iterator *iter)
 {
 	/* stop the trace while dumping */
 	if (iter->tr->ctrl)
 		stop_wakeup_tracer(iter->tr);
 }
 
-static void notrace wakeup_tracer_close(struct trace_iterator *iter)
+static void wakeup_tracer_close(struct trace_iterator *iter)
 {
 	/* forget about any processes we were recording */
 	if (iter->tr->ctrl)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 546307de6e3d..85715b86a342 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,7 +3,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 
-static notrace inline int trace_valid_entry(struct trace_entry *entry)
+static inline int trace_valid_entry(struct trace_entry *entry)
 {
 	switch (entry->type) {
 	case TRACE_FN:
-- 
cgit v1.2.3


From 57422797dc009fc83766bcf230d29dbe6e08e21e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:51 +0200
Subject: ftrace: add wakeup events to sched tracer

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c              | 42 +++++++++++++++++++++++++++++++++++----
 kernel/trace/trace.h              | 12 +++++++++++
 kernel/trace/trace_sched_switch.c | 36 +++++++++++++++++++++++++++++++++
 kernel/trace/trace_sched_wakeup.c |  2 +-
 kernel/trace/trace_selftest.c     |  1 +
 5 files changed, 88 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f5898051fdd9..192c1354a7e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -57,7 +57,7 @@ static struct trace_array	max_tr;
 static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
 
 static int			tracer_enabled = 1;
-static unsigned long		trace_nr_entries = 16384UL;
+static unsigned long		trace_nr_entries = 65536UL;
 
 static struct tracer		*trace_types __read_mostly;
 static struct tracer		*current_trace __read_mostly;
@@ -87,6 +87,7 @@ enum trace_type {
 
 	TRACE_FN,
 	TRACE_CTX,
+	TRACE_WAKE,
 	TRACE_SPECIAL,
 
 	__TRACE_LAST_TYPE
@@ -711,6 +712,30 @@ tracing_sched_switch_trace(struct trace_array *tr,
 		wake_up (&trace_wait);
 }
 
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+			   struct trace_array_cpu *data,
+			   struct task_struct *wakee, struct task_struct *curr,
+			   unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&data->lock, irq_flags);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_WAKE;
+	entry->ctx.prev_pid	= curr->pid;
+	entry->ctx.prev_prio	= curr->prio;
+	entry->ctx.prev_state	= curr->state;
+	entry->ctx.next_pid	= wakee->pid;
+	entry->ctx.next_prio	= wakee->prio;
+	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	if (!(trace_flags & TRACE_ITER_BLOCK))
+		wake_up(&trace_wait);
+}
+
 #ifdef CONFIG_FTRACE
 static void
 function_trace_call(unsigned long ip, unsigned long parent_ip)
@@ -1183,13 +1208,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		trace_seq_puts(s, ")\n");
 		break;
 	case TRACE_CTX:
+	case TRACE_WAKE:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
 		comm = trace_find_cmdline(entry->ctx.next_pid);
-		trace_seq_printf(s, " %d:%d:%c --> %d:%d %s\n",
+		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d %s\n",
 				 entry->ctx.prev_pid,
 				 entry->ctx.prev_prio,
-				 S,
+				 S, entry->type == TRACE_CTX ? "==>" : "  +",
 				 entry->ctx.next_pid,
 				 entry->ctx.next_prio,
 				 comm);
@@ -1256,12 +1282,14 @@ static int print_trace_fmt(struct trace_iterator *iter)
 			return 0;
 		break;
 	case TRACE_CTX:
+	case TRACE_WAKE:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
-		ret = trace_seq_printf(s, " %d:%d:%c ==> %d:%d\n",
+		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d\n",
 				       entry->ctx.prev_pid,
 				       entry->ctx.prev_prio,
 				       S,
+				       entry->type == TRACE_CTX ? "==>" : "  +",
 				       entry->ctx.next_pid,
 				       entry->ctx.next_prio);
 		if (!ret)
@@ -1301,8 +1329,11 @@ static int print_raw_fmt(struct trace_iterator *iter)
 			return 0;
 		break;
 	case TRACE_CTX:
+	case TRACE_WAKE:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
+		if (entry->type == TRACE_WAKE)
+			S = '+';
 		ret = trace_seq_printf(s, "%d %d %c %d %d\n",
 				       entry->ctx.prev_pid,
 				       entry->ctx.prev_prio,
@@ -1355,8 +1386,11 @@ static int print_hex_fmt(struct trace_iterator *iter)
 		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
 		break;
 	case TRACE_CTX:
+	case TRACE_WAKE:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
+		if (entry->type == TRACE_WAKE)
+			S = '+';
 		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
 		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
 		SEQ_PUT_HEX_FIELD_RET(s, S);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2b7352bf1ce6..90e0ba0f6eba 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -164,6 +164,12 @@ void tracing_sched_switch_trace(struct trace_array *tr,
 				struct task_struct *next,
 				unsigned long flags);
 void tracing_record_cmdline(struct task_struct *tsk);
+
+void tracing_sched_wakeup_trace(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct task_struct *wakee,
+				struct task_struct *cur,
+				unsigned long flags);
 void trace_special(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long arg1,
@@ -194,11 +200,17 @@ extern cycle_t ftrace_now(int cpu);
 #ifdef CONFIG_SCHED_TRACER
 extern void
 wakeup_sched_switch(struct task_struct *prev, struct task_struct *next);
+extern void
+wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr);
 #else
 static inline void
 wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
 {
 }
+static inline void
+wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
+{
+}
 #endif
 
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b738eaca1dbe..8b1cf1a3aee0 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -41,6 +41,29 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next)
 	local_irq_restore(flags);
 }
 
+static void wakeup_func(struct task_struct *wakee, struct task_struct *curr)
+{
+	struct trace_array *tr = ctx_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
 void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
 {
 	tracing_record_cmdline(prev);
@@ -57,6 +80,19 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
 	wakeup_sched_switch(prev, next);
 }
 
+void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+{
+	tracing_record_cmdline(curr);
+
+	wakeup_func(wakee, curr);
+
+	/*
+	 * Chain to the wakeup tracer (this is a NOP if disabled):
+	 */
+	wakeup_sched_wakeup(wakee, curr);
+}
+
 static void sched_switch_reset(struct trace_array *tr)
 {
 	int cpu;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 662679c78b66..87fa7b253b57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -202,7 +202,7 @@ out:
 }
 
 void
-ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
 {
 	if (likely(!tracer_enabled))
 		return;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 85715b86a342..39dd452647da 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -8,6 +8,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	switch (entry->type) {
 	case TRACE_FN:
 	case TRACE_CTX:
+	case TRACE_WAKE:
 		return 1;
 	}
 	return 0;
-- 
cgit v1.2.3


From 86387f7ee5d3273ff4859e2c64ce656639b6ca65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:51 +0200
Subject: ftrace: add stack tracing

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig |   1 +
 kernel/trace/trace.c | 103 ++++++++++++++++++++++++++++++++++++++++++---------
 kernel/trace/trace.h |  11 ++++++
 3 files changed, 97 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3f73a1710242..eb1988ed84b7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,6 +10,7 @@ config TRACER_MAX_TRACE
 config TRACING
 	bool
 	select DEBUG_FS
+	select STACKTRACE
 
 config FTRACE
 	bool "Kernel Function Tracer"
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 192c1354a7e0..b4b1b4fe99fd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -28,6 +28,8 @@
 #include <linux/gfp.h>
 #include <linux/fs.h>
 
+#include <linux/stacktrace.h>
+
 #include "trace.h"
 
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
@@ -88,6 +90,7 @@ enum trace_type {
 	TRACE_FN,
 	TRACE_CTX,
 	TRACE_WAKE,
+	TRACE_STACK,
 	TRACE_SPECIAL,
 
 	__TRACE_LAST_TYPE
@@ -109,6 +112,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_HEX			= 0x20,
 	TRACE_ITER_BIN			= 0x40,
 	TRACE_ITER_BLOCK		= 0x80,
+	TRACE_ITER_STACKTRACE		= 0x100,
 };
 
 #define TRACE_ITER_SYM_MASK \
@@ -124,10 +128,11 @@ static const char *trace_options[] = {
 	"hex",
 	"bin",
 	"block",
+	"stacktrace",
 	NULL
 };
 
-static unsigned trace_flags;
+static unsigned trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_STACKTRACE;
 
 static DEFINE_SPINLOCK(ftrace_max_lock);
 
@@ -657,7 +662,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 
 	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up (&trace_wait);
+		wake_up(&trace_wait);
 }
 
 void
@@ -685,13 +690,39 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data,
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 
 	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up (&trace_wait);
+		wake_up(&trace_wait);
+}
+
+void __trace_stack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags,
+		   int skip)
+{
+	struct trace_entry *entry;
+	struct stack_trace trace;
+
+	if (!(trace_flags & TRACE_ITER_STACKTRACE))
+		return;
+
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, flags);
+	entry->type		= TRACE_STACK;
+
+	memset(&entry->stack, 0, sizeof(entry->stack));
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= FTRACE_STACK_ENTRIES;
+	trace.skip		= skip;
+	trace.entries		= entry->stack.caller;
+
+	save_stack_trace(&trace);
 }
 
 void
 tracing_sched_switch_trace(struct trace_array *tr,
 			   struct trace_array_cpu *data,
-			   struct task_struct *prev, struct task_struct *next,
+			   struct task_struct *prev,
+			   struct task_struct *next,
 			   unsigned long flags)
 {
 	struct trace_entry *entry;
@@ -706,16 +737,18 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.prev_state	= prev->state;
 	entry->ctx.next_pid	= next->pid;
 	entry->ctx.next_prio	= next->prio;
+	__trace_stack(tr, data, flags, 4);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 
 	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up (&trace_wait);
+		wake_up(&trace_wait);
 }
 
 void
 tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct trace_array_cpu *data,
-			   struct task_struct *wakee, struct task_struct *curr,
+			   struct task_struct *wakee,
+			   struct task_struct *curr,
 			   unsigned long flags)
 {
 	struct trace_entry *entry;
@@ -730,6 +763,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.prev_state	= curr->state;
 	entry->ctx.next_pid	= wakee->pid;
 	entry->ctx.next_prio	= wakee->prio;
+	__trace_stack(tr, data, flags, 5);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 
 	if (!(trace_flags & TRACE_ITER_BLOCK))
@@ -1179,6 +1213,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	unsigned long rel_usecs;
 	char *comm;
 	int S;
+	int i;
 
 	if (!next_entry)
 		next_entry = entry;
@@ -1197,8 +1232,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 abs_usecs % 1000, rel_usecs/1000,
 				 rel_usecs % 1000);
 	} else {
-		lat_print_generic(s, entry, cpu);
-		lat_print_timestamp(s, abs_usecs, rel_usecs);
+		if (entry->type != TRACE_STACK) {
+			lat_print_generic(s, entry, cpu);
+			lat_print_timestamp(s, abs_usecs, rel_usecs);
+		}
 	}
 	switch (entry->type) {
 	case TRACE_FN:
@@ -1226,6 +1263,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 entry->special.arg2,
 				 entry->special.arg3);
 		break;
+	case TRACE_STACK:
+		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+			if (i)
+				trace_seq_puts(s, " <= ");
+			seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
+		}
+		trace_seq_puts(s, "\n");
+		break;
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
@@ -1241,8 +1286,9 @@ static int print_trace_fmt(struct trace_iterator *iter)
 	unsigned long long t;
 	unsigned long secs;
 	char *comm;
-	int S;
 	int ret;
+	int S;
+	int i;
 
 	entry = iter->ent;
 
@@ -1252,15 +1298,17 @@ static int print_trace_fmt(struct trace_iterator *iter)
 	usec_rem = do_div(t, 1000000ULL);
 	secs = (unsigned long)t;
 
-	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
-	if (!ret)
-		return 0;
-	ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
-	if (!ret)
-		return 0;
-	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
-	if (!ret)
-		return 0;
+	if (entry->type != TRACE_STACK) {
+		ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+		if (!ret)
+			return 0;
+		ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+		if (!ret)
+			return 0;
+		ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+		if (!ret)
+			return 0;
+	}
 
 	switch (entry->type) {
 	case TRACE_FN:
@@ -1303,6 +1351,22 @@ static int print_trace_fmt(struct trace_iterator *iter)
 		if (!ret)
 			return 0;
 		break;
+	case TRACE_STACK:
+		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+			if (i) {
+				ret = trace_seq_puts(s, " <= ");
+				if (!ret)
+					return 0;
+			}
+			ret = seq_print_ip_sym(s, entry->stack.caller[i],
+					       sym_flags);
+			if (!ret)
+				return 0;
+		}
+		ret = trace_seq_puts(s, "\n");
+		if (!ret)
+			return 0;
+		break;
 	}
 	return 1;
 }
@@ -1344,6 +1408,7 @@ static int print_raw_fmt(struct trace_iterator *iter)
 			return 0;
 		break;
 	case TRACE_SPECIAL:
+	case TRACE_STACK:
 		ret = trace_seq_printf(s, " %lx %lx %lx\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
@@ -1399,6 +1464,7 @@ static int print_hex_fmt(struct trace_iterator *iter)
 		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
 		break;
 	case TRACE_SPECIAL:
+	case TRACE_STACK:
 		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
 		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
 		SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
@@ -1433,6 +1499,7 @@ static int print_bin_fmt(struct trace_iterator *iter)
 		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
 		break;
 	case TRACE_SPECIAL:
+	case TRACE_STACK:
 		SEQ_PUT_FIELD_RET(s, entry->special.arg1);
 		SEQ_PUT_FIELD_RET(s, entry->special.arg2);
 		SEQ_PUT_FIELD_RET(s, entry->special.arg3);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 90e0ba0f6eba..387bdcf45e28 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,6 +34,16 @@ struct special_entry {
 	unsigned long		arg3;
 };
 
+/*
+ * Stack-trace entry:
+ */
+
+#define FTRACE_STACK_ENTRIES	5
+
+struct stack_entry {
+	unsigned long		caller[FTRACE_STACK_ENTRIES];
+};
+
 /*
  * The trace entry - the most basic unit of tracing. This is what
  * is printed in the end as a single line in the trace output, such as:
@@ -51,6 +61,7 @@ struct trace_entry {
 		struct ftrace_entry		fn;
 		struct ctx_switch_entry		ctx;
 		struct special_entry		special;
+		struct stack_entry		stack;
 	};
 };
 
-- 
cgit v1.2.3


From 8ac0fca4ccb355ce50471d7aa3f10f5900b28b95 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:51 +0200
Subject: ftrace: sched tracer fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c                    |  2 +-
 kernel/trace/trace_sched_wakeup.c | 13 +++----------
 2 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 328494e28df2..53ab1174664f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2613,7 +2613,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	ftrace_wake_up_new_task(p, rq->curr);
+	ftrace_wake_up_task(p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 87fa7b253b57..2a012423f9d0 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -201,20 +201,13 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
-void
-wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
+void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
 {
 	if (likely(!tracer_enabled))
 		return;
 
-	wakeup_check_start(wakeup_trace, wakee, curr);
-}
-
-void
-ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
-{
-	if (likely(!tracer_enabled))
-		return;
+	tracing_record_cmdline(curr);
+	tracing_record_cmdline(wakee);
 
 	wakeup_check_start(wakeup_trace, wakee, curr);
 }
-- 
cgit v1.2.3


From 4c1f4d4f0175129934d5dbc19a39296430937a05 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:51 +0200
Subject: ftrace: make nostacktrace the default

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b4b1b4fe99fd..0e4b7119e263 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -132,7 +132,7 @@ static const char *trace_options[] = {
 	NULL
 };
 
-static unsigned trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_STACKTRACE;
+static unsigned trace_flags = TRACE_ITER_PRINT_PARENT;
 
 static DEFINE_SPINLOCK(ftrace_max_lock);
 
-- 
cgit v1.2.3


From 4e65551905fb0300ae7e667cbaa41ee2e3f29a13 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: sched tracer, trace full rbtree

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c                    | 35 ++++++++++++++++++++++---
 kernel/trace/trace.c              | 55 ++++++++++++++++-----------------------
 kernel/trace/trace.h              | 14 ++++++++++
 kernel/trace/trace_sched_switch.c | 24 +++++++++++------
 4 files changed, 85 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 53ab1174664f..b9208a0e33a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2394,6 +2394,35 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+
+void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
+{
+	struct sched_entity *se;
+	struct task_struct *p;
+	struct rb_node *curr;
+	struct rq *rq = __rq;
+
+	curr = first_fair(&rq->cfs);
+	if (!curr)
+		return;
+
+	while (curr) {
+		se = rb_entry(curr, struct sched_entity, run_node);
+		if (!entity_is_task(se))
+			continue;
+
+		p = task_of(se);
+
+		__trace_special(__tr, __data,
+			      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+
+		curr = rb_next(curr);
+	}
+}
+
+#endif
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2468,7 +2497,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
-	ftrace_wake_up_task(p, rq->curr);
+	ftrace_wake_up_task(rq, p, rq->curr);
 	schedstat_inc(p, se.nr_wakeups);
 	if (sync)
 		schedstat_inc(p, se.nr_wakeups_sync);
@@ -2613,7 +2642,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	ftrace_wake_up_task(p, rq->curr);
+	ftrace_wake_up_task(rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2786,7 +2815,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	ftrace_ctx_switch(prev, next);
+	ftrace_ctx_switch(rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e4b7119e263..65173b14b914 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -66,7 +66,18 @@ static struct tracer		*current_trace __read_mostly;
 static int			max_tracer_type_len;
 
 static DEFINE_MUTEX(trace_types_lock);
-static DECLARE_WAIT_QUEUE_HEAD (trace_wait);
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+
+/*
+ * FIXME: where should this be called?
+ */
+void trace_wake_up(void)
+{
+	if (!(trace_flags & TRACE_ITER_BLOCK))
+		wake_up(&trace_wait);
+}
 
 #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
 
@@ -103,18 +114,6 @@ enum trace_flag_type {
 	TRACE_FLAG_SOFTIRQ		= 0x08,
 };
 
-enum trace_iterator_flags {
-	TRACE_ITER_PRINT_PARENT		= 0x01,
-	TRACE_ITER_SYM_OFFSET		= 0x02,
-	TRACE_ITER_SYM_ADDR		= 0x04,
-	TRACE_ITER_VERBOSE		= 0x08,
-	TRACE_ITER_RAW			= 0x10,
-	TRACE_ITER_HEX			= 0x20,
-	TRACE_ITER_BIN			= 0x40,
-	TRACE_ITER_BLOCK		= 0x80,
-	TRACE_ITER_STACKTRACE		= 0x100,
-};
-
 #define TRACE_ITER_SYM_MASK \
 	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
 
@@ -132,8 +131,6 @@ static const char *trace_options[] = {
 	NULL
 };
 
-static unsigned trace_flags = TRACE_ITER_PRINT_PARENT;
-
 static DEFINE_SPINLOCK(ftrace_max_lock);
 
 /*
@@ -660,9 +657,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up(&trace_wait);
 }
 
 void
@@ -673,10 +667,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 		trace_function(tr, data, ip, parent_ip, flags);
 }
 
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+
 void
-trace_special(struct trace_array *tr, struct trace_array_cpu *data,
-	      unsigned long arg1, unsigned long arg2, unsigned long arg3)
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
+	struct trace_array_cpu *data = __data;
+	struct trace_array *tr = __tr;
 	struct trace_entry *entry;
 	unsigned long irq_flags;
 
@@ -688,11 +686,10 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->special.arg2	= arg2;
 	entry->special.arg3	= arg3;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up(&trace_wait);
 }
 
+#endif
+
 void __trace_stack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags,
@@ -739,9 +736,6 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.next_prio	= next->prio;
 	__trace_stack(tr, data, flags, 4);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up(&trace_wait);
 }
 
 void
@@ -765,9 +759,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.next_prio	= wakee->prio;
 	__trace_stack(tr, data, flags, 5);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	if (!(trace_flags & TRACE_ITER_BLOCK))
-		wake_up(&trace_wait);
 }
 
 #ifdef CONFIG_FTRACE
@@ -1258,7 +1249,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 comm);
 		break;
 	case TRACE_SPECIAL:
-		trace_seq_printf(s, " %lx %lx %lx\n",
+		trace_seq_printf(s, " %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
@@ -1344,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter)
 			return 0;
 		break;
 	case TRACE_SPECIAL:
-		ret = trace_seq_printf(s, " %lx %lx %lx\n",
+		ret = trace_seq_printf(s, " %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
@@ -1409,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter)
 		break;
 	case TRACE_SPECIAL:
 	case TRACE_STACK:
-		ret = trace_seq_printf(s, " %lx %lx %lx\n",
+		ret = trace_seq_printf(s, " %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 387bdcf45e28..75e237475674 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -274,4 +274,18 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 
 extern void *head_page(struct trace_array_cpu *data);
 
+extern unsigned long trace_flags;
+
+enum trace_iterator_flags {
+	TRACE_ITER_PRINT_PARENT		= 0x01,
+	TRACE_ITER_SYM_OFFSET		= 0x02,
+	TRACE_ITER_SYM_ADDR		= 0x04,
+	TRACE_ITER_VERBOSE		= 0x08,
+	TRACE_ITER_RAW			= 0x10,
+	TRACE_ITER_HEX			= 0x20,
+	TRACE_ITER_BIN			= 0x40,
+	TRACE_ITER_BLOCK		= 0x80,
+	TRACE_ITER_STACKTRACE		= 0x100,
+};
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8b1cf1a3aee0..12658b3f2b28 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -18,7 +18,7 @@ static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
 
 static void
-ctx_switch_func(struct task_struct *prev, struct task_struct *next)
+ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array *tr = ctx_trace;
 	struct trace_array_cpu *data;
@@ -34,14 +34,17 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next)
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1))
+	if (likely(disabled == 1)) {
 		tracing_sched_switch_trace(tr, data, prev, next, flags);
+		ftrace_all_fair_tasks(__rq, tr, data);
+	}
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
 
-static void wakeup_func(struct task_struct *wakee, struct task_struct *curr)
+static void
+wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
 {
 	struct trace_array *tr = ctx_trace;
 	struct trace_array_cpu *data;
@@ -57,14 +60,18 @@ static void wakeup_func(struct task_struct *wakee, struct task_struct *curr)
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1))
+	if (likely(disabled == 1)) {
 		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+		ftrace_all_fair_tasks(__rq, tr, data);
+	}
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 }
 
-void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+void
+ftrace_ctx_switch(void *__rq, struct task_struct *prev,
+		  struct task_struct *next)
 {
 	tracing_record_cmdline(prev);
 
@@ -72,7 +79,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
 	 * If tracer_switch_func only points to the local
 	 * switch func, it still needs the ptr passed to it.
 	 */
-	ctx_switch_func(prev, next);
+	ctx_switch_func(__rq, prev, next);
 
 	/*
 	 * Chain to the wakeup tracer (this is a NOP if disabled):
@@ -81,11 +88,12 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
 }
 
 void
-ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+ftrace_wake_up_task(void *__rq, struct task_struct *wakee,
+		    struct task_struct *curr)
 {
 	tracing_record_cmdline(curr);
 
-	wakeup_func(wakee, curr);
+	wakeup_func(__rq, wakee, curr);
 
 	/*
 	 * Chain to the wakeup tracer (this is a NOP if disabled):
-- 
cgit v1.2.3


From 24cd5d111e8c713e62cda7ca1d01232402e3d3c9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: trace curr/next tasks

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b9208a0e33a0..673b588b713b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2398,8 +2398,8 @@ static int sched_balance_self(int cpu, int flag)
 
 void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 {
-	struct sched_entity *se;
 	struct task_struct *p;
+	struct sched_entity *se;
 	struct rb_node *curr;
 	struct rq *rq = __rq;
 
@@ -2407,6 +2407,17 @@ void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 	if (!curr)
 		return;
 
+	if (rq->cfs.curr) {
+		p = task_of(rq->cfs.curr);
+		__trace_special(__tr, __data,
+		      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+	}
+	if (rq->cfs.next) {
+		p = task_of(rq->cfs.next);
+		__trace_special(__tr, __data,
+		      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+	}
+
 	while (curr) {
 		se = rb_entry(curr, struct sched_entity, run_node);
 		if (!entity_is_task(se))
-- 
cgit v1.2.3


From 017730c11241e26577673eb9d957cfc66172ea91 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: fix wakeups

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c       | 18 ++++++++++++++++++
 kernel/trace/trace.c | 15 +++++++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 673b588b713b..9ca4a2e6a236 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -642,6 +642,24 @@ static inline void update_rq_clock(struct rq *rq)
 # define const_debug static const
 #endif
 
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */
+int runqueue_is_locked(void)
+{
+	int cpu = get_cpu();
+	struct rq *rq = cpu_rq(cpu);
+	int ret;
+
+	ret = spin_is_locked(&rq->lock);
+	put_cpu();
+	return ret;
+}
+
 /*
  * Debugging: various feature bits
  */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 65173b14b914..2ca9d66aa74e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -70,12 +70,13 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
 
-/*
- * FIXME: where should this be called?
- */
 void trace_wake_up(void)
 {
-	if (!(trace_flags & TRACE_ITER_BLOCK))
+	/*
+	 * The runqueue_is_locked() can fail, but this is the best we
+	 * have for now:
+	 */
+	if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
 		wake_up(&trace_wait);
 }
 
@@ -657,6 +658,8 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	trace_wake_up();
 }
 
 void
@@ -686,6 +689,8 @@ __trace_special(void *__tr, void *__data,
 	entry->special.arg2	= arg2;
 	entry->special.arg3	= arg3;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	trace_wake_up();
 }
 
 #endif
@@ -759,6 +764,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.next_prio	= wakee->prio;
 	__trace_stack(tr, data, flags, 5);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	trace_wake_up();
 }
 
 #ifdef CONFIG_FTRACE
-- 
cgit v1.2.3


From 1a3c3034336320554a3342572dae98d69e054fc7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: fix __trace_special()

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2ca9d66aa74e..65d2c0a61ed4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -670,8 +670,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 		trace_function(tr, data, ip, parent_ip, flags);
 }
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-
 void
 __trace_special(void *__tr, void *__data,
 		unsigned long arg1, unsigned long arg2, unsigned long arg3)
@@ -693,8 +691,6 @@ __trace_special(void *__tr, void *__data,
 	trace_wake_up();
 }
 
-#endif
-
 void __trace_stack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags,
-- 
cgit v1.2.3


From 4ac3ba41d372e3a9e420b36bc43589662b188a14 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: trace scheduler rbtree

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c              | 1 +
 kernel/trace/trace.h              | 1 +
 kernel/trace/trace_sched_switch.c | 6 ++++--
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 65d2c0a61ed4..06380dc1ebe0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -129,6 +129,7 @@ static const char *trace_options[] = {
 	"bin",
 	"block",
 	"stacktrace",
+	"sched-tree",
 	NULL
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 75e237475674..a52015702a28 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -286,6 +286,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_BIN			= 0x40,
 	TRACE_ITER_BLOCK		= 0x80,
 	TRACE_ITER_STACKTRACE		= 0x100,
+	TRACE_ITER_SCHED_TREE		= 0x200,
 };
 
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 12658b3f2b28..5555b906a666 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -36,7 +36,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 
 	if (likely(disabled == 1)) {
 		tracing_sched_switch_trace(tr, data, prev, next, flags);
-		ftrace_all_fair_tasks(__rq, tr, data);
+		if (trace_flags & TRACE_ITER_SCHED_TREE)
+			ftrace_all_fair_tasks(__rq, tr, data);
 	}
 
 	atomic_dec(&data->disabled);
@@ -62,7 +63,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
 
 	if (likely(disabled == 1)) {
 		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
-		ftrace_all_fair_tasks(__rq, tr, data);
+		if (trace_flags & TRACE_ITER_SCHED_TREE)
+			ftrace_all_fair_tasks(__rq, tr, data);
 	}
 
 	atomic_dec(&data->disabled);
-- 
cgit v1.2.3


From c7078de1aaf562a31b20984409c38cc1b40fa8a3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: add tracing_cpumask

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 70 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 06380dc1ebe0..18f0ab88ca85 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -70,6 +70,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
 
+/*
+ * Only trace on a CPU if the bitmask is set:
+ */
+static cpumask_t tracing_cpumask __read_mostly = CPU_MASK_ALL;
+
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space:
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask:
+ */
+static char mask_str[NR_CPUS];
+
 void trace_wake_up(void)
 {
 	/*
@@ -1754,9 +1771,49 @@ static struct file_operations tracing_lt_fops = {
 };
 
 static struct file_operations show_traces_fops = {
-	.open = show_traces_open,
-	.read = seq_read,
-	.release = seq_release,
+	.open		= show_traces_open,
+	.read		= seq_read,
+	.release	= seq_release,
+};
+
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+		     size_t count, loff_t *ppos)
+{
+	int err;
+
+	count = min(count, (size_t)NR_CPUS);
+
+	mutex_lock(&tracing_cpumask_update_lock);
+	cpumask_scnprintf(mask_str, NR_CPUS, tracing_cpumask);
+	err = copy_to_user(ubuf, mask_str, count);
+	if (err)
+		count = -EFAULT;
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return count;
+}
+
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+		      size_t count, loff_t *ppos)
+{
+	int err;
+
+	mutex_lock(&tracing_cpumask_update_lock);
+	err = cpumask_parse_user(ubuf, count, tracing_cpumask);
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	if (err)
+		return err;
+
+	return count;
+}
+
+static struct file_operations tracing_cpumask_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_cpumask_read,
+	.write		= tracing_cpumask_write,
 };
 
 static ssize_t
@@ -1837,9 +1894,9 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
 }
 
 static struct file_operations tracing_iter_fops = {
-	.open = tracing_open_generic,
-	.read = tracing_iter_ctrl_read,
-	.write = tracing_iter_ctrl_write,
+	.open		= tracing_open_generic,
+	.read		= tracing_iter_ctrl_read,
+	.write		= tracing_iter_ctrl_write,
 };
 
 static const char readme_msg[] =
@@ -1870,8 +1927,8 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
 }
 
 static struct file_operations tracing_readme_fops = {
-	.open = tracing_open_generic,
-	.read = tracing_readme_read,
+	.open		= tracing_open_generic,
+	.read		= tracing_readme_read,
 };
 
 static ssize_t
@@ -2334,6 +2391,11 @@ static __init void tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
 
+	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
+				    NULL, &tracing_cpumask_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
+
 	entry = debugfs_create_file("latency_trace", 0444, d_tracer,
 				    &global_trace, &tracing_lt_fops);
 	if (!entry)
-- 
cgit v1.2.3


From 36dfe9252bd4c9b55e8454363fb7e444c92c5030 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:52 +0200
Subject: ftrace: make use of tracing_cpumask

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 95 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 64 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 18f0ab88ca85..3ae17e254fc3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -70,23 +70,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
 
-/*
- * Only trace on a CPU if the bitmask is set:
- */
-static cpumask_t tracing_cpumask __read_mostly = CPU_MASK_ALL;
-
-/*
- * The tracer itself will not take this lock, but still we want
- * to provide a consistent cpumask to user-space:
- */
-static DEFINE_MUTEX(tracing_cpumask_update_lock);
-
-/*
- * Temporary storage for the character representation of the
- * CPU bitmask:
- */
-static char mask_str[NR_CPUS];
-
 void trace_wake_up(void)
 {
 	/*
@@ -1776,19 +1759,46 @@ static struct file_operations show_traces_fops = {
 	.release	= seq_release,
 };
 
+/*
+ * Only trace on a CPU if the bitmask is set:
+ */
+static cpumask_t tracing_cpumask = CPU_MASK_ALL;
+
+/*
+ * When tracing/tracing_cpu_mask is modified then this holds
+ * the new bitmask we are about to install:
+ */
+static cpumask_t tracing_cpumask_new;
+
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space:
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask (and one more byte for the newline):
+ */
+static char mask_str[NR_CPUS + 1];
+
 static ssize_t
 tracing_cpumask_read(struct file *filp, char __user *ubuf,
 		     size_t count, loff_t *ppos)
 {
-	int err;
-
-	count = min(count, (size_t)NR_CPUS);
+	int len;
 
 	mutex_lock(&tracing_cpumask_update_lock);
-	cpumask_scnprintf(mask_str, NR_CPUS, tracing_cpumask);
-	err = copy_to_user(ubuf, mask_str, count);
-	if (err)
-		count = -EFAULT;
+
+	len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+	if (count - len < 2) {
+		count = -EINVAL;
+		goto out_err;
+	}
+	len += sprintf(mask_str + len, "\n");
+	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+
+out_err:
 	mutex_unlock(&tracing_cpumask_update_lock);
 
 	return count;
@@ -1798,16 +1808,40 @@ static ssize_t
 tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 		      size_t count, loff_t *ppos)
 {
-	int err;
+	int err, cpu;
 
 	mutex_lock(&tracing_cpumask_update_lock);
-	err = cpumask_parse_user(ubuf, count, tracing_cpumask);
-	mutex_unlock(&tracing_cpumask_update_lock);
-
+	err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
 	if (err)
-		return err;
+		goto err_unlock;
+
+	spin_lock_irq(&ftrace_max_lock);
+	for_each_possible_cpu(cpu) {
+		/*
+		 * Increase/decrease the disabled counter if we are
+		 * about to flip a bit in the cpumask:
+		 */
+		if (cpu_isset(cpu, tracing_cpumask) &&
+				!cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_inc(&global_trace.data[cpu]->disabled);
+		}
+		if (!cpu_isset(cpu, tracing_cpumask) &&
+				cpu_isset(cpu, tracing_cpumask_new)) {
+			atomic_dec(&global_trace.data[cpu]->disabled);
+		}
+	}
+	spin_unlock_irq(&ftrace_max_lock);
+
+	tracing_cpumask = tracing_cpumask_new;
+
+	mutex_unlock(&tracing_cpumask_update_lock);
 
 	return count;
+
+err_unlock:
+	mutex_unlock(&tracing_cpumask_update_lock);
+
+	return err;
 }
 
 static struct file_operations tracing_cpumask_fops = {
@@ -1846,8 +1880,7 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
 	r += sprintf(buf + r, "\n");
 	WARN_ON(r >= len + 2);
 
-	r = simple_read_from_buffer(ubuf, cnt, ppos,
-				    buf, r);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 
 	kfree(buf);
 
-- 
cgit v1.2.3


From d9af56fbd8b37181bffaad060f74aa2e17382874 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: fix cmdline tracing

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sched_switch.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5555b906a666..5a217e863586 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -29,6 +29,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 	if (!tracer_enabled)
 		return;
 
+	tracing_record_cmdline(prev);
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
@@ -56,6 +58,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
 	if (!tracer_enabled)
 		return;
 
+	tracing_record_cmdline(curr);
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
@@ -75,8 +79,6 @@ void
 ftrace_ctx_switch(void *__rq, struct task_struct *prev,
 		  struct task_struct *next)
 {
-	tracing_record_cmdline(prev);
-
 	/*
 	 * If tracer_switch_func only points to the local
 	 * switch func, it still needs the ptr passed to it.
@@ -93,8 +95,6 @@ void
 ftrace_wake_up_task(void *__rq, struct task_struct *wakee,
 		    struct task_struct *curr)
 {
-	tracing_record_cmdline(curr);
-
 	wakeup_func(__rq, wakee, curr);
 
 	/*
-- 
cgit v1.2.3


From 442e544ce52d4415a024920b84fb95c5f9aa0855 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: iter ctrl fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3ae17e254fc3..688b4cf72d99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1920,6 +1920,11 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
 			break;
 		}
 	}
+	/*
+	 * If no option could be set, return an error:
+	 */
+	if (!trace_options[i])
+		return -EINVAL;
 
 	filp->f_pos += cnt;
 
-- 
cgit v1.2.3


From f29c73fe3404f8799ed57aaf48859e0b55fc071f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: include cpu in stacktrace

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 688b4cf72d99..3a4032492fcb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1227,10 +1227,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 abs_usecs % 1000, rel_usecs/1000,
 				 rel_usecs % 1000);
 	} else {
-		if (entry->type != TRACE_STACK) {
-			lat_print_generic(s, entry, cpu);
-			lat_print_timestamp(s, abs_usecs, rel_usecs);
-		}
+		lat_print_generic(s, entry, cpu);
+		lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}
 	switch (entry->type) {
 	case TRACE_FN:
@@ -1293,17 +1291,15 @@ static int print_trace_fmt(struct trace_iterator *iter)
 	usec_rem = do_div(t, 1000000ULL);
 	secs = (unsigned long)t;
 
-	if (entry->type != TRACE_STACK) {
-		ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
-		if (!ret)
-			return 0;
-		ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
-		if (!ret)
-			return 0;
-		ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
-		if (!ret)
-			return 0;
-	}
+	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+	if (!ret)
+		return 0;
+	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+	if (!ret)
+		return 0;
 
 	switch (entry->type) {
 	case TRACE_FN:
-- 
cgit v1.2.3


From 36fc25a9f48deacd8aa08cd2d1c186a4e412604f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: sched tree fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9ca4a2e6a236..3bc7c5362998 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2414,6 +2414,23 @@ static int sched_balance_self(int cpu, int flag)
 
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 
+void ftrace_task(struct task_struct *p, void *__tr, void *__data)
+{
+#if 0
+	/*  
+	 * trace timeline tree
+	 */
+	__trace_special(__tr, __data,
+			p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+#else
+	/*
+	 * trace balance metrics
+	 */
+	__trace_special(__tr, __data,
+			p->pid, p->se.avg_overlap, 0);
+#endif
+}
+
 void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 {
 	struct task_struct *p;
@@ -2421,32 +2438,22 @@ void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
 	struct rb_node *curr;
 	struct rq *rq = __rq;
 
-	curr = first_fair(&rq->cfs);
-	if (!curr)
-		return;
-
 	if (rq->cfs.curr) {
 		p = task_of(rq->cfs.curr);
-		__trace_special(__tr, __data,
-		      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+		ftrace_task(p, __tr, __data);
 	}
 	if (rq->cfs.next) {
 		p = task_of(rq->cfs.next);
-		__trace_special(__tr, __data,
-		      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
+		ftrace_task(p, __tr, __data);
 	}
 
-	while (curr) {
+	for (curr = first_fair(&rq->cfs); curr; curr = rb_next(curr)) {
 		se = rb_entry(curr, struct sched_entity, run_node);
 		if (!entity_is_task(se))
 			continue;
 
 		p = task_of(se);
-
-		__trace_special(__tr, __data,
-			      p->pid, p->se.vruntime, p->se.sum_exec_runtime);
-
-		curr = rb_next(curr);
+		ftrace_task(p, __tr, __data);
 	}
 }
 
-- 
cgit v1.2.3


From 88a4216c3ec4281fc7e6725cc3a3ccd01fb1aa14 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: sched special

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched_fair.c               |  3 +++
 kernel/trace/trace.c              |  6 +++---
 kernel/trace/trace_sched_switch.c | 24 ++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..dc1856f10795 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1061,6 +1061,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE))
 		return 0;
 
+	ftrace_special(__LINE__, curr->se.avg_overlap, sync);
+	ftrace_special(__LINE__, p->se.avg_overlap, -1);
 	/*
 	 * If the currently running task will sleep within
 	 * a reasonable amount of time then attract this newly
@@ -1238,6 +1240,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(se == pse))
 		return;
 
+	ftrace_special(__LINE__, p->pid, se->last_wakeup);
 	cfs_rq_of(pse)->next = pse;
 
 	/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3a4032492fcb..b87a26414892 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1251,7 +1251,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 				 comm);
 		break;
 	case TRACE_SPECIAL:
-		trace_seq_printf(s, " %ld %ld %ld\n",
+		trace_seq_printf(s, "# %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
@@ -1335,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter)
 			return 0;
 		break;
 	case TRACE_SPECIAL:
-		ret = trace_seq_printf(s, " %ld %ld %ld\n",
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
@@ -1400,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter)
 		break;
 	case TRACE_SPECIAL:
 	case TRACE_STACK:
-		ret = trace_seq_printf(s, " %ld %ld %ld\n",
+		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
 				 entry->special.arg1,
 				 entry->special.arg2,
 				 entry->special.arg3);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5a217e863586..bddf676914ed 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -103,6 +103,30 @@ ftrace_wake_up_task(void *__rq, struct task_struct *wakee,
 	wakeup_sched_wakeup(wakee, curr);
 }
 
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array *tr = ctx_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (!tracer_enabled)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		__trace_special(tr, data, arg1, arg2, arg3);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
 static void sched_switch_reset(struct trace_array *tr)
 {
 	int cpu;
-- 
cgit v1.2.3


From bac524d3f3dfeffa3a9d44f2c64035b88bcaacb4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: trace next state

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 35 +++++++++++++++++++++++++----------
 kernel/trace/trace.h |  1 +
 2 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b87a26414892..b63fe909f87b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -736,6 +736,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.prev_state	= prev->state;
 	entry->ctx.next_pid	= next->pid;
 	entry->ctx.next_prio	= next->prio;
+	entry->ctx.next_state	= next->state;
 	__trace_stack(tr, data, flags, 4);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 }
@@ -759,6 +760,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.prev_state	= curr->state;
 	entry->ctx.next_pid	= wakee->pid;
 	entry->ctx.next_prio	= wakee->prio;
+	entry->ctx.next_state	= wakee->state;
 	__trace_stack(tr, data, flags, 5);
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 
@@ -1207,7 +1209,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	unsigned long abs_usecs;
 	unsigned long rel_usecs;
 	char *comm;
-	int S;
+	int S, T;
 	int i;
 
 	if (!next_entry)
@@ -1241,14 +1243,17 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	case TRACE_WAKE:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+
 		comm = trace_find_cmdline(entry->ctx.next_pid);
-		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d %s\n",
+		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
 				 entry->ctx.prev_pid,
 				 entry->ctx.prev_prio,
 				 S, entry->type == TRACE_CTX ? "==>" : "  +",
 				 entry->ctx.next_pid,
 				 entry->ctx.next_prio,
-				 comm);
+				 T, comm);
 		break;
 	case TRACE_SPECIAL:
 		trace_seq_printf(s, "# %ld %ld %ld\n",
@@ -1280,7 +1285,7 @@ static int print_trace_fmt(struct trace_iterator *iter)
 	unsigned long secs;
 	char *comm;
 	int ret;
-	int S;
+	int S, T;
 	int i;
 
 	entry = iter->ent;
@@ -1324,13 +1329,16 @@ static int print_trace_fmt(struct trace_iterator *iter)
 	case TRACE_WAKE:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
-		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d\n",
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
+		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
 				       entry->ctx.prev_pid,
 				       entry->ctx.prev_prio,
 				       S,
 				       entry->type == TRACE_CTX ? "==>" : "  +",
 				       entry->ctx.next_pid,
-				       entry->ctx.next_prio);
+				       entry->ctx.next_prio,
+				       T);
 		if (!ret)
 			return 0;
 		break;
@@ -1367,7 +1375,7 @@ static int print_raw_fmt(struct trace_iterator *iter)
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
 	int ret;
-	int S;
+	int S, T;
 
 	entry = iter->ent;
 
@@ -1387,14 +1395,17 @@ static int print_raw_fmt(struct trace_iterator *iter)
 	case TRACE_WAKE:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
 		if (entry->type == TRACE_WAKE)
 			S = '+';
-		ret = trace_seq_printf(s, "%d %d %c %d %d\n",
+		ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
 				       entry->ctx.prev_pid,
 				       entry->ctx.prev_prio,
 				       S,
 				       entry->ctx.next_pid,
-				       entry->ctx.next_prio);
+				       entry->ctx.next_prio,
+				       T);
 		if (!ret)
 			return 0;
 		break;
@@ -1428,7 +1439,7 @@ static int print_hex_fmt(struct trace_iterator *iter)
 	struct trace_seq *s = &iter->seq;
 	unsigned char newline = '\n';
 	struct trace_entry *entry;
-	int S;
+	int S, T;
 
 	entry = iter->ent;
 
@@ -1445,6 +1456,8 @@ static int print_hex_fmt(struct trace_iterator *iter)
 	case TRACE_WAKE:
 		S = entry->ctx.prev_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.prev_state] : 'X';
+		T = entry->ctx.next_state < sizeof(state_to_char) ?
+			state_to_char[entry->ctx.next_state] : 'X';
 		if (entry->type == TRACE_WAKE)
 			S = '+';
 		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
@@ -1453,6 +1466,7 @@ static int print_hex_fmt(struct trace_iterator *iter)
 		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
 		SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
 		SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+		SEQ_PUT_HEX_FIELD_RET(s, T);
 		break;
 	case TRACE_SPECIAL:
 	case TRACE_STACK:
@@ -1488,6 +1502,7 @@ static int print_bin_fmt(struct trace_iterator *iter)
 		SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
 		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
 		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
 		break;
 	case TRACE_SPECIAL:
 	case TRACE_STACK:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a52015702a28..96951a8d09a4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -23,6 +23,7 @@ struct ctx_switch_entry {
 	unsigned char		prev_state;
 	unsigned int		next_pid;
 	unsigned char		next_prio;
+	unsigned char		next_state;
 };
 
 /*
-- 
cgit v1.2.3


From 5429db2d26a59903c81a4f6c6dae7eb9daaea5fc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 12 May 2008 21:20:53 +0200
Subject: ftrace: fix wakeup callback

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3bc7c5362998..1ec3fb2efee6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2533,7 +2533,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
-	ftrace_wake_up_task(rq, p, rq->curr);
 	schedstat_inc(p, se.nr_wakeups);
 	if (sync)
 		schedstat_inc(p, se.nr_wakeups_sync);
@@ -2548,6 +2547,7 @@ out_activate:
 	success = 1;
 
 out_running:
+	ftrace_wake_up_task(rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 
 	p->state = TASK_RUNNING;
-- 
cgit v1.2.3


From 694379e9ed4f2f6babe111bf001c69e2e263338b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: ftrace: make it more available in the Kconfig

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index eb1988ed84b7..ebc158e6d59a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -14,7 +14,7 @@ config TRACING
 
 config FTRACE
 	bool "Kernel Function Tracer"
-	depends on DEBUG_KERNEL && HAVE_FTRACE
+	depends on HAVE_FTRACE
 	select FRAME_POINTER
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
@@ -72,7 +72,6 @@ config PREEMPT_TRACER
 
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
-	depends on DEBUG_KERNEL
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
@@ -82,7 +81,6 @@ config SCHED_TRACER
 
 config CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
-	depends on DEBUG_KERNEL
 	select TRACING
 	select MARKERS
 	help
-- 
cgit v1.2.3


From 8f96da02c14d722ad9a3713cd7273ce28c9036ad Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: ftrace: remove wakeup from function trace

trace_function is called by mcount and calling wake_up from that
can have unpredictable results. This patch removes the wakeup from
trace_function.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b63fe909f87b..736dcfb3ed01 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -659,8 +659,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
 	spin_unlock_irqrestore(&data->lock, irq_flags);
-
-	trace_wake_up();
 }
 
 void
-- 
cgit v1.2.3


From 4fe8c3048cd8280a54256bca9cac2007bd546c33 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: ftrace: printk and trace irqsoff and wakeups

printk called from wakeup critical timings and irqs off can
cause deadlocks since printk might do a wakeup itself. If the
call to printk happens with the runqueue lock held, it can
deadlock.

This patch protects the printk from being called in trace irqs off
with a test to see if the runqueue for the current CPU is locked.
If it is locked, the printk is skipped.

The wakeup always holds the runqueue lock, so the printk is
simply removed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_irqsoff.c      | 26 ++++++++++++++------------
 kernel/trace/trace_sched_wakeup.c | 13 -------------
 2 files changed, 14 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7a4dc014b8ab..d0c1748b1e2c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -165,18 +165,20 @@ check_critical_timing(struct trace_array *tr,
 
 	update_max_tr_single(tr, current, cpu);
 
-	if (tracing_thresh) {
-		printk(KERN_INFO "(%16s-%-5d|#%d):"
-			" %lu us critical section violates %lu us threshold.\n",
-				current->comm, current->pid,
-				raw_smp_processor_id(),
-				latency, nsecs_to_usecs(tracing_thresh));
-	} else {
-		printk(KERN_INFO "(%16s-%-5d|#%d):"
-		       " new %lu us maximum-latency critical section.\n",
-				current->comm, current->pid,
-				raw_smp_processor_id(),
-				latency);
+	if (!runqueue_is_locked()) {
+		if (tracing_thresh) {
+			printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical"
+			       " section violates %lu us threshold.\n",
+			       current->comm, current->pid,
+			       raw_smp_processor_id(),
+			       latency, nsecs_to_usecs(tracing_thresh));
+		} else {
+			printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us"
+			       " maximum-latency critical section.\n",
+			       current->comm, current->pid,
+			       raw_smp_processor_id(),
+			       latency);
+		}
 	}
 
 	max_sequence++;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2a012423f9d0..5948011006bc 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -106,19 +106,6 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
 
 	update_max_tr(tr, wakeup_task, wakeup_cpu);
 
-	if (tracing_thresh) {
-		printk(KERN_INFO "(%16s-%-5d|#%d):"
-			" %lu us wakeup latency violates %lu us threshold.\n",
-				wakeup_task->comm, wakeup_task->pid,
-				raw_smp_processor_id(),
-				latency, nsecs_to_usecs(tracing_thresh));
-	} else {
-		printk(KERN_INFO "(%16s-%-5d|#%d):"
-			" new %lu us maximum wakeup latency.\n",
-				wakeup_task->comm, wakeup_task->pid,
-				cpu, latency);
-	}
-
 out_unlock:
 	__wakeup_reset(tr);
 	spin_unlock_irqrestore(&wakeup_lock, flags);
-- 
cgit v1.2.3


From 06fa75ab566c50e01bfd7b055bde85cf9b1bc98a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: ftrace: add TRACE_STACK and TRACE_SPECIAL to selftest validation

The selftest validation code checks for valid entries in the trace buffer.
TRACE_STACK and TRACE_SPECIAL have been added to the code but not to
the validator. This patch adds the two to prevent them from flagging a
failure in the selftest.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_selftest.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 39dd452647da..92f4acb7740c 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,6 +9,8 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_FN:
 	case TRACE_CTX:
 	case TRACE_WAKE:
+	case TRACE_STACK:
+	case TRACE_SPECIAL:
 		return 1;
 	}
 	return 0;
@@ -180,7 +182,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 
 	/* we should only have one item */
 	if (!ret && count != 1) {
-		printk(KERN_CONT ".. filter failed ..");
+		printk(KERN_CONT ".. filter failed count=%ld ..", count);
 		ret = -1;
 		goto out;
 	}
-- 
cgit v1.2.3


From d05cdb25d80f06f77aa6bddb53cd1390d4d91a0b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: ftrace: fix dynamic ftrace selftest

With the adding of the configuration changes in the Makefile to prevent
tracing of functions in the ftrace code, all tracing of all the ftrace
code has been removed. Unfortunately, one of the selftests, relied on
a function to be traced. With the new change, the function was no longer
traced and the test failed.

This patch separates out the test function into its own file so that
we can add the "-pg" flag to the compilation of that function and the
adding of the mcount call to that function.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Makefile                 | 4 ++++
 kernel/trace/trace.h                  | 2 ++
 kernel/trace/trace_selftest.c         | 6 ------
 kernel/trace/trace_selftest_dynamic.c | 7 +++++++
 4 files changed, 13 insertions(+), 6 deletions(-)
 create mode 100644 kernel/trace/trace_selftest_dynamic.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c25a6cd6a529..d9efbbfa2bdf 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -4,6 +4,10 @@
 ifdef CONFIG_FTRACE
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+
+# selftest needs instrumentation
+CFLAGS_trace_selftest_dynamic.o = -pg
+obj-y += trace_selftest_dynamic.o
 endif
 
 obj-$(CONFIG_FTRACE) += libftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 96951a8d09a4..98cbfd05d754 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -244,6 +244,8 @@ extern int unregister_tracer_switch(struct tracer_switch_ops *ops);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+extern int DYN_FTRACE_TEST_NAME(void);
 #endif
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 92f4acb7740c..83e55a2000cc 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -107,14 +107,8 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
-#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
 #define __STR(x) #x
 #define STR(x) __STR(x)
-static int DYN_FTRACE_TEST_NAME(void)
-{
-	/* used to call mcount */
-	return 0;
-}
 
 /* Test dynamic code modification and ftrace filters */
 int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644
index 000000000000..54dd77cce5bf
--- /dev/null
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -0,0 +1,7 @@
+#include "trace.h"
+
+int DYN_FTRACE_TEST_NAME(void)
+{
+	/* used to call mcount */
+	return 0;
+}
-- 
cgit v1.2.3


From 4d9493c90f8e6e1b164aede3814010a290161abb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: ftrace: remove add-hoc code

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c                    | 47 ---------------------------------------
 kernel/sched_fair.c               |  3 ---
 kernel/trace/trace_sched_switch.c | 10 ++-------
 3 files changed, 2 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1ec3fb2efee6..ad95cca4e42e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2412,53 +2412,6 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-
-void ftrace_task(struct task_struct *p, void *__tr, void *__data)
-{
-#if 0
-	/*  
-	 * trace timeline tree
-	 */
-	__trace_special(__tr, __data,
-			p->pid, p->se.vruntime, p->se.sum_exec_runtime);
-#else
-	/*
-	 * trace balance metrics
-	 */
-	__trace_special(__tr, __data,
-			p->pid, p->se.avg_overlap, 0);
-#endif
-}
-
-void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data)
-{
-	struct task_struct *p;
-	struct sched_entity *se;
-	struct rb_node *curr;
-	struct rq *rq = __rq;
-
-	if (rq->cfs.curr) {
-		p = task_of(rq->cfs.curr);
-		ftrace_task(p, __tr, __data);
-	}
-	if (rq->cfs.next) {
-		p = task_of(rq->cfs.next);
-		ftrace_task(p, __tr, __data);
-	}
-
-	for (curr = first_fair(&rq->cfs); curr; curr = rb_next(curr)) {
-		se = rb_entry(curr, struct sched_entity, run_node);
-		if (!entity_is_task(se))
-			continue;
-
-		p = task_of(se);
-		ftrace_task(p, __tr, __data);
-	}
-}
-
-#endif
-
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index dc1856f10795..e24ecd39c4b8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1061,8 +1061,6 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE))
 		return 0;
 
-	ftrace_special(__LINE__, curr->se.avg_overlap, sync);
-	ftrace_special(__LINE__, p->se.avg_overlap, -1);
 	/*
 	 * If the currently running task will sleep within
 	 * a reasonable amount of time then attract this newly
@@ -1240,7 +1238,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(se == pse))
 		return;
 
-	ftrace_special(__LINE__, p->pid, se->last_wakeup);
 	cfs_rq_of(pse)->next = pse;
 
 	/*
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index bddf676914ed..5671db0e1827 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -36,11 +36,8 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1)) {
+	if (likely(disabled == 1))
 		tracing_sched_switch_trace(tr, data, prev, next, flags);
-		if (trace_flags & TRACE_ITER_SCHED_TREE)
-			ftrace_all_fair_tasks(__rq, tr, data);
-	}
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
@@ -65,11 +62,8 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
-	if (likely(disabled == 1)) {
+	if (likely(disabled == 1))
 		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
-		if (trace_flags & TRACE_ITER_SCHED_TREE)
-			ftrace_all_fair_tasks(__rq, tr, data);
-	}
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
-- 
cgit v1.2.3


From c5f888cae49dfe3e86d9d1e0dab2b63ceb057be3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:55 +0200
Subject: ftrace: irqsoff use raw_smp_processor_id

This patch changes the use of __get_cpu_var to explicitly calling
raw_smp_processor_id and using the per_cpu() macro. On some debug
configurations, the use of __get_cpu_var may cause ftrace to trigger
and this can cause problems with the irqsoff tracing.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_irqsoff.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d0c1748b1e2c..761f3ec66c50 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -204,14 +204,14 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 	if (likely(!tracer_enabled))
 		return;
 
-	if (__get_cpu_var(tracing_cpu))
+	cpu = raw_smp_processor_id();
+
+	if (per_cpu(tracing_cpu, cpu))
 		return;
 
-	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(!data) || unlikely(!head_page(data)) ||
-	    atomic_read(&data->disabled))
+	if (unlikely(!data) || atomic_read(&data->disabled))
 		return;
 
 	atomic_inc(&data->disabled);
@@ -225,7 +225,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 
 	trace_function(tr, data, ip, parent_ip, flags);
 
-	__get_cpu_var(tracing_cpu) = 1;
+	per_cpu(tracing_cpu, cpu) = 1;
 
 	atomic_dec(&data->disabled);
 }
@@ -238,16 +238,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	struct trace_array_cpu *data;
 	unsigned long flags;
 
+	cpu = raw_smp_processor_id();
 	/* Always clear the tracing cpu on stopping the trace */
-	if (unlikely(__get_cpu_var(tracing_cpu)))
-		__get_cpu_var(tracing_cpu) = 0;
+	if (unlikely(per_cpu(tracing_cpu, cpu)))
+		per_cpu(tracing_cpu, cpu) = 0;
 	else
 		return;
 
 	if (!tracer_enabled)
 		return;
 
-	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
 	if (unlikely(!data) || unlikely(!head_page(data)) ||
@@ -255,6 +255,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 		return;
 
 	atomic_inc(&data->disabled);
+
 	local_save_flags(flags);
 	trace_function(tr, data, ip, parent_ip, flags);
 	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
@@ -376,7 +377,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 static void __irqsoff_tracer_init(struct trace_array *tr)
 {
 	irqsoff_trace = tr;
-	/* make sure that the tracer is visibel */
+	/* make sure that the tracer is visible */
 	smp_wmb();
 
 	if (tr->ctrl)
-- 
cgit v1.2.3


From 92205c2343527a863d660360599a4bf8cede77b0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:55 +0200
Subject: ftrace: user raw_spin_lock in tracing

Lock debugging enabled cause huge performance problems for tracing. Having
the lock verification happening for every function that is called
because mcount calls spin_lock can cripple the system.

This patch converts the spin_locks used by ftrace into raw_spin_locks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 51 ++++++++++++++++++++++++++++++---------------------
 kernel/trace/trace.h |  2 +-
 2 files changed, 31 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 736dcfb3ed01..3009aafa4dde 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -133,7 +133,8 @@ static const char *trace_options[] = {
 	NULL
 };
 
-static DEFINE_SPINLOCK(ftrace_max_lock);
+static raw_spinlock_t ftrace_max_lock =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 
 /*
  * Copy the new maximum trace into the separate maximum-trace
@@ -335,7 +336,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	int i;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	spin_lock(&ftrace_max_lock);
+	__raw_spin_lock(&ftrace_max_lock);
 	/* clear out all the previous traces */
 	for_each_possible_cpu(i) {
 		data = tr->data[i];
@@ -344,7 +345,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	}
 
 	__update_max_tr(tr, tsk, cpu);
-	spin_unlock(&ftrace_max_lock);
+	__raw_spin_unlock(&ftrace_max_lock);
 }
 
 /**
@@ -360,7 +361,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	int i;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	spin_lock(&ftrace_max_lock);
+	__raw_spin_lock(&ftrace_max_lock);
 	for_each_possible_cpu(i)
 		tracing_reset(max_tr.data[i]);
 
@@ -368,7 +369,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tracing_reset(data);
 
 	__update_max_tr(tr, tsk, cpu);
-	spin_unlock(&ftrace_max_lock);
+	__raw_spin_unlock(&ftrace_max_lock);
 }
 
 int register_tracer(struct tracer *type)
@@ -652,13 +653,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 	struct trace_entry *entry;
 	unsigned long irq_flags;
 
-	spin_lock_irqsave(&data->lock, irq_flags);
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
 	entry->type		= TRACE_FN;
 	entry->fn.ip		= ip;
 	entry->fn.parent_ip	= parent_ip;
-	spin_unlock_irqrestore(&data->lock, irq_flags);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
 }
 
 void
@@ -678,14 +681,16 @@ __trace_special(void *__tr, void *__data,
 	struct trace_entry *entry;
 	unsigned long irq_flags;
 
-	spin_lock_irqsave(&data->lock, irq_flags);
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, 0);
 	entry->type		= TRACE_SPECIAL;
 	entry->special.arg1	= arg1;
 	entry->special.arg2	= arg2;
 	entry->special.arg3	= arg3;
-	spin_unlock_irqrestore(&data->lock, irq_flags);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
 
 	trace_wake_up();
 }
@@ -725,7 +730,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	struct trace_entry *entry;
 	unsigned long irq_flags;
 
-	spin_lock_irqsave(&data->lock, irq_flags);
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
 	entry->type		= TRACE_CTX;
@@ -736,7 +742,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.next_prio	= next->prio;
 	entry->ctx.next_state	= next->state;
 	__trace_stack(tr, data, flags, 4);
-	spin_unlock_irqrestore(&data->lock, irq_flags);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
 }
 
 void
@@ -749,7 +756,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	struct trace_entry *entry;
 	unsigned long irq_flags;
 
-	spin_lock_irqsave(&data->lock, irq_flags);
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, flags);
 	entry->type		= TRACE_WAKE;
@@ -760,7 +768,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.next_prio	= wakee->prio;
 	entry->ctx.next_state	= wakee->state;
 	__trace_stack(tr, data, flags, 5);
-	spin_unlock_irqrestore(&data->lock, irq_flags);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
 
 	trace_wake_up();
 }
@@ -1824,7 +1833,8 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 	if (err)
 		goto err_unlock;
 
-	spin_lock_irq(&ftrace_max_lock);
+	raw_local_irq_disable();
+	__raw_spin_lock(&ftrace_max_lock);
 	for_each_possible_cpu(cpu) {
 		/*
 		 * Increase/decrease the disabled counter if we are
@@ -1839,7 +1849,8 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 			atomic_dec(&global_trace.data[cpu]->disabled);
 		}
 	}
-	spin_unlock_irq(&ftrace_max_lock);
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_enable();
 
 	tracing_cpumask = tracing_cpumask_new;
 
@@ -2299,7 +2310,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	for_each_cpu_mask(cpu, mask) {
 		data = iter->tr->data[cpu];
-		spin_lock(&data->lock);
+		__raw_spin_lock(&data->lock);
 	}
 
 	while (find_next_entry_inc(iter) != NULL) {
@@ -2320,7 +2331,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	for_each_cpu_mask(cpu, mask) {
 		data = iter->tr->data[cpu];
-		spin_unlock(&data->lock);
+		__raw_spin_unlock(&data->lock);
 	}
 
 	for_each_cpu_mask(cpu, mask) {
@@ -2538,8 +2549,7 @@ static int trace_alloc_page(void)
 	/* Now that we successfully allocate a page per CPU, add them */
 	for_each_possible_cpu(i) {
 		data = global_trace.data[i];
-		spin_lock_init(&data->lock);
-		lockdep_set_class(&data->lock, &data->lock_key);
+		data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
@@ -2547,8 +2557,7 @@ static int trace_alloc_page(void)
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 		data = max_tr.data[i];
-		spin_lock_init(&data->lock);
-		lockdep_set_class(&data->lock, &data->lock_key);
+		data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 98cbfd05d754..25cba28eb9ba 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -76,7 +76,7 @@ struct trace_entry {
 struct trace_array_cpu {
 	struct list_head	trace_pages;
 	atomic_t		disabled;
-	spinlock_t		lock;
+	raw_spinlock_t		lock;
 	struct lock_class_key	lock_key;
 
 	/* these fields get copied into max-trace: */
-- 
cgit v1.2.3


From 1d09daa55d2e9bab7e7d30f0d05e5a7bc60b2a4a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:55 +0200
Subject: ftrace: use Makefile to remove tracing from lockdep

This patch removes the "notrace" annotation from lockdep and adds the debugging
files in the kernel director to those that should not be compiled with
"-pg" mcount tracing.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile  |  8 ++++++++
 kernel/lockdep.c | 26 +++++++++++++-------------
 2 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 7e344e7b0cb3..d2f80ea4cd9a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,14 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
+ifdef CONFIG_FTRACE
+# Do not profile debug utilities
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(if $(filter-out lockdep% %debug,$(basename $(notdir $@))), \
+	$(ORIG_CFLAGS), \
+	$(subst -pg,,$(ORIG_CFLAGS)))
+endif
+
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ac46847ba0c9..90a440cbd6de 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -271,14 +271,14 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
 	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))
 
-notrace void lockdep_off(void)
+void lockdep_off(void)
 {
 	current->lockdep_recursion++;
 }
 
 EXPORT_SYMBOL(lockdep_off);
 
-notrace void lockdep_on(void)
+void lockdep_on(void)
 {
 	current->lockdep_recursion--;
 }
@@ -1041,7 +1041,7 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
  * Return 1 otherwise and keep <backwards_match> unchanged.
  * Return 0 on error.
  */
-static noinline notrace int
+static noinline int
 find_usage_backwards(struct lock_class *source, unsigned int depth)
 {
 	struct lock_list *entry;
@@ -1591,7 +1591,7 @@ static inline int validate_chain(struct task_struct *curr,
  * We are building curr_chain_key incrementally, so double-check
  * it from scratch, to make sure that it's done correctly:
  */
-static notrace void check_chain_key(struct task_struct *curr)
+static void check_chain_key(struct task_struct *curr)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
 	struct held_lock *hlock, *prev_hlock = NULL;
@@ -1967,7 +1967,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 /*
  * Mark all held locks with a usage bit:
  */
-static notrace int
+static int
 mark_held_locks(struct task_struct *curr, int hardirq)
 {
 	enum lock_usage_bit usage_bit;
@@ -2014,7 +2014,7 @@ void early_boot_irqs_on(void)
 /*
  * Hardirqs will be enabled:
  */
-void notrace trace_hardirqs_on_caller(unsigned long a0)
+void trace_hardirqs_on_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 	unsigned long ip;
@@ -2060,7 +2060,7 @@ void notrace trace_hardirqs_on_caller(unsigned long a0)
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
-void notrace trace_hardirqs_on(void)
+void trace_hardirqs_on(void)
 {
 	trace_hardirqs_on_caller(CALLER_ADDR0);
 }
@@ -2069,7 +2069,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
 /*
  * Hardirqs were disabled:
  */
-void notrace trace_hardirqs_off_caller(unsigned long a0)
+void trace_hardirqs_off_caller(unsigned long a0)
 {
 	struct task_struct *curr = current;
 
@@ -2094,7 +2094,7 @@ void notrace trace_hardirqs_off_caller(unsigned long a0)
 }
 EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
-void notrace trace_hardirqs_off(void)
+void trace_hardirqs_off(void)
 {
 	trace_hardirqs_off_caller(CALLER_ADDR0);
 }
@@ -2260,7 +2260,7 @@ static inline int separate_irq_context(struct task_struct *curr,
 /*
  * Mark a lock with a usage bit, and validate the state transition:
  */
-static notrace int mark_lock(struct task_struct *curr, struct held_lock *this,
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
 			     enum lock_usage_bit new_bit)
 {
 	unsigned int new_mask = 1 << new_bit, ret = 1;
@@ -2663,7 +2663,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 /*
  * Check whether we follow the irq-flags state precisely:
  */
-static notrace void check_flags(unsigned long flags)
+static void check_flags(unsigned long flags)
 {
 #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
 	if (!debug_locks)
@@ -2700,7 +2700,7 @@ static notrace void check_flags(unsigned long flags)
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
  */
-notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  int trylock, int read, int check, unsigned long ip)
 {
 	unsigned long flags;
@@ -2723,7 +2723,7 @@ notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-notrace void lock_release(struct lockdep_map *lock, int nested,
+void lock_release(struct lockdep_map *lock, int nested,
 			  unsigned long ip)
 {
 	unsigned long flags;
-- 
cgit v1.2.3


From c1d2327b36f2261ffa8ff7227321ba900c7eee7f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:55 +0200
Subject: ftrace: restrict tracing to HAVE_FTRACE architectures

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ebc158e6d59a..f3005717bcd0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -32,6 +32,7 @@ config IRQSOFF_TRACER
 	default n
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on GENERIC_TIME
+	depends on HAVE_FTRACE
 	select TRACE_IRQFLAGS
 	select TRACING
 	select TRACER_MAX_TRACE
@@ -54,6 +55,7 @@ config PREEMPT_TRACER
 	default n
 	depends on GENERIC_TIME
 	depends on PREEMPT
+	depends on HAVE_FTRACE
 	select TRACING
 	select TRACER_MAX_TRACE
 	help
@@ -72,6 +74,7 @@ config PREEMPT_TRACER
 
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
+	depends on HAVE_FTRACE
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
@@ -81,6 +84,7 @@ config SCHED_TRACER
 
 config CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
+	depends on HAVE_FTRACE
 	select TRACING
 	select MARKERS
 	help
-- 
cgit v1.2.3


From 07a267cdd2fd7d1de9455b1e36a1635ace7276c7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:55 +0200
Subject: ftrace: add UNINTERRUPTIBLE state for kftraced on disable

When dynamic ftrace fails and sets itself disabled, the ftraced daemon
will go back to sleep everytime it wakes up. The setting of the
ftraced state to UNINTERRUPTIBLE is skipped in this process, and the
daemon takes up 100% of the CPU.  This patch makes sure the ftraced daemon
sets itself to UNINTERRUPTIBLE in that loop.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 281d97a3208c..40f64f7cd850 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -630,10 +630,10 @@ static int ftraced(void *ignore)
 {
 	unsigned long usecs;
 
-	set_current_state(TASK_INTERRUPTIBLE);
-
 	while (!kthread_should_stop()) {
 
+		set_current_state(TASK_INTERRUPTIBLE);
+
 		/* check once a second */
 		schedule_timeout(HZ);
 
@@ -667,8 +667,6 @@ static int ftraced(void *ignore)
 		wake_up_interruptible(&ftraced_waiters);
 
 		ftrace_shutdown_replenish();
-
-		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
 	return 0;
-- 
cgit v1.2.3


From d15f57f23eaba975309a153b23699cd0c0236974 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:56 +0200
Subject: ftrace: fix mutex unlock in trace output

If the trace output changes on reading the trace files, there is a chance
that the start function will return NULL. If the start function of a sequence
returns NULL the stop equivalent is not called. In this case, all locks
that are taken must be released even if they are released in the stop function.

This patch fixes a case that a mutex was not released on return of NULL
in the start sequence function.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3009aafa4dde..ea11f4ebfae1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -964,8 +964,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
 	mutex_lock(&trace_types_lock);
 
-	if (!current_trace || current_trace != iter->trace)
+	if (!current_trace || current_trace != iter->trace) {
+		mutex_unlock(&trace_types_lock);
 		return NULL;
+	}
 
 	atomic_inc(&trace_record_cmdline_disabled);
 
-- 
cgit v1.2.3


From 30afdcb1de0a37a2086145a82ca3febebe47d019 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:56 +0200
Subject: ftrace: selftest protect againt max flip

There is a slight race condition in the selftest where the max update
of the wakeup and irqs/preemption off tests can be doing a max update as
the buffers are being tested. If this happens the system can crash with
a GPF.

This patch adds the max update spinlock around the checking of the
buffers to prevent such a race.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_selftest.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 83e55a2000cc..395274e783b3 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -82,10 +82,12 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
  */
 static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 {
-	unsigned long cnt = 0;
-	int cpu;
-	int ret = 0;
+	unsigned long flags, cnt = 0;
+	int cpu, ret = 0;
 
+	/* Don't allow flipping of max traces now */
+	raw_local_irq_save(flags);
+	__raw_spin_lock(&ftrace_max_lock);
 	for_each_possible_cpu(cpu) {
 		if (!head_page(tr->data[cpu]))
 			continue;
@@ -96,6 +98,8 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 		if (ret)
 			break;
 	}
+	__raw_spin_unlock(&ftrace_max_lock);
+	raw_local_irq_restore(flags);
 
 	if (count)
 		*count = cnt;
-- 
cgit v1.2.3


From 72829bc3d63cdc592d8f7dd86ad3b3fe8900fb74 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 23 May 2008 21:37:28 +0200
Subject: ftrace: move enums to ftrace.h and make helper function global

picked from the mmiotracer patches to distangle the patch queues.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 24 ++++++++----------------
 kernel/trace/trace.h | 15 +++++++++++++++
 2 files changed, 23 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ea11f4ebfae1..0eef0503febd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -37,7 +37,7 @@ unsigned long __read_mostly	tracing_thresh;
 
 static int tracing_disabled = 1;
 
-static long
+long
 ns2usecs(cycle_t nsec)
 {
 	nsec += 500;
@@ -96,18 +96,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
 	return nsecs / 1000;
 }
 
-enum trace_type {
-	__TRACE_FIRST_TYPE = 0,
-
-	TRACE_FN,
-	TRACE_CTX,
-	TRACE_WAKE,
-	TRACE_STACK,
-	TRACE_SPECIAL,
-
-	__TRACE_LAST_TYPE
-};
-
 enum trace_flag_type {
 	TRACE_FLAG_IRQS_OFF		= 0x01,
 	TRACE_FLAG_NEED_RESCHED		= 0x02,
@@ -190,7 +178,7 @@ void *head_page(struct trace_array_cpu *data)
 	return page_address(page);
 }
 
-static int
+int
 trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
@@ -205,7 +193,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	va_end(ap);
 
 	/* If we can't write it all, don't bother writing anything */
-	if (ret > len)
+	if (ret >= len)
 		return 0;
 
 	s->len += ret;
@@ -638,7 +626,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
 	pc = preempt_count();
 
 	entry->preempt_count	= pc & 0xff;
-	entry->pid		= tsk->pid;
+	entry->pid		= (tsk) ? tsk->pid : 0;
 	entry->t		= ftrace_now(raw_smp_processor_id());
 	entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
 		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
@@ -1541,6 +1529,9 @@ static int trace_empty(struct trace_iterator *iter)
 
 static int print_trace_line(struct trace_iterator *iter)
 {
+	if (iter->trace && iter->trace->print_line)
+		return iter->trace->print_line(iter);
+
 	if (trace_flags & TRACE_ITER_BIN)
 		return print_bin_fmt(iter);
 
@@ -2162,6 +2153,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 		return -ENOMEM;
 
 	iter->tr = &global_trace;
+	iter->trace = current_trace;
 
 	filp->private_data = iter;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 25cba28eb9ba..b0ca7473671b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -6,6 +6,18 @@
 #include <linux/sched.h>
 #include <linux/clocksource.h>
 
+enum trace_type {
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,
+	TRACE_CTX,
+	TRACE_WAKE,
+	TRACE_STACK,
+	TRACE_SPECIAL,
+
+	__TRACE_LAST_TYPE
+};
+
 /*
  * Function trace entry - function address and parent function addres:
  */
@@ -130,6 +142,7 @@ struct tracer {
 	int			(*selftest)(struct tracer *trace,
 					    struct trace_array *tr);
 #endif
+	int			(*print_line)(struct trace_iterator *iter);
 	struct tracer		*next;
 	int			print_max;
 };
@@ -276,6 +289,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern long ns2usecs(cycle_t nsec);
 
 extern unsigned long trace_flags;
 
-- 
cgit v1.2.3


From d17d969160c18b631a19c2b34d260691402650f8 Mon Sep 17 00:00:00 2001
From: Ankita Garg <ankita@in.ibm.com>
Date: Mon, 12 May 2008 21:20:58 +0200
Subject: ftrace: fix conversion of task state to char in latency tracer

The conversion of task states to a character in the sched_switch tracer (part
of latency tracer infrastructure), seems to be incorrect. We currently do it
by indexing into the state_to_char array using the state value. The state
values do not map directly into the array index and are thus incorrect. The
following patch addresses this issue. This is also what is being done even
in the show_task() routine in kernel/sched.c

The patch has been compile and run tested.

Signed-off-by: Ankita Garg <ankita@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0eef0503febd..9197782d15c9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1208,6 +1208,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	char *comm;
 	int S, T;
 	int i;
+	unsigned state;
 
 	if (!next_entry)
 		next_entry = entry;
@@ -1238,11 +1239,11 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		break;
 	case TRACE_CTX:
 	case TRACE_WAKE:
-		S = entry->ctx.prev_state < sizeof(state_to_char) ?
-			state_to_char[entry->ctx.prev_state] : 'X';
 		T = entry->ctx.next_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.next_state] : 'X';
 
+		state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+		S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
 		comm = trace_find_cmdline(entry->ctx.next_pid);
 		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
 				 entry->ctx.prev_pid,
-- 
cgit v1.2.3


From 8487c23765b6e0444ec6b5f1530766d63fe68e35 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:58 +0200
Subject: ftrace: allow trace_pipe to block on all reads

We expect things like "cat" to block on reads to trace_pipe. That's what
trace_pipe is for.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9197782d15c9..fd4ecc292005 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2231,8 +2231,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	start = 0;
 
 	while (trace_empty(iter)) {
-		if (!(trace_flags & TRACE_ITER_BLOCK))
-			return -EWOULDBLOCK;
 		/*
 		 * This is a make-shift waitqueue. The reason we don't use
 		 * an actual wait queue is because:
-- 
cgit v1.2.3


From b5685aede3b7b65e72ddc73b951aa1f70798a614 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:58 +0200
Subject: ftrace: restore iterator trace in pipe read

The trace iterator is reset in the read. We still need to restore the tracer
that the trace_pipe was opened with.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fd4ecc292005..d141fc98f3a8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2202,6 +2202,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 {
 	struct trace_iterator *iter = filp->private_data;
 	struct trace_array_cpu *data;
+	struct trace_array *tr = iter->tr;
+	struct tracer *tracer = iter->trace;
 	static cpumask_t mask;
 	static int start;
 	unsigned long flags;
@@ -2274,7 +2276,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		cnt = PAGE_SIZE - 1;
 
 	memset(iter, 0, sizeof(*iter));
-	iter->tr = &global_trace;
+	iter->tr = tr;
+	iter->trace = tracer;
 	iter->pos = -1;
 
 	/*
-- 
cgit v1.2.3


From 845279972f1736c3463c9cebd1bad92a0a347176 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:58 +0200
Subject: ftrace: return EOF in trace_pipe on change of tracer

Break out of while loop with EOF when the current_trace changes.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d141fc98f3a8..2af940433e96 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2253,6 +2253,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		if (signal_pending(current))
 			return -EINTR;
 
+		if (iter->trace != current_trace)
+			return 0;
+
 		/*
 		 * We block until we read something and tracing is disabled.
 		 * We still block if tracing is disabled, but we have never
-- 
cgit v1.2.3


From 2dc8f09571a61d9cb3dc47bba6608689dfe15d16 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:58 +0200
Subject: ftrace: trace_pipe implement NONBLOCK

This patch implements "NONBLOCK" for trace_pipe. If the trace_pipe is opened
with O_NONBLOCK, then the trace_pipe read will not block when buffer is empty.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2af940433e96..3b4eaf36ed5d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2233,6 +2233,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	start = 0;
 
 	while (trace_empty(iter)) {
+
+		if ((filp->f_flags & O_NONBLOCK))
+			return -EAGAIN;
+
 		/*
 		 * This is a make-shift waitqueue. The reason we don't use
 		 * an actual wait queue is because:
-- 
cgit v1.2.3


From 05bd68c514579e007b46e4fa0461b78416a3f4c2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:59 +0200
Subject: ftrace: user proper API for setting RT prios in selftest

The wakeup selftest used an internal API for setting the test task priority.
This patch fixes it to use the proper API for performing such a task.

Thanks goes to Randy Dunlap for pointing out this build failure.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_selftest.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 395274e783b3..a5f6001c3332 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -410,11 +410,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 #ifdef CONFIG_SCHED_TRACER
 static int trace_wakeup_test_thread(void *data)
 {
-	struct completion *x = data;
-
 	/* Make this a RT thread, doesn't need to be too high */
+	struct sched_param param = { .sched_priority = 5 };
+	struct completion *x = data;
 
-	rt_mutex_setprio(current, MAX_RT_PRIO - 5);
+	sched_setscheduler(current, SCHED_FIFO, &param);
 
 	/* Make it know we have a new prio */
 	complete(x);
-- 
cgit v1.2.3


From a98a3c3fde3ae7614f19758a043691b6f59dac53 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:59 +0200
Subject: ftrace: trace_entries to dynamically change trace buffer size

This patch adds /debug/tracing/trace_entries that allows users to
see as well as modify the number of trace entries the buffers hold.

The number of entries only increments in ENTRIES_PER_PAGE which is
calculated by the size of an entry with the number of entries that
can fit in a page. The user does not need to use an exact size, but
the entries will be rounded to one of the increments.

Trying to set the entries to 0 will return with -EINVAL.

To avoid race conditions, the modification of the buffer size can only
be done when tracing is completely disabled (current_tracer == none).
A info message will be printed if a user tries to modify the buffer size
when not set to none.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 137 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3b4eaf36ed5d..4723e012151d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -35,6 +35,15 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
+/* dummy trace to disable tracing */
+static struct tracer no_tracer __read_mostly =
+{
+	.name		= "none",
+};
+
+static int trace_alloc_page(void);
+static int trace_free_page(void);
+
 static int tracing_disabled = 1;
 
 long
@@ -2364,6 +2373,70 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	return read;
 }
 
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	struct trace_array *tr = filp->private_data;
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%lu\n", tr->entries);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];
+
+	if (cnt > 63)
+		cnt = 63;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	val = simple_strtoul(buf, NULL, 10);
+
+	/* must have at least 1 entry */
+	if (!val)
+		return -EINVAL;
+
+	mutex_lock(&trace_types_lock);
+
+	if (current_trace != &no_tracer) {
+		cnt = -EBUSY;
+		pr_info("ftrace: set current_tracer to none"
+			" before modifying buffer size\n");
+		goto out;
+	}
+
+	if (val > global_trace.entries) {
+		while (global_trace.entries < val) {
+			if (trace_alloc_page()) {
+				cnt = -ENOMEM;
+				goto out;
+			}
+		}
+	} else {
+		/* include the number of entries in val (inc of page entries) */
+		while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
+			trace_free_page();
+	}
+
+	filp->f_pos += cnt;
+
+ out:
+	max_tr.entries = global_trace.entries;
+	mutex_unlock(&trace_types_lock);
+
+	return cnt;
+}
+
 static struct file_operations tracing_max_lat_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_max_lat_read,
@@ -2389,6 +2462,12 @@ static struct file_operations tracing_pipe_fops = {
 	.release	= tracing_release_pipe,
 };
 
+static struct file_operations tracing_entries_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_entries_read,
+	.write		= tracing_entries_write,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 static ssize_t
@@ -2500,6 +2579,12 @@ static __init void tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs "
 			   "'tracing_threash' entry\n");
 
+	entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+				    &global_trace, &tracing_entries_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'tracing_threash' entry\n");
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 				    &ftrace_update_tot_cnt,
@@ -2510,12 +2595,6 @@ static __init void tracer_init_debugfs(void)
 #endif
 }
 
-/* dummy trace to disable tracing */
-static struct tracer no_tracer __read_mostly =
-{
-	.name		= "none",
-};
-
 static int trace_alloc_page(void)
 {
 	struct trace_array_cpu *data;
@@ -2552,7 +2631,6 @@ static int trace_alloc_page(void)
 	/* Now that we successfully allocate a page per CPU, add them */
 	for_each_possible_cpu(i) {
 		data = global_trace.data[i];
-		data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
@@ -2560,7 +2638,6 @@ static int trace_alloc_page(void)
 
 #ifdef CONFIG_TRACER_MAX_TRACE
 		data = max_tr.data[i];
-		data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
 		list_add_tail(&page->lru, &data->trace_pages);
@@ -2579,6 +2656,55 @@ static int trace_alloc_page(void)
 	return -ENOMEM;
 }
 
+static int trace_free_page(void)
+{
+	struct trace_array_cpu *data;
+	struct page *page;
+	struct list_head *p;
+	int i;
+	int ret = 0;
+
+	/* free one page from each buffer */
+	for_each_possible_cpu(i) {
+		data = global_trace.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		__free_page(page);
+
+		tracing_reset(data);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+		data = max_tr.data[i];
+		p = data->trace_pages.next;
+		if (p == &data->trace_pages) {
+			/* should never happen */
+			WARN_ON(1);
+			tracing_disabled = 1;
+			ret = -1;
+			break;
+		}
+		page = list_entry(p, struct page, lru);
+		ClearPageLRU(page);
+		list_del(&page->lru);
+		__free_page(page);
+
+		tracing_reset(data);
+#endif
+	}
+	global_trace.entries -= ENTRIES_PER_PAGE;
+
+	return ret;
+}
+
 __init static int tracer_alloc_buffers(void)
 {
 	struct trace_array_cpu *data;
@@ -2609,6 +2735,9 @@ __init static int tracer_alloc_buffers(void)
 		/* use the LRU flag to differentiate the two buffers */
 		ClearPageLRU(page);
 
+		data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
 /* Only allocate if we are actually using the max trace */
 #ifdef CONFIG_TRACER_MAX_TRACE
 		array = (void *)__get_free_page(GFP_KERNEL);
-- 
cgit v1.2.3


From bb065afb8ebd07a03155502dba29ebf0f6fe67e8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:00 +0200
Subject: lockdep: update lockdep_recursion on graph_lock

With the introduction of ftrace, it is possible to recurse into
the lockdep functions via the mcount call. To prevent possible
lockups, updating the lockdep_recursion counter on grabbing the internal
lockdep_lock should prevent deadlocks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/lockdep.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 90a440cbd6de..65548eff029e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -82,6 +82,8 @@ static int graph_lock(void)
 		__raw_spin_unlock(&lockdep_lock);
 		return 0;
 	}
+	/* prevent any recursions within lockdep from causing deadlocks */
+	current->lockdep_recursion++;
 	return 1;
 }
 
@@ -90,6 +92,7 @@ static inline int graph_unlock(void)
 	if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
 		return DEBUG_LOCKS_WARN_ON(1);
 
+	current->lockdep_recursion--;
 	__raw_spin_unlock(&lockdep_lock);
 	return 0;
 }
-- 
cgit v1.2.3


From 93dcc6ea096c51cc30ad0f6647ed05fead2439bf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 May 2008 21:21:00 +0200
Subject: ftrace: simplify hexprint

simplify hex to ascii conversion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 23 +++++------------------
 1 file changed, 5 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4723e012151d..627e39936ea7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -248,19 +248,18 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
 }
 
 #define HEX_CHARS 17
+static const char hex2asc[] = "0123456789abcdef";
 
 static int
 trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
 {
 	unsigned char hex[HEX_CHARS];
-	unsigned char *data;
+	unsigned char *data = mem;
 	unsigned char byte;
 	int i, j;
 
 	BUG_ON(len >= HEX_CHARS);
 
-	data = mem;
-
 #ifdef __BIG_ENDIAN
 	for (i = 0, j = 0; i < len; i++) {
 #else
@@ -268,22 +267,10 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
 #endif
 		byte = data[i];
 
-		hex[j]   = byte & 0x0f;
-		if (hex[j] >= 10)
-			hex[j] += 'a' - 10;
-		else
-			hex[j] += '0';
-		j++;
-
-		hex[j] = byte >> 4;
-		if (hex[j] >= 10)
-			hex[j] += 'a' - 10;
-		else
-			hex[j] += '0';
-		j++;
+		hex[j++] = hex2asc[byte & 0x0f];
+		hex[j++] = hex2asc[byte >> 4];
 	}
-	hex[j] = ' ';
-	j++;
+	hex[j++] = ' ';
 
 	return trace_seq_putmem(s, hex, j);
 }
-- 
cgit v1.2.3


From afc2abc0ae4265730a0fc48618012193a09a1d10 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:00 +0200
Subject: ftrace: cleanups

no code changed.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 627e39936ea7..d6b60576f991 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1155,12 +1155,12 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
-	if (hardirq && softirq)
+	if (hardirq && softirq) {
 		trace_seq_putc(s, 'H');
-	else {
-		if (hardirq)
+	} else {
+		if (hardirq) {
 			trace_seq_putc(s, 'h');
-		else {
+		} else {
 			if (softirq)
 				trace_seq_putc(s, 's');
 			else
@@ -2177,8 +2177,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 		 * Always select as readable when in blocking mode
 		 */
 		return POLLIN | POLLRDNORM;
-	}
-	else {
+	} else {
 		if (!trace_empty(iter))
 			return POLLIN | POLLRDNORM;
 		poll_wait(filp, &trace_wait, poll_table);
-- 
cgit v1.2.3


From cffae437cdfbc8a5370d38cefbff1dfe34dad6ca Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:00 +0200
Subject: ftrace: simple clean ups

Andrew Morton mentioned some clean ups that should be done to ftrace.
This patch does some of the simple clean ups.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d6b60576f991..0a367362ba2f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -36,8 +36,7 @@ unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
 /* dummy trace to disable tracing */
-static struct tracer no_tracer __read_mostly =
-{
+static struct tracer no_tracer __read_mostly = {
 	.name		= "none",
 };
 
@@ -1906,8 +1905,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
 	int neg = 0;
 	int i;
 
-	if (cnt > 63)
-		cnt = 63;
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
 
 	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
@@ -1999,8 +1998,8 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
 	long val;
 	char buf[64];
 
-	if (cnt > 63)
-		cnt = 63;
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
 
 	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
@@ -2099,10 +2098,10 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
 	char buf[64];
 	int r;
 
-	r = snprintf(buf, 64, "%ld\n",
+	r = snprintf(buf, sizeof(buf), "%ld\n",
 		     *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
-	if (r > 64)
-		r = 64;
+	if (r > sizeof(buf))
+		r = sizeof(buf);
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
@@ -2114,8 +2113,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 	long val;
 	char buf[64];
 
-	if (cnt > 63)
-		cnt = 63;
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
 
 	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
@@ -2378,8 +2377,8 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	unsigned long val;
 	char buf[64];
 
-	if (cnt > 63)
-		cnt = 63;
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
 
 	if (copy_from_user(&buf, ubuf, cnt))
 		return -EFAULT;
-- 
cgit v1.2.3


From c6caeeb142cd3a03c46107aac45c8effc02bbadb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:00 +0200
Subject: ftrace: replace simple_strtoul with strict_strtoul

Andrew Morton suggested using strict_strtoul over simple_strtoul.
This patch replaces them in ftrace.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0a367362ba2f..290e9da7aa9a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -92,9 +92,16 @@ void trace_wake_up(void)
 
 static int __init set_nr_entries(char *str)
 {
+	unsigned long nr_entries;
+	int ret;
+
 	if (!str)
 		return 0;
-	trace_nr_entries = simple_strtoul(str, &str, 0);
+	ret = strict_strtoul(str, 0, &nr_entries);
+	/* nr_entries can not be zero */
+	if (ret < 0 || nr_entries == 0)
+		return 0;
+	trace_nr_entries = nr_entries;
 	return 1;
 }
 __setup("trace_entries=", set_nr_entries);
@@ -1995,8 +2002,9 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
 	struct trace_array *tr = filp->private_data;
-	long val;
 	char buf[64];
+	long val;
+	int ret;
 
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
@@ -2006,7 +2014,9 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
 
 	buf[cnt] = 0;
 
-	val = simple_strtoul(buf, NULL, 10);
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
 
 	val = !!val;
 
@@ -2110,8 +2120,9 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 		      size_t cnt, loff_t *ppos)
 {
 	long *ptr = filp->private_data;
-	long val;
 	char buf[64];
+	long val;
+	int ret;
 
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
@@ -2121,7 +2132,9 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 
 	buf[cnt] = 0;
 
-	val = simple_strtoul(buf, NULL, 10);
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
 
 	*ptr = val * 1000;
 
@@ -2376,6 +2389,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 {
 	unsigned long val;
 	char buf[64];
+	int ret;
 
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
@@ -2385,7 +2399,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 
 	buf[cnt] = 0;
 
-	val = simple_strtoul(buf, NULL, 10);
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
 
 	/* must have at least 1 entry */
 	if (!val)
-- 
cgit v1.2.3


From ab46428c6969d50ecf6f6e97b7a84abba6274368 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:00 +0200
Subject: ftrace: modulize the number of CPU buffers

Currently ftrace allocates a trace buffer for every possible CPU.
Work is being done to change it to only online CPUs and add hooks
to hotplug CPUS.

This patch lays out the infrastructure for such a change.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 290e9da7aa9a..5da391c5fb0d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -35,6 +35,12 @@
 unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly	tracing_thresh;
 
+static unsigned long __read_mostly	tracing_nr_buffers;
+static cpumask_t __read_mostly		tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu)	\
+	for_each_cpu_mask(cpu, tracing_buffer_mask)
+
 /* dummy trace to disable tracing */
 static struct tracer no_tracer __read_mostly = {
 	.name		= "none",
@@ -328,7 +334,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	WARN_ON_ONCE(!irqs_disabled());
 	__raw_spin_lock(&ftrace_max_lock);
 	/* clear out all the previous traces */
-	for_each_possible_cpu(i) {
+	for_each_tracing_cpu(i) {
 		data = tr->data[i];
 		flip_trace(max_tr.data[i], data);
 		tracing_reset(data);
@@ -352,7 +358,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	WARN_ON_ONCE(!irqs_disabled());
 	__raw_spin_lock(&ftrace_max_lock);
-	for_each_possible_cpu(i)
+	for_each_tracing_cpu(i)
 		tracing_reset(max_tr.data[i]);
 
 	flip_trace(max_tr.data[cpu], data);
@@ -398,7 +404,7 @@ int register_tracer(struct tracer *type)
 		 * internal tracing to verify that everything is in order.
 		 * If we fail, we do not register this tracer.
 		 */
-		for_each_possible_cpu(i) {
+		for_each_tracing_cpu(i) {
 			data = tr->data[i];
 			if (!head_page(data))
 				continue;
@@ -417,7 +423,7 @@ int register_tracer(struct tracer *type)
 			goto out;
 		}
 		/* Only reset on passing, to avoid touching corrupted buffers */
-		for_each_possible_cpu(i) {
+		for_each_tracing_cpu(i) {
 			data = tr->data[i];
 			if (!head_page(data))
 				continue;
@@ -847,7 +853,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu)
 	int next_cpu = -1;
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
+	for_each_tracing_cpu(cpu) {
 		if (!head_page(tr->data[cpu]))
 			continue;
 		ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
@@ -972,7 +978,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		iter->prev_ent = NULL;
 		iter->prev_cpu = -1;
 
-		for_each_possible_cpu(i) {
+		for_each_tracing_cpu(i) {
 			iter->next_idx[i] = 0;
 			iter->next_page[i] = NULL;
 		}
@@ -1089,7 +1095,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	if (type)
 		name = type->name;
 
-	for_each_possible_cpu(cpu) {
+	for_each_tracing_cpu(cpu) {
 		if (head_page(tr->data[cpu])) {
 			total += tr->data[cpu]->trace_idx;
 			if (tr->data[cpu]->trace_idx > tr->entries)
@@ -1519,7 +1525,7 @@ static int trace_empty(struct trace_iterator *iter)
 	struct trace_array_cpu *data;
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
+	for_each_tracing_cpu(cpu) {
 		data = iter->tr->data[cpu];
 
 		if (head_page(data) && data->trace_idx &&
@@ -1831,7 +1837,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 
 	raw_local_irq_disable();
 	__raw_spin_lock(&ftrace_max_lock);
-	for_each_possible_cpu(cpu) {
+	for_each_tracing_cpu(cpu) {
 		/*
 		 * Increase/decrease the disabled counter if we are
 		 * about to flip a bit in the cpumask:
@@ -2308,7 +2314,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	ftrace_enabled = 0;
 #endif
 	smp_wmb();
-	for_each_possible_cpu(cpu) {
+	for_each_tracing_cpu(cpu) {
 		data = iter->tr->data[cpu];
 
 		if (!head_page(data) || !data->trace_idx)
@@ -2605,7 +2611,7 @@ static int trace_alloc_page(void)
 	int i;
 
 	/* first allocate a page for each CPU */
-	for_each_possible_cpu(i) {
+	for_each_tracing_cpu(i) {
 		array = (void *)__get_free_page(GFP_KERNEL);
 		if (array == NULL) {
 			printk(KERN_ERR "tracer: failed to allocate page"
@@ -2630,7 +2636,7 @@ static int trace_alloc_page(void)
 	}
 
 	/* Now that we successfully allocate a page per CPU, add them */
-	for_each_possible_cpu(i) {
+	for_each_tracing_cpu(i) {
 		data = global_trace.data[i];
 		page = list_entry(pages.next, struct page, lru);
 		list_del_init(&page->lru);
@@ -2666,7 +2672,7 @@ static int trace_free_page(void)
 	int ret = 0;
 
 	/* free one page from each buffer */
-	for_each_possible_cpu(i) {
+	for_each_tracing_cpu(i) {
 		data = global_trace.data[i];
 		p = data->trace_pages.next;
 		if (p == &data->trace_pages) {
@@ -2717,8 +2723,12 @@ __init static int tracer_alloc_buffers(void)
 
 	global_trace.ctrl = tracer_enabled;
 
+	/* TODO: make the number of buffers hot pluggable with CPUS */
+	tracing_nr_buffers = num_possible_cpus();
+	tracing_buffer_mask = cpu_possible_map;
+
 	/* Allocate the first page for all buffers */
-	for_each_possible_cpu(i) {
+	for_each_tracing_cpu(i) {
 		data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
 		max_tr.data[i] = &per_cpu(max_data, i);
 
-- 
cgit v1.2.3


From 4fcdae83cebda24b519a89d3dd976081fff1ca80 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:00 +0200
Subject: ftrace: comment code

This is first installment of adding documentation to the ftrace.
Expect many more patches of this kind in the near future.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.h |   7 +++
 2 files changed, 141 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5da391c5fb0d..a102b11eacf2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -64,26 +64,79 @@ cycle_t ftrace_now(int cpu)
 	return cpu_clock(cpu);
 }
 
+/*
+ * The global_trace is the descriptor that holds the tracing
+ * buffers for the live tracing. For each CPU, it contains
+ * a link list of pages that will store trace entries. The
+ * page descriptor of the pages in the memory is used to hold
+ * the link list by linking the lru item in the page descriptor
+ * to each of the pages in the buffer per CPU.
+ *
+ * For each active CPU there is a data field that holds the
+ * pages for the buffer for that CPU. Each CPU has the same number
+ * of pages allocated for its buffer.
+ */
 static struct trace_array	global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
+/*
+ * The max_tr is used to snapshot the global_trace when a maximum
+ * latency is reached. Some tracers will use this to store a maximum
+ * trace while it continues examining live traces.
+ *
+ * The buffers for the max_tr are set up the same as the global_trace.
+ * When a snapshot is taken, the link list of the max_tr is swapped
+ * with the link list of the global_trace and the buffers are reset for
+ * the global_trace so the tracing can continue.
+ */
 static struct trace_array	max_tr;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
 
+/* tracer_enabled is used to toggle activation of a tracer */
 static int			tracer_enabled = 1;
+
+/*
+ * trace_nr_entries is the number of entries that is allocated
+ * for a buffer. Note, the number of entries is always rounded
+ * to ENTRIES_PER_PAGE.
+ */
 static unsigned long		trace_nr_entries = 65536UL;
 
+/* trace_types holds a link list of available tracers. */
 static struct tracer		*trace_types __read_mostly;
+
+/* current_trace points to the tracer that is currently active */
 static struct tracer		*current_trace __read_mostly;
+
+/*
+ * max_tracer_type_len is used to simplify the allocating of
+ * buffers to read userspace tracer names. We keep track of
+ * the longest tracer name registered.
+ */
 static int			max_tracer_type_len;
 
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ * This lock is also used to keep user access serialized.
+ * Accesses from userspace will grab this lock while userspace
+ * activities happen inside the kernel.
+ */
 static DEFINE_MUTEX(trace_types_lock);
+
+/* trace_wait is a waitqueue for tasks blocked on trace_poll */
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
+/* trace_flags holds iter_ctrl options */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
 
+/**
+ * trace_wake_up - wake up tasks waiting for trace input
+ *
+ * Simply wakes up any task that is blocked on the trace_wait
+ * queue. These is used with trace_poll for tasks polling the trace.
+ */
 void trace_wake_up(void)
 {
 	/*
@@ -117,6 +170,14 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
 	return nsecs / 1000;
 }
 
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ *  IRQS_OFF	- interrupts were disabled
+ *  NEED_RESCED - reschedule is requested
+ *  HARDIRQ	- inside an interrupt handler
+ *  SOFTIRQ	- inside a softirq handler
+ */
 enum trace_flag_type {
 	TRACE_FLAG_IRQS_OFF		= 0x01,
 	TRACE_FLAG_NEED_RESCHED		= 0x02,
@@ -124,10 +185,14 @@ enum trace_flag_type {
 	TRACE_FLAG_SOFTIRQ		= 0x08,
 };
 
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
 #define TRACE_ITER_SYM_MASK \
 	(TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
 
-/* These must match the bit postions above */
+/* These must match the bit postions in trace_iterator_flags */
 static const char *trace_options[] = {
 	"print-parent",
 	"sym-offset",
@@ -142,6 +207,15 @@ static const char *trace_options[] = {
 	NULL
 };
 
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ */
 static raw_spinlock_t ftrace_max_lock =
 	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 
@@ -172,6 +246,13 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tracing_record_cmdline(current);
 }
 
+/**
+ * check_pages - integrity check of trace buffers
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted. TODO: configure to disable this because it adds
+ * a bit of overhead.
+ */
 void check_pages(struct trace_array_cpu *data)
 {
 	struct page *page, *tmp;
@@ -185,6 +266,13 @@ void check_pages(struct trace_array_cpu *data)
 	}
 }
 
+/**
+ * head_page - page address of the first page in per_cpu buffer.
+ *
+ * head_page returns the page address of the first page in
+ * a per_cpu buffer. This also preforms various consistency
+ * checks to make sure the buffer has not been corrupted.
+ */
 void *head_page(struct trace_array_cpu *data)
 {
 	struct page *page;
@@ -199,6 +287,17 @@ void *head_page(struct trace_array_cpu *data)
 	return page_address(page);
 }
 
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
 int
 trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 {
@@ -222,6 +321,16 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	return len;
 }
 
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
 static int
 trace_seq_puts(struct trace_seq *s, const char *str)
 {
@@ -304,6 +413,13 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
 	trace_seq_reset(s);
 }
 
+/*
+ * flip the trace buffers between two trace descriptors.
+ * This usually is the buffers between the global_trace and
+ * the max_tr to record a snapshot of a current trace.
+ *
+ * The ftrace_max_lock must be held.
+ */
 static void
 flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
 {
@@ -325,6 +441,15 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
 	check_pages(tr2);
 }
 
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ */
 void
 update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
@@ -349,6 +474,8 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
  * @tr - tracer
  * @tsk - task with the latency
  * @cpu - the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
  */
 void
 update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -368,6 +495,12 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	__raw_spin_unlock(&ftrace_max_lock);
 }
 
+/**
+ * register_tracer - register a tracer with the ftrace system.
+ * @type - the plugin for the tracer
+ *
+ * Register a new plugin tracer.
+ */
 int register_tracer(struct tracer *type)
 {
 	struct tracer *t;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b0ca7473671b..21c29ee13e53 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -294,6 +294,13 @@ extern long ns2usecs(cycle_t nsec);
 
 extern unsigned long trace_flags;
 
+/*
+ * trace_iterator_flags is an enumeration that defines bit
+ * positions into trace_flags that controls the output.
+ *
+ * NOTE: These bits must match the trace_options array in
+ *       trace.c.
+ */
 enum trace_iterator_flags {
 	TRACE_ITER_PRINT_PARENT		= 0x01,
 	TRACE_ITER_SYM_OFFSET		= 0x02,
-- 
cgit v1.2.3


From 25b0b44a1c732ccfc58095cdd8438955a0a19fff Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:00 +0200
Subject: ftrace: fix comm on function trace output

In cleaning up of the sched_switch code, the function trace recording
of task comms was removed. This patch adds back the recording of comms
for function trace. The output of ftrace now has the task comm instead
of <...>.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c              | 7 ++++++-
 kernel/trace/trace.h              | 2 ++
 kernel/trace/trace_functions.c    | 2 ++
 kernel/trace/trace_sched_switch.c | 7 +++++--
 4 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a102b11eacf2..1281969103b8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -620,7 +620,12 @@ static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
 static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
 static int cmdline_idx;
 static DEFINE_SPINLOCK(trace_cmdline_lock);
-atomic_t trace_record_cmdline_disabled;
+
+/* trace in all context switches */
+atomic_t trace_record_cmdline_enabled __read_mostly;
+
+/* temporary disable recording */
+atomic_t trace_record_cmdline_disabled __read_mostly;
 
 static void trace_init_cmdlines(void)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 21c29ee13e53..8991c5efcc74 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -216,6 +216,8 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 extern unsigned long tracing_max_latency;
 extern unsigned long tracing_thresh;
 
+extern atomic_t trace_record_cmdline_enabled;
+
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 4165d34bd28a..0a084656d7cf 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -29,12 +29,14 @@ static void function_reset(struct trace_array *tr)
 static void start_function_trace(struct trace_array *tr)
 {
 	function_reset(tr);
+	atomic_inc(&trace_record_cmdline_enabled);
 	tracing_start_function_trace();
 }
 
 static void stop_function_trace(struct trace_array *tr)
 {
 	tracing_stop_function_trace();
+	atomic_dec(&trace_record_cmdline_enabled);
 }
 
 static void function_trace_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5671db0e1827..a3376478fc2c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -29,8 +29,6 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 	if (!tracer_enabled)
 		return;
 
-	tracing_record_cmdline(prev);
-
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
@@ -73,6 +71,9 @@ void
 ftrace_ctx_switch(void *__rq, struct task_struct *prev,
 		  struct task_struct *next)
 {
+	if (unlikely(atomic_read(&trace_record_cmdline_enabled)))
+		tracing_record_cmdline(prev);
+
 	/*
 	 * If tracer_switch_func only points to the local
 	 * switch func, it still needs the ptr passed to it.
@@ -134,11 +135,13 @@ static void sched_switch_reset(struct trace_array *tr)
 static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
+	atomic_inc(&trace_record_cmdline_enabled);
 	tracer_enabled = 1;
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
+	atomic_dec(&trace_record_cmdline_enabled);
 	tracer_enabled = 0;
 }
 
-- 
cgit v1.2.3


From 53d0aa773053ab18287781e25d52c5faf9e0e09e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:01 +0200
Subject: ftrace: add logic to record overruns

This patch sets up the infrastructure to record overruns of the tracing
buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 16 +++++++++++-----
 kernel/trace/trace.h |  6 +++++-
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1281969103b8..b9126ef46a9e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -609,6 +609,7 @@ void unregister_tracer(struct tracer *type)
 void tracing_reset(struct trace_array_cpu *data)
 {
 	data->trace_idx = 0;
+	data->overrun = 0;
 	data->trace_head = data->trace_tail = head_page(data);
 	data->trace_head_idx = 0;
 	data->trace_tail_idx = 0;
@@ -750,6 +751,7 @@ tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
 	if (data->trace_head == data->trace_tail &&
 	    idx_next == data->trace_tail_idx) {
 		/* overrun */
+		data->overrun++;
 		data->trace_tail_idx++;
 		if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
 			data->trace_tail =
@@ -2353,8 +2355,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 {
 	struct trace_iterator *iter = filp->private_data;
 	struct trace_array_cpu *data;
-	struct trace_array *tr = iter->tr;
-	struct tracer *tracer = iter->trace;
 	static cpumask_t mask;
 	static int start;
 	unsigned long flags;
@@ -2433,10 +2433,11 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	if (cnt >= PAGE_SIZE)
 		cnt = PAGE_SIZE - 1;
 
-	memset(iter, 0, sizeof(*iter));
-	iter->tr = tr;
-	iter->trace = tracer;
+	/* reset all but tr, trace, and overruns */
 	iter->pos = -1;
+	memset(&iter->seq, 0,
+	       sizeof(struct trace_iterator) -
+	       offsetof(struct trace_iterator, seq));
 
 	/*
 	 * We need to stop all tracing on all CPUS to read the
@@ -2465,6 +2466,11 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	for_each_cpu_mask(cpu, mask) {
 		data = iter->tr->data[cpu];
 		__raw_spin_lock(&data->lock);
+
+		if (data->overrun > iter->last_overrun[cpu])
+			iter->overrun[cpu] +=
+				data->overrun - iter->last_overrun[cpu];
+		iter->last_overrun[cpu] = data->overrun;
 	}
 
 	while (find_next_entry_inc(iter) != NULL) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8991c5efcc74..c1ec134ac356 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -97,6 +97,7 @@ struct trace_array_cpu {
 	void			*trace_head; /* producer */
 	void			*trace_tail; /* consumer */
 	unsigned long		trace_idx;
+	unsigned long		overrun;
 	unsigned long		saved_latency;
 	unsigned long		critical_start;
 	unsigned long		critical_end;
@@ -157,10 +158,13 @@ struct trace_seq {
  * results to users and which routines might sleep, etc:
  */
 struct trace_iterator {
-	struct trace_seq	seq;
 	struct trace_array	*tr;
 	struct tracer		*trace;
+	long			last_overrun[NR_CPUS];
+	long			overrun[NR_CPUS];
 
+	/* The below is zeroed out in pipe_read */
+	struct trace_seq	seq;
 	struct trace_entry	*ent;
 	int			cpu;
 
-- 
cgit v1.2.3


From 107bad8bef5ab2c3a3bff7648c18c9dc3abdc13b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:01 +0200
Subject: ftrace: add trace pipe header pluggin

This patch adds a method for open_pipe and open_read to the pluggins
so that they can add a header to the trace pipe call.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 38 +++++++++++++++++++++++++++++++-------
 kernel/trace/trace.h |  5 +++++
 2 files changed, 36 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b9126ef46a9e..32f9106d612c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2307,11 +2307,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	if (!iter)
 		return -ENOMEM;
 
+	mutex_lock(&trace_types_lock);
 	iter->tr = &global_trace;
 	iter->trace = current_trace;
-
 	filp->private_data = iter;
 
+	if (iter->trace->pipe_open)
+		iter->trace->pipe_open(iter);
+	mutex_unlock(&trace_types_lock);
+
 	return 0;
 }
 
@@ -2380,13 +2384,24 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		return cnt;
 	}
 
+	mutex_lock(&trace_types_lock);
+	if (iter->trace->read) {
+		ret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+		if (ret) {
+			read = ret;
+			goto out;
+		}
+	}
+
 	trace_seq_reset(&iter->seq);
 	start = 0;
 
 	while (trace_empty(iter)) {
 
-		if ((filp->f_flags & O_NONBLOCK))
-			return -EAGAIN;
+		if ((filp->f_flags & O_NONBLOCK)) {
+			read = -EAGAIN;
+			goto out;
+		}
 
 		/*
 		 * This is a make-shift waitqueue. The reason we don't use
@@ -2400,16 +2415,22 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		set_current_state(TASK_INTERRUPTIBLE);
 		iter->tr->waiter = current;
 
+		mutex_unlock(&trace_types_lock);
+
 		/* sleep for one second, and try again. */
 		schedule_timeout(HZ);
 
+		mutex_lock(&trace_types_lock);
+
 		iter->tr->waiter = NULL;
 
-		if (signal_pending(current))
-			return -EINTR;
+		if (signal_pending(current)) {
+			read = -EINTR;
+			goto out;
+		}
 
 		if (iter->trace != current_trace)
-			return 0;
+			goto out;
 
 		/*
 		 * We block until we read something and tracing is disabled.
@@ -2428,7 +2449,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	/* stop when tracing is finished */
 	if (trace_empty(iter))
-		return 0;
+		goto out;
 
 	if (cnt >= PAGE_SIZE)
 		cnt = PAGE_SIZE - 1;
@@ -2518,6 +2539,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	if (ret)
 		read = -EFAULT;
 
+out:
+	mutex_unlock(&trace_types_lock);
+
 	return read;
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c1ec134ac356..ee53d706066f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -135,9 +135,13 @@ struct tracer {
 	void			(*init)(struct trace_array *tr);
 	void			(*reset)(struct trace_array *tr);
 	void			(*open)(struct trace_iterator *iter);
+	void			(*pipe_open)(struct trace_iterator *iter);
 	void			(*close)(struct trace_iterator *iter);
 	void			(*start)(struct trace_iterator *iter);
 	void			(*stop)(struct trace_iterator *iter);
+	ssize_t			(*read)(struct trace_iterator *iter,
+					struct file *filp, char __user *ubuf,
+					size_t cnt, loff_t *ppos);
 	void			(*ctrl_update)(struct trace_array *tr);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	int			(*selftest)(struct tracer *trace,
@@ -160,6 +164,7 @@ struct trace_seq {
 struct trace_iterator {
 	struct trace_array	*tr;
 	struct tracer		*trace;
+	void			*private;
 	long			last_overrun[NR_CPUS];
 	long			overrun[NR_CPUS];
 
-- 
cgit v1.2.3


From 4823ed7eadf35e4b57ce581327e21d39585f1f32 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:01 +0200
Subject: ftrace: fix setting of pos in read_pipe

In resetting the iterator in read_pipe, the reset of pos was
postitioned in the wrong location with respect to the memset
operation. The current code sets pos, incorrectly, to zero.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 32f9106d612c..49e16630628a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2455,10 +2455,10 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		cnt = PAGE_SIZE - 1;
 
 	/* reset all but tr, trace, and overruns */
-	iter->pos = -1;
 	memset(&iter->seq, 0,
 	       sizeof(struct trace_iterator) -
 	       offsetof(struct trace_iterator, seq));
+	iter->pos = -1;
 
 	/*
 	 * We need to stop all tracing on all CPUS to read the
-- 
cgit v1.2.3


From 9fe068e92f6290e89e19adc521441661a1229f00 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:02 +0200
Subject: ftrace: trace faster

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 49e16630628a..ca0d6ff74c11 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2417,8 +2417,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 		mutex_unlock(&trace_types_lock);
 
-		/* sleep for one second, and try again. */
-		schedule_timeout(HZ);
+		/* sleep for 100 msecs, and try again. */
+		schedule_timeout(HZ/10);
 
 		mutex_lock(&trace_types_lock);
 
-- 
cgit v1.2.3


From a4feb8348b62fe76a63cdb5569f5c920f5283c06 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:02 +0200
Subject: ftrace: special stacktrace

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ca0d6ff74c11..c232d8248a09 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -808,29 +808,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 		trace_function(tr, data, ip, parent_ip, flags);
 }
 
-void
-__trace_special(void *__tr, void *__data,
-		unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-	struct trace_array_cpu *data = __data;
-	struct trace_array *tr = __tr;
-	struct trace_entry *entry;
-	unsigned long irq_flags;
-
-	raw_local_irq_save(irq_flags);
-	__raw_spin_lock(&data->lock);
-	entry			= tracing_get_trace_entry(tr, data);
-	tracing_generic_entry_update(entry, 0);
-	entry->type		= TRACE_SPECIAL;
-	entry->special.arg1	= arg1;
-	entry->special.arg2	= arg2;
-	entry->special.arg3	= arg3;
-	__raw_spin_unlock(&data->lock);
-	raw_local_irq_restore(irq_flags);
-
-	trace_wake_up();
-}
-
 void __trace_stack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags,
@@ -856,6 +833,30 @@ void __trace_stack(struct trace_array *tr,
 	save_stack_trace(&trace);
 }
 
+void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array_cpu *data = __data;
+	struct trace_array *tr = __tr;
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_SPECIAL;
+	entry->special.arg1	= arg1;
+	entry->special.arg2	= arg2;
+	entry->special.arg3	= arg3;
+	__trace_stack(tr, data, irq_flags, 4);
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
+
+	trace_wake_up();
+}
+
 void
 tracing_sched_switch_trace(struct trace_array *tr,
 			   struct trace_array_cpu *data,
-- 
cgit v1.2.3


From 2bb6f8d6389cbfadd657e7dc069f6986abf35e4f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:02 +0200
Subject: ftrace: use raw_smp_processor_id for mcount functions

Due to debug hooks in the kernel that can change the way smp_processor_id
works, use raw_smp_processor_id in mcount called functions (namely
ftrace_record_ip). Currently we annotate most debug functions from calling
mcount, but we should not rely on that to prevent kernel lockups.

This patch uses the raw_smp_processor_id to prevent a recusive crash
that can happen if a debug hook in smp_processor_id calls mcount.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 40f64f7cd850..af5ad8949abb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -267,6 +267,7 @@ ftrace_record_ip(unsigned long ip)
 	unsigned long key;
 	int resched;
 	int atomic;
+	int cpu;
 
 	if (!ftrace_enabled || ftrace_disabled)
 		return;
@@ -274,9 +275,15 @@ ftrace_record_ip(unsigned long ip)
 	resched = need_resched();
 	preempt_disable_notrace();
 
-	/* We simply need to protect against recursion */
-	__get_cpu_var(ftrace_shutdown_disable_cpu)++;
-	if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1)
+	/*
+	 * We simply need to protect against recursion.
+	 * Use the the raw version of smp_processor_id and not
+	 * __get_cpu_var which can call debug hooks that can
+	 * cause a recursive crash here.
+	 */
+	cpu = raw_smp_processor_id();
+	per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
+	if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
 		goto out;
 
 	if (unlikely(ftrace_record_suspend))
@@ -317,7 +324,7 @@ ftrace_record_ip(unsigned long ip)
  out_unlock:
 	spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
  out:
-	__get_cpu_var(ftrace_shutdown_disable_cpu)--;
+	per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
 
 	/* prevent recursion with scheduler */
 	if (resched)
-- 
cgit v1.2.3


From 6c6c27969a4c6024e6c8838829546c02aaddca18 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:21:02 +0200
Subject: ftrace: add readpos to struct trace_seq; add trace_seq_to_user()

Refactor code from tracing_read_pipe() and create trace_seq_to_user().
Moved trace_seq_reset() call before iter->trace->read() call so that
when all leftover data is returned, trace_seq is reset automatically.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 73 ++++++++++++++++++++++++++--------------------------
 kernel/trace/trace.h |  3 +++
 2 files changed, 39 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c232d8248a09..82ced406aacf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -400,6 +400,26 @@ static void
 trace_seq_reset(struct trace_seq *s)
 {
 	s->len = 0;
+	s->readpos = 0;
+}
+
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+{
+	int len;
+	int ret;
+
+	if (s->len <= s->readpos)
+		return -EBUSY;
+
+	len = s->len - s->readpos;
+	if (cnt > len)
+		cnt = len;
+	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+	if (ret)
+		return -EFAULT;
+
+	s->readpos += len;
+	return cnt;
 }
 
 static void
@@ -2361,46 +2381,32 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	struct trace_iterator *iter = filp->private_data;
 	struct trace_array_cpu *data;
 	static cpumask_t mask;
-	static int start;
 	unsigned long flags;
 #ifdef CONFIG_FTRACE
 	int ftrace_save;
 #endif
-	int read = 0;
 	int cpu;
-	int len;
-	int ret;
+	ssize_t sret;
 
 	/* return any leftover data */
-	if (iter->seq.len > start) {
-		len = iter->seq.len - start;
-		if (cnt > len)
-			cnt = len;
-		ret = copy_to_user(ubuf, iter->seq.buffer + start, cnt);
-		if (ret)
-			cnt = -EFAULT;
-
-		start += len;
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (sret != -EBUSY)
+		return sret;
+	sret = 0;
 
-		return cnt;
-	}
+	trace_seq_reset(&iter->seq);
 
 	mutex_lock(&trace_types_lock);
 	if (iter->trace->read) {
-		ret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
-		if (ret) {
-			read = ret;
+		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+		if (sret)
 			goto out;
-		}
 	}
 
-	trace_seq_reset(&iter->seq);
-	start = 0;
-
 	while (trace_empty(iter)) {
 
 		if ((filp->f_flags & O_NONBLOCK)) {
-			read = -EAGAIN;
+			sret = -EAGAIN;
 			goto out;
 		}
 
@@ -2426,7 +2432,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		iter->tr->waiter = NULL;
 
 		if (signal_pending(current)) {
-			read = -EINTR;
+			sret = -EINTR;
 			goto out;
 		}
 
@@ -2496,6 +2502,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	}
 
 	while (find_next_entry_inc(iter) != NULL) {
+		int ret;
 		int len = iter->seq.len;
 
 		ret = print_trace_line(iter);
@@ -2526,24 +2533,16 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	local_irq_restore(flags);
 
 	/* Now copy what we have to the user */
-	read = iter->seq.len;
-	if (read > cnt)
-		read = cnt;
-
-	ret = copy_to_user(ubuf, iter->seq.buffer, read);
-
-	if (read < iter->seq.len)
-		start = read;
-	else
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (iter->seq.readpos >= iter->seq.len)
 		trace_seq_reset(&iter->seq);
-
-	if (ret)
-		read = -EFAULT;
+	if (sret == -EBUSY)
+		sret = 0;
 
 out:
 	mutex_unlock(&trace_types_lock);
 
-	return read;
+	return sret;
 }
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ee53d706066f..8845033ab49d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -155,6 +155,7 @@ struct tracer {
 struct trace_seq {
 	unsigned char		buffer[PAGE_SIZE];
 	unsigned int		len;
+	unsigned int		readpos;
 };
 
 /*
@@ -301,6 +302,8 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 
 extern void *head_page(struct trace_array_cpu *data);
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
 extern long ns2usecs(cycle_t nsec);
 
 extern unsigned long trace_flags;
-- 
cgit v1.2.3


From 3eefae994d9224fb7771a3ddb683868363c23510 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: ftrace: limit trace entries

Currently there is no protection from the root user to use up all of
memory for trace buffers. If the root user allocates too many entries,
the OOM killer might start kill off all tasks.

This patch adds an algorith to check the following condition:

 pages_requested > (freeable_memory + current_trace_buffer_pages) / 4

If the above is met then the allocation fails. The above prevents more
than 1/4th of freeable memory from being used by trace buffers.

To determine the freeable_memory, I made determine_dirtyable_memory in
mm/page-writeback.c global.

Special thanks goes to Peter Zijlstra for suggesting the above calculation.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 82ced406aacf..2824cf48cdca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -27,6 +27,7 @@
 #include <linux/poll.h>
 #include <linux/gfp.h>
 #include <linux/fs.h>
+#include <linux/writeback.h>
 
 #include <linux/stacktrace.h>
 
@@ -51,6 +52,8 @@ static int trace_free_page(void);
 
 static int tracing_disabled = 1;
 
+static unsigned long tracing_pages_allocated;
+
 long
 ns2usecs(cycle_t nsec)
 {
@@ -2591,12 +2594,41 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	}
 
 	if (val > global_trace.entries) {
+		long pages_requested;
+		unsigned long freeable_pages;
+
+		/* make sure we have enough memory before mapping */
+		pages_requested =
+			(val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
+
+		/* account for each buffer (and max_tr) */
+		pages_requested *= tracing_nr_buffers * 2;
+
+		/* Check for overflow */
+		if (pages_requested < 0) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
+		freeable_pages = determine_dirtyable_memory();
+
+		/* we only allow to request 1/4 of useable memory */
+		if (pages_requested >
+		    ((freeable_pages + tracing_pages_allocated) / 4)) {
+			cnt = -ENOMEM;
+			goto out;
+		}
+
 		while (global_trace.entries < val) {
 			if (trace_alloc_page()) {
 				cnt = -ENOMEM;
 				goto out;
 			}
+			/* double check that we don't go over the known pages */
+			if (tracing_pages_allocated > pages_requested)
+				break;
 		}
+
 	} else {
 		/* include the number of entries in val (inc of page entries) */
 		while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
@@ -2776,6 +2808,7 @@ static int trace_alloc_page(void)
 	struct page *page, *tmp;
 	LIST_HEAD(pages);
 	void *array;
+	unsigned pages_allocated = 0;
 	int i;
 
 	/* first allocate a page for each CPU */
@@ -2787,6 +2820,7 @@ static int trace_alloc_page(void)
 			goto free_pages;
 		}
 
+		pages_allocated++;
 		page = virt_to_page(array);
 		list_add(&page->lru, &pages);
 
@@ -2798,6 +2832,7 @@ static int trace_alloc_page(void)
 			       "for trace buffer!\n");
 			goto free_pages;
 		}
+		pages_allocated++;
 		page = virt_to_page(array);
 		list_add(&page->lru, &pages);
 #endif
@@ -2819,6 +2854,7 @@ static int trace_alloc_page(void)
 		SetPageLRU(page);
 #endif
 	}
+	tracing_pages_allocated += pages_allocated;
 	global_trace.entries += ENTRIES_PER_PAGE;
 
 	return 0;
@@ -2853,6 +2889,8 @@ static int trace_free_page(void)
 		page = list_entry(p, struct page, lru);
 		ClearPageLRU(page);
 		list_del(&page->lru);
+		tracing_pages_allocated--;
+		tracing_pages_allocated--;
 		__free_page(page);
 
 		tracing_reset(data);
-- 
cgit v1.2.3


From dc102a8fae2d0d6bf5223fc549247f2e23959ae6 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Mon, 12 May 2008 21:21:09 +0200
Subject: Markers - remove extra format argument

Denys Vlasenko <vda.linux@googlemail.com> :

> Not in this patch, but I noticed:
>
> #define __trace_mark(name, call_private, format, args...)               \
>         do {                                                            \
>                 static const char __mstrtab_##name[]                    \
>                 __attribute__((section("__markers_strings")))           \
>                 = #name "\0" format;                                    \
>                 static struct marker __mark_##name                      \
>                 __attribute__((section("__markers"), aligned(8))) =     \
>                 { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],   \
>                 0, 0, marker_probe_cb,                                  \
>                 { __mark_empty_function, NULL}, NULL };                 \
>                 __mark_check_format(format, ## args);                   \
>                 if (unlikely(__mark_##name.state)) {                    \
>                         (*__mark_##name.call)                           \
>                                 (&__mark_##name, call_private,          \
>                                 format, ## args);                       \
>                 }                                                       \
>         } while (0)
>
> In this call:
>
>                         (*__mark_##name.call)                           \
>                                 (&__mark_##name, call_private,          \
>                                 format, ## args);                       \
>
> you make gcc allocate duplicate format string. You can use
> &__mstrtab_##name[sizeof(#name)] instead since it holds the same string,
> or drop ", format," above and "const char *fmt" from here:
>
>         void (*call)(const struct marker *mdata,        /* Probe wrapper */
>                 void *call_private, const char *fmt, ...);
>
> since mdata->format is the same and all callees which need it can take it there.

Very good point. I actually thought about dropping it, since it would
remove an unnecessary argument from the stack. And actually, since I now
have the marker_probe_cb sitting between the marker site and the
callbacks, there is no API change required. Thanks :)

Mathieu

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/marker.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index b5a9fe1d50d5..1abfb923b761 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex);
 struct marker_entry {
 	struct hlist_node hlist;
 	char *format;
-	void (*call)(const struct marker *mdata,	/* Probe wrapper */
-		void *call_private, const char *fmt, ...);
+			/* Probe wrapper */
+	void (*call)(const struct marker *mdata, void *call_private, ...);
 	struct marker_probe_closure single;
 	struct marker_probe_closure *multi;
 	int refcount;	/* Number of times armed. 0 if disarmed. */
@@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
  * marker_probe_cb Callback that prepares the variable argument list for probes.
  * @mdata: pointer of type struct marker
  * @call_private: caller site private data
- * @fmt: format string
  * @...:  Variable argument list.
  *
  * Since we do not use "typical" pointer based RCU in the 1 argument case, we
  * need to put a full smp_rmb() in this branch. This is why we do not use
  * rcu_dereference() for the pointer read.
  */
-void marker_probe_cb(const struct marker *mdata, void *call_private,
-	const char *fmt, ...)
+void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 {
 	va_list args;
 	char ptype;
@@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
 		/* Must read the ptr before private data. They are not data
 		 * dependant, so we put an explicit smp_rmb() here. */
 		smp_rmb();
-		va_start(args, fmt);
-		func(mdata->single.probe_private, call_private, fmt, &args);
+		va_start(args, call_private);
+		func(mdata->single.probe_private, call_private, mdata->format,
+			&args);
 		va_end(args);
 	} else {
 		struct marker_probe_closure *multi;
@@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
 		smp_read_barrier_depends();
 		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++) {
-			va_start(args, fmt);
-			multi[i].func(multi[i].probe_private, call_private, fmt,
-				&args);
+			va_start(args, call_private);
+			multi[i].func(multi[i].probe_private, call_private,
+				mdata->format, &args);
 			va_end(args);
 		}
 	}
@@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
  * marker_probe_cb Callback that does not prepare the variable argument list.
  * @mdata: pointer of type struct marker
  * @call_private: caller site private data
- * @fmt: format string
  * @...:  Variable argument list.
  *
  * Should be connected to markers "MARK_NOARGS".
  */
-void marker_probe_cb_noarg(const struct marker *mdata,
-	void *call_private, const char *fmt, ...)
+void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 {
 	va_list args;	/* not initialized */
 	char ptype;
@@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
 		/* Must read the ptr before private data. They are not data
 		 * dependant, so we put an explicit smp_rmb() here. */
 		smp_rmb();
-		func(mdata->single.probe_private, call_private, fmt, &args);
+		func(mdata->single.probe_private, call_private, mdata->format,
+			&args);
 	} else {
 		struct marker_probe_closure *multi;
 		int i;
@@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
 		smp_read_barrier_depends();
 		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++)
-			multi[i].func(multi[i].probe_private, call_private, fmt,
-				&args);
+			multi[i].func(multi[i].probe_private, call_private,
+				mdata->format, &args);
 	}
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 5b82a1b08a00b2adca3d9dd9777efff40b7aaaa1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Mon, 12 May 2008 21:21:10 +0200
Subject: Port ftrace to markers

Porting ftrace to the marker infrastructure.

Don't need to chain to the wakeup tracer from the sched tracer, because markers
support multiple probes connected.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c                    |  14 +++-
 kernel/trace/trace.h              |  20 +----
 kernel/trace/trace_sched_switch.c | 171 +++++++++++++++++++++++++++++++-------
 kernel/trace/trace_sched_wakeup.c | 106 +++++++++++++++++++++--
 4 files changed, 255 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ad95cca4e42e..e2e985eeee78 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2500,7 +2500,9 @@ out_activate:
 	success = 1;
 
 out_running:
-	ftrace_wake_up_task(rq, p, rq->curr);
+	trace_mark(kernel_sched_wakeup,
+		"pid %d state %ld ## rq %p task %p rq->curr %p",
+		p->pid, p->state, rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 
 	p->state = TASK_RUNNING;
@@ -2631,7 +2633,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
-	ftrace_wake_up_task(rq, p, rq->curr);
+	trace_mark(kernel_sched_wakeup_new,
+		"pid %d state %ld ## rq %p task %p rq->curr %p",
+		p->pid, p->state, rq, p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2804,7 +2808,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	ftrace_ctx_switch(rq, prev, next);
+	trace_mark(kernel_sched_schedule,
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		prev->pid, next->pid, prev->state,
+		rq, prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8845033ab49d..f5de0601b408 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -234,25 +234,10 @@ void update_max_tr_single(struct trace_array *tr,
 
 extern cycle_t ftrace_now(int cpu);
 
-#ifdef CONFIG_SCHED_TRACER
-extern void
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next);
-extern void
-wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr);
-#else
-static inline void
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
-{
-}
-static inline void
-wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
-{
-}
-#endif
-
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 typedef void
 (*tracer_switch_func_t)(void *private,
+			void *__rq,
 			struct task_struct *prev,
 			struct task_struct *next);
 
@@ -262,9 +247,6 @@ struct tracer_switch_ops {
 	struct tracer_switch_ops	*next;
 };
 
-extern int register_tracer_switch(struct tracer_switch_ops *ops);
-extern int unregister_tracer_switch(struct tracer_switch_ops *ops);
-
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a3376478fc2c..d25ffa5eaf2b 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,11 +16,14 @@
 
 static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
+static atomic_t			sched_ref;
 
 static void
-ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
+sched_switch_func(void *private, void *__rq, struct task_struct *prev,
+			struct task_struct *next)
 {
-	struct trace_array *tr = ctx_trace;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
@@ -41,10 +44,40 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next)
 	local_irq_restore(flags);
 }
 
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	if (!atomic_read(&sched_ref))
+		return;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	sched_switch_func(probe_data, __rq, prev, next);
+}
+
 static void
-wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
+wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
+			task_struct *curr)
 {
-	struct trace_array *tr = ctx_trace;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
@@ -67,35 +100,29 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr)
 	local_irq_restore(flags);
 }
 
-void
-ftrace_ctx_switch(void *__rq, struct task_struct *prev,
-		  struct task_struct *next)
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
 {
-	if (unlikely(atomic_read(&trace_record_cmdline_enabled)))
-		tracing_record_cmdline(prev);
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
 
-	/*
-	 * If tracer_switch_func only points to the local
-	 * switch func, it still needs the ptr passed to it.
-	 */
-	ctx_switch_func(__rq, prev, next);
+	if (likely(!tracer_enabled))
+		return;
 
-	/*
-	 * Chain to the wakeup tracer (this is a NOP if disabled):
-	 */
-	wakeup_sched_switch(prev, next);
-}
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
 
-void
-ftrace_wake_up_task(void *__rq, struct task_struct *wakee,
-		    struct task_struct *curr)
-{
-	wakeup_func(__rq, wakee, curr);
+	tracing_record_cmdline(task);
+	tracing_record_cmdline(curr);
 
-	/*
-	 * Chain to the wakeup tracer (this is a NOP if disabled):
-	 */
-	wakeup_sched_wakeup(wakee, curr);
+	wakeup_func(probe_data, __rq, task, curr);
 }
 
 void
@@ -132,15 +159,95 @@ static void sched_switch_reset(struct trace_array *tr)
 		tracing_reset(tr->data[cpu]);
 }
 
+static int tracing_sched_register(void)
+{
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&ctx_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return ret;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&ctx_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&ctx_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
+	return ret;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&ctx_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&ctx_trace);
+	return ret;
+}
+
+static void tracing_sched_unregister(void)
+{
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&ctx_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&ctx_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&ctx_trace);
+}
+
+void tracing_start_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_inc_return(&sched_ref);
+	if (ref == 1)
+		tracing_sched_register();
+}
+
+void tracing_stop_sched_switch(void)
+{
+	long ref;
+
+	ref = atomic_dec_and_test(&sched_ref);
+	if (ref)
+		tracing_sched_unregister();
+}
+
 static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
 	atomic_inc(&trace_record_cmdline_enabled);
 	tracer_enabled = 1;
+	tracing_start_sched_switch();
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
+	tracing_stop_sched_switch();
 	atomic_dec(&trace_record_cmdline_enabled);
 	tracer_enabled = 0;
 }
@@ -181,6 +288,14 @@ static struct tracer sched_switch_trace __read_mostly =
 
 __init static int init_sched_switch_trace(void)
 {
+	int ret = 0;
+
+	if (atomic_read(&sched_ref))
+		ret = tracing_sched_register();
+	if (ret) {
+		pr_info("error registering scheduler trace\n");
+		return ret;
+	}
 	return register_tracer(&sched_switch_trace);
 }
 device_initcall(init_sched_switch_trace);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5948011006bc..5d2fb48e47f8 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,6 +15,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
+#include <linux/marker.h>
 
 #include "trace.h"
 
@@ -44,11 +45,13 @@ static int report_latency(cycle_t delta)
 	return 1;
 }
 
-void
-wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
+static void notrace
+wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
+	struct task_struct *next)
 {
 	unsigned long latency = 0, t0 = 0, t1 = 0;
-	struct trace_array *tr = wakeup_trace;
+	struct trace_array **ptr = private;
+	struct trace_array *tr = *ptr;
 	struct trace_array_cpu *data;
 	cycle_t T0, T1, delta;
 	unsigned long flags;
@@ -113,6 +116,31 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+		      const char *format, va_list *args)
+{
+	struct task_struct *prev;
+	struct task_struct *next;
+	struct rq *__rq;
+
+	/* skip prev_pid %d next_pid %d prev_state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	__rq = va_arg(*args, typeof(__rq));
+	prev = va_arg(*args, typeof(prev));
+	next = va_arg(*args, typeof(next));
+
+	tracing_record_cmdline(prev);
+
+	/*
+	 * If tracer_switch_func only points to the local
+	 * switch func, it still needs the ptr passed to it.
+	 */
+	wakeup_sched_switch(probe_data, __rq, prev, next);
+}
+
 static void __wakeup_reset(struct trace_array *tr)
 {
 	struct trace_array_cpu *data;
@@ -188,19 +216,68 @@ out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
 
-void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr)
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+		 const char *format, va_list *args)
 {
+	struct trace_array **ptr = probe_data;
+	struct trace_array *tr = *ptr;
+	struct task_struct *curr;
+	struct task_struct *task;
+	struct rq *__rq;
+
 	if (likely(!tracer_enabled))
 		return;
 
+	/* Skip pid %d state %ld */
+	(void)va_arg(*args, int);
+	(void)va_arg(*args, long);
+	/* now get the meat: "rq %p task %p rq->curr %p" */
+	__rq = va_arg(*args, typeof(__rq));
+	task = va_arg(*args, typeof(task));
+	curr = va_arg(*args, typeof(curr));
+
+	tracing_record_cmdline(task);
 	tracing_record_cmdline(curr);
-	tracing_record_cmdline(wakee);
 
-	wakeup_check_start(wakeup_trace, wakee, curr);
+	wakeup_check_start(tr, task, curr);
 }
 
 static void start_wakeup_tracer(struct trace_array *tr)
 {
+	int ret;
+
+	ret = marker_probe_register("kernel_sched_wakeup",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup\n");
+		return;
+	}
+
+	ret = marker_probe_register("kernel_sched_wakeup_new",
+			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			wake_up_callback,
+			&wakeup_trace);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't add marker"
+			" probe to kernel_sched_wakeup_new\n");
+		goto fail_deprobe;
+	}
+
+	ret = marker_probe_register("kernel_sched_schedule",
+		"prev_pid %d next_pid %d prev_state %ld "
+		"## rq %p prev %p next %p",
+		sched_switch_callback,
+		&wakeup_trace);
+	if (ret) {
+		pr_info("sched trace: Couldn't add marker"
+			" probe to kernel_sched_schedule\n");
+		goto fail_deprobe_wake_new;
+	}
+
 	wakeup_reset(tr);
 
 	/*
@@ -215,11 +292,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
 	tracer_enabled = 1;
 
 	return;
+fail_deprobe_wake_new:
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+fail_deprobe:
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
 }
 
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
+	marker_probe_unregister("kernel_sched_schedule",
+				sched_switch_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup_new",
+				wake_up_callback,
+				&wakeup_trace);
+	marker_probe_unregister("kernel_sched_wakeup",
+				wake_up_callback,
+				&wakeup_trace);
 }
 
 static void wakeup_tracer_init(struct trace_array *tr)
-- 
cgit v1.2.3


From 74f4e369fc5b52433ad824cef32d3bf1304549be Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:15 +0200
Subject: ftrace: stacktrace fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/semaphore.c   | 2 ++
 kernel/trace/trace.c | 4 ++--
 kernel/trace/trace.h | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..1a064adab658 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -31,6 +31,7 @@
 #include <linux/sched.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
+#include <linux/ftrace.h>
 
 static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
@@ -53,6 +54,7 @@ void down(struct semaphore *sem)
 {
 	unsigned long flags;
 
+	ftrace_special(sem->count, 0, __LINE__);
 	spin_lock_irqsave(&sem->lock, flags);
 	if (likely(sem->count > 0))
 		sem->count--;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2824cf48cdca..3271916ff033 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -901,7 +901,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.next_pid	= next->pid;
 	entry->ctx.next_prio	= next->prio;
 	entry->ctx.next_state	= next->state;
-	__trace_stack(tr, data, flags, 4);
+	__trace_stack(tr, data, flags, 5);
 	__raw_spin_unlock(&data->lock);
 	raw_local_irq_restore(irq_flags);
 }
@@ -927,7 +927,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.next_pid	= wakee->pid;
 	entry->ctx.next_prio	= wakee->prio;
 	entry->ctx.next_state	= wakee->state;
-	__trace_stack(tr, data, flags, 5);
+	__trace_stack(tr, data, flags, 6);
 	__raw_spin_unlock(&data->lock);
 	raw_local_irq_restore(irq_flags);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f5de0601b408..c460e85e94ed 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -51,7 +51,7 @@ struct special_entry {
  * Stack-trace entry:
  */
 
-#define FTRACE_STACK_ENTRIES	5
+#define FTRACE_STACK_ENTRIES	8
 
 struct stack_entry {
 	unsigned long		caller[FTRACE_STACK_ENTRIES];
-- 
cgit v1.2.3


From aa5e5ceaf52a882a29d9b86531a20733f5116066 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 13 May 2008 22:06:56 -0700
Subject: ftrace: remove packed attribute on ftrace_page.

It causes unaligned access traps on platforms like sparc
(ftrace_page may be marked packed, but once we return
a dyn_ftrace sub-object from this array to another piece
of code, the "packed" part of the typing information doesn't
propagate).

But also, it didn't serve any purpose either.  Even if packed,
on 64-bit or 32-bit, it didn't give us any more dyn_ftrace
entries per-page.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index af5ad8949abb..07b2a14943f8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -177,9 +177,9 @@ static DEFINE_MUTEX(ftrace_filter_lock);
 
 struct ftrace_page {
 	struct ftrace_page	*next;
-	int			index;
+	unsigned long		index;
 	struct dyn_ftrace	records[];
-} __attribute__((packed));
+};
 
 #define ENTRIES_PER_PAGE \
   ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
-- 
cgit v1.2.3


From 37135677e653537ffc6e7def679443272a1c03c3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 14 May 2008 08:10:31 +0200
Subject: ftrace: fix mcount export bug

David S. Miller noticed the following bug: the -pg instrumentation
function callback is named differently on each platform. On x86 it
is mcount, on sparc it is _mcount. So the export does not make sense
in kernel/trace/ftrace.c - move it to x86.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 07b2a14943f8..a3e47f43f8a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -50,9 +50,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 
-/* mcount is defined per arch in assembly */
-EXPORT_SYMBOL(mcount);
-
 void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_ops *op = ftrace_list;
-- 
cgit v1.2.3


From 2d8b820b2e81954754277723379ae9ed5de316fa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 23 Feb 2008 16:55:50 +0100
Subject: ftrace: cleanups

factor out code and clean it up.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a3e47f43f8a0..89bd9a6f52ec 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -18,13 +18,13 @@
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
-#include <linux/kthread.h>
 #include <linux/hardirq.h>
-#include <linux/ftrace.h>
+#include <linux/kthread.h>
 #include <linux/uaccess.h>
+#include <linux/ftrace.h>
 #include <linux/sysctl.h>
-#include <linux/hash.h>
 #include <linux/ctype.h>
+#include <linux/hash.h>
 #include <linux/list.h>
 
 #include "trace.h"
-- 
cgit v1.2.3


From 4e491d14f2506b218d678935c25a7027b79178b1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 14 May 2008 23:49:44 -0400
Subject: ftrace: support for PowerPC

This patch adds full support for ftrace for PowerPC (both 64 and 32 bit).
This includes dynamic tracing and function filtering.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_selftest.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a5f6001c3332..3877dd9102f1 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -123,6 +123,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	int ret;
 	int save_ftrace_enabled = ftrace_enabled;
 	int save_tracer_enabled = tracer_enabled;
+	char *func_name;
 
 	/* The ftrace test PASSED */
 	printk(KERN_CONT "PASSED\n");
@@ -142,9 +143,15 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 		return ret;
 	}
 
+	/*
+	 * Some archs *cough*PowerPC*cough* add charachters to the
+	 * start of the function names. We simply put a '*' to
+	 * accomodate them.
+	 */
+	func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+
 	/* filter only on our function */
-	ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME),
-			  sizeof(STR(DYN_FTRACE_TEST_NAME)), 1);
+	ftrace_set_filter(func_name, strlen(func_name), 1);
 
 	/* enable tracing */
 	tr->ctrl = 1;
-- 
cgit v1.2.3


From 6ec562328fda585be2d7f472cfac99d3b44d362a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 14 May 2008 21:30:30 -0400
Subject: ftrace: use the new kbuild CFLAGS_REMOVE for kernel directory

This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro
in the kernel directory.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index d2f80ea4cd9a..ca2433e84873 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,12 +11,16 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
+CFLAGS_REMOVE_sched.o = -pg -mno-spe
+
 ifdef CONFIG_FTRACE
-# Do not profile debug utilities
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(if $(filter-out lockdep% %debug,$(basename $(notdir $@))), \
-	$(ORIG_CFLAGS), \
-	$(subst -pg,,$(ORIG_CFLAGS)))
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
 endif
 
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
-- 
cgit v1.2.3


From 677aa9f77e8de3791b481a0cec6c8b84d1eec626 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sat, 17 May 2008 00:01:36 -0400
Subject: ftrace: add have dynamic ftrace config for archs

Now that ftrace is being ported to other architectures, it has become
apparent that DYNAMIC_FTRACE is dependent on whether or not that
architecture implements dynamic ftrace. FTRACE itself may be ported to
an architecture without porting dynamic ftrace.

This patch adds HAVE_DYNAMIC_FTRACE to allow architectures to port ftrace
without having to also port the dynamic aspect as well.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f3005717bcd0..5c2295b29f2c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -4,6 +4,9 @@
 config HAVE_FTRACE
 	bool
 
+config HAVE_DYNAMIC_FTRACE
+	bool
+
 config TRACER_MAX_TRACE
 	bool
 
@@ -94,6 +97,7 @@ config CONTEXT_SWITCH_TRACER
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FTRACE
+	depends on HAVE_DYNAMIC_FTRACE
 	default y
 	help
          This option will modify all the calls to ftrace dynamically
-- 
cgit v1.2.3


From f06c38103ea9dbca27c3f4d77f444ddefb5477cd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:47 +0200
Subject: ftrace: add sysprof plugin

very first baby version.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig         |  8 +++++
 kernel/trace/Makefile        |  1 +
 kernel/trace/trace_sysprof.c | 80 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+)
 create mode 100644 kernel/trace/trace_sysprof.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5c2295b29f2c..e101c9a85f0f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -75,6 +75,14 @@ config PREEMPT_TRACER
 	  enabled. This option and the irqs-off timing option can be
 	  used together or separately.)
 
+config SYSPROF_TRACER
+	bool "Sysprof Tracer"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer provides the trace needed by the 'Sysprof' userspace
+	  tool.
+
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
 	depends on HAVE_FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d9efbbfa2bdf..7aec123ec1d8 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FTRACE) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
new file mode 100644
index 000000000000..6c139bc1be7e
--- /dev/null
+++ b/kernel/trace/trace_sysprof.c
@@ -0,0 +1,80 @@
+/*
+ * trace stack traces
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+static struct trace_array	*ctx_trace;
+static int __read_mostly	tracer_enabled;
+
+static notrace void stack_reset(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
+
+static notrace void start_stack_trace(struct trace_array *tr)
+{
+	stack_reset(tr);
+	tracer_enabled = 1;
+}
+
+static notrace void stop_stack_trace(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+}
+
+static notrace void stack_trace_init(struct trace_array *tr)
+{
+	ctx_trace = tr;
+
+	if (tr->ctrl)
+		start_stack_trace(tr);
+}
+
+static notrace void stack_trace_reset(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		stop_stack_trace(tr);
+}
+
+static void stack_trace_ctrl_update(struct trace_array *tr)
+{
+	/* When starting a new trace, reset the buffers */
+	if (tr->ctrl)
+		start_stack_trace(tr);
+	else
+		stop_stack_trace(tr);
+}
+
+static struct tracer stack_trace __read_mostly =
+{
+	.name		= "sysprof",
+	.init		= stack_trace_init,
+	.reset		= stack_trace_reset,
+	.ctrl_update	= stack_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_stack,
+#endif
+};
+
+__init static int init_stack_trace(void)
+{
+	return register_tracer(&stack_trace);
+}
+device_initcall(init_stack_trace);
-- 
cgit v1.2.3


From 0075fa80305f3231a2d5df97b00d7f55a48ea27e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:47 +0200
Subject: ftrace: extend sysprof plugin

add per CPU hrtimers.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 67 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 6c139bc1be7e..ba55b871b3d9 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -5,19 +5,76 @@
  * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
  *
  */
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/debugfs.h>
 #include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
 #include <linux/uaccess.h>
-#include <linux/marker.h>
 #include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/fs.h>
 
 #include "trace.h"
 
 static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
 
+static const unsigned long sample_period = 1000000;
+
+/*
+ * Per CPU hrtimers that do the profiling:
+ */
+static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
+
+static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
+{
+	/* trace here */
+	panic_timeout++;
+
+	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+
+	return HRTIMER_RESTART;
+}
+
+static void start_stack_timer(int cpu)
+{
+	struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = stack_trace_timer_fn;
+	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
+	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+}
+
+static void start_stack_timers(void)
+{
+	cpumask_t saved_mask = current->cpus_allowed;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+		start_stack_timer(cpu);
+		printk("started timer on cpu%d\n", cpu);
+	}
+	set_cpus_allowed_ptr(current, &saved_mask);
+}
+
+static void stop_stack_timer(int cpu)
+{
+	struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
+
+	hrtimer_cancel(hrtimer);
+	printk("cancelled timer on cpu%d\n", cpu);
+}
+
+static void stop_stack_timers(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		stop_stack_timer(cpu);
+}
+
 static notrace void stack_reset(struct trace_array *tr)
 {
 	int cpu;
@@ -31,11 +88,13 @@ static notrace void stack_reset(struct trace_array *tr)
 static notrace void start_stack_trace(struct trace_array *tr)
 {
 	stack_reset(tr);
+	start_stack_timers();
 	tracer_enabled = 1;
 }
 
 static notrace void stop_stack_trace(struct trace_array *tr)
 {
+	stop_stack_timers();
 	tracer_enabled = 0;
 }
 
-- 
cgit v1.2.3


From 56a08bdcff20f0022bd9160c1093e56f763499aa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:47 +0200
Subject: ftrace: extend sysprof plugin some more

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 80 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 76 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index ba55b871b3d9..b1137c11ef8b 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
  * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
- *
+ * Copyright (C) 2004, 2005, Soeren Sandmann
  */
 #include <linux/kallsyms.h>
 #include <linux/debugfs.h>
@@ -11,13 +11,17 @@
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
+#include <linux/irq.h>
 #include <linux/fs.h>
 
 #include "trace.h"
 
-static struct trace_array	*ctx_trace;
+static struct trace_array	*sysprof_trace;
 static int __read_mostly	tracer_enabled;
 
+/*
+ * 10 msecs for now:
+ */
 static const unsigned long sample_period = 1000000;
 
 /*
@@ -25,10 +29,78 @@ static const unsigned long sample_period = 1000000;
  */
 static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
 
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	if (__copy_from_user_inatomic(frame, frame_pointer, sizeof(*frame)))
+		return 0;
+
+	return 1;
+}
+
+#define SYSPROF_MAX_ADDRESSES	512
+
+static void timer_notify(struct pt_regs *regs, int cpu)
+{
+	const void __user *frame_pointer;
+	struct trace_array_cpu *data;
+	struct stack_frame frame;
+	struct trace_array *tr;
+	int is_user;
+	int i;
+
+	if (!regs)
+		return;
+
+	tr = sysprof_trace;
+	data = tr->data[cpu];
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	if (!is_user) {
+		/* kernel */
+		ftrace(tr, data, current->pid, 1, 0);
+		return;
+
+	}
+
+	trace_special(tr, data, 0, current->pid, regs->ip);
+
+	frame_pointer = (void __user *)regs->bp;
+
+	for (i = 0; i < SYSPROF_MAX_ADDRESSES; i++) {
+		if (!copy_stack_frame(frame_pointer, &frame))
+			break;
+		if ((unsigned long)frame_pointer < regs->sp)
+			break;
+
+		trace_special(tr, data, 1, frame.return_address,
+			      (unsigned long)frame_pointer);
+		frame_pointer = frame.next_fp;
+	}
+
+	trace_special(tr, data, 2, current->pid, i);
+
+	if (i == SYSPROF_MAX_ADDRESSES)
+		trace_special(tr, data, -1, -1, -1);
+}
+
 static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
 {
 	/* trace here */
-	panic_timeout++;
+	timer_notify(get_irq_regs(), smp_processor_id());
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
 
@@ -100,7 +172,7 @@ static notrace void stop_stack_trace(struct trace_array *tr)
 
 static notrace void stack_trace_init(struct trace_array *tr)
 {
-	ctx_trace = tr;
+	sysprof_trace = tr;
 
 	if (tr->ctrl)
 		start_stack_trace(tr);
-- 
cgit v1.2.3


From a6dd24f8d00cbccb560b19a723e6fb9bdfb20799 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:47 +0200
Subject: ftrace: sysprof-plugin, add self-tests

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.h          |  4 ++++
 kernel/trace/trace_selftest.c | 28 ++++++++++++++++++++++++++++
 kernel/trace/trace_sysprof.c  |  6 +++---
 3 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c460e85e94ed..b2198bc830ae 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -280,6 +280,10 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace,
 extern int trace_selftest_startup_sched_switch(struct tracer *trace,
 					       struct trace_array *tr);
 #endif
+#ifdef CONFIG_SYSPROF_TRACER
+extern int trace_selftest_startup_sysprof(struct tracer *trace,
+					       struct trace_array *tr);
+#endif
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 3877dd9102f1..033a6fb2e5ff 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -537,3 +537,31 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 	return ret;
 }
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_SYSPROF_TRACER
+int
+trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+
+	/* start the tracing */
+	tr->ctrl = 1;
+	trace->init(tr);
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+	/* stop the tracing. */
+	tr->ctrl = 0;
+	trace->ctrl_update(tr);
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_SYSPROF_TRACER */
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index b1137c11ef8b..b78f12f77fca 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -126,7 +126,7 @@ static void start_stack_timers(void)
 	for_each_online_cpu(cpu) {
 		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 		start_stack_timer(cpu);
-		printk("started timer on cpu%d\n", cpu);
+		printk(KERN_INFO "started sysprof timer on cpu%d\n", cpu);
 	}
 	set_cpus_allowed_ptr(current, &saved_mask);
 }
@@ -136,7 +136,7 @@ static void stop_stack_timer(int cpu)
 	struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
 
 	hrtimer_cancel(hrtimer);
-	printk("cancelled timer on cpu%d\n", cpu);
+	printk(KERN_INFO "cancelled sysprof timer on cpu%d\n", cpu);
 }
 
 static void stop_stack_timers(void)
@@ -200,7 +200,7 @@ static struct tracer stack_trace __read_mostly =
 	.reset		= stack_trace_reset,
 	.ctrl_update	= stack_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
-	.selftest    = trace_selftest_startup_stack,
+	.selftest    = trace_selftest_startup_sysprof,
 #endif
 };
 
-- 
cgit v1.2.3


From 842af315e8b0adad58fc642eaa5e6f53525e0534 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:47 +0200
Subject: ftrace: sysprof plugin improvement

add sample maximum depth.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index b78f12f77fca..7f6fcccffb88 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -23,6 +23,7 @@ static int __read_mostly	tracer_enabled;
  * 10 msecs for now:
  */
 static const unsigned long sample_period = 1000000;
+static const unsigned int sample_max_depth = 512;
 
 /*
  * Per CPU hrtimers that do the profiling:
@@ -45,8 +46,6 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 	return 1;
 }
 
-#define SYSPROF_MAX_ADDRESSES	512
-
 static void timer_notify(struct pt_regs *regs, int cpu)
 {
 	const void __user *frame_pointer;
@@ -80,7 +79,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 
 	frame_pointer = (void __user *)regs->bp;
 
-	for (i = 0; i < SYSPROF_MAX_ADDRESSES; i++) {
+	for (i = 0; i < sample_max_depth; i++) {
 		if (!copy_stack_frame(frame_pointer, &frame))
 			break;
 		if ((unsigned long)frame_pointer < regs->sp)
@@ -93,7 +92,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 
 	trace_special(tr, data, 2, current->pid, i);
 
-	if (i == SYSPROF_MAX_ADDRESSES)
+	if (i == sample_max_depth)
 		trace_special(tr, data, -1, -1, -1);
 }
 
@@ -126,7 +125,6 @@ static void start_stack_timers(void)
 	for_each_online_cpu(cpu) {
 		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 		start_stack_timer(cpu);
-		printk(KERN_INFO "started sysprof timer on cpu%d\n", cpu);
 	}
 	set_cpus_allowed_ptr(current, &saved_mask);
 }
@@ -136,7 +134,6 @@ static void stop_stack_timer(int cpu)
 	struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
 
 	hrtimer_cancel(hrtimer);
-	printk(KERN_INFO "cancelled sysprof timer on cpu%d\n", cpu);
 }
 
 static void stop_stack_timers(void)
-- 
cgit v1.2.3


From ef4ab15ff34fd9c65e92bee70f58e7179da881c5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:48 +0200
Subject: ftrace: make sysprof dependent on x86 for now

that's the only tested platform for now. If there's interest we
can make it generic easily.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig          | 2 +-
 kernel/trace/trace_selftest.c | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e101c9a85f0f..9b49526ac0b5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,7 +77,7 @@ config PREEMPT_TRACER
 
 config SYSPROF_TRACER
 	bool "Sysprof Tracer"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && X86
 	select TRACING
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 033a6fb2e5ff..5588ecc40985 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -557,11 +557,6 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	ret = trace_test_buffer(tr, &count);
 	trace->reset(tr);
 
-	if (!ret && !count) {
-		printk(KERN_CONT ".. no entries found ..");
-		ret = -1;
-	}
-
 	return ret;
 }
 #endif /* CONFIG_SYSPROF_TRACER */
-- 
cgit v1.2.3


From 9f6b4e3f4a24f2590f1c96f117fc45fbea9b0fa4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:48 +0200
Subject: ftrace: sysprof fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 7f6fcccffb88..f9a09fe705b0 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -37,21 +37,26 @@ struct stack_frame {
 
 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 {
+	int ret;
+
 	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
 		return 0;
 
-	if (__copy_from_user_inatomic(frame, frame_pointer, sizeof(*frame)))
-		return 0;
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
 
-	return 1;
+	return ret;
 }
 
 static void timer_notify(struct pt_regs *regs, int cpu)
 {
-	const void __user *frame_pointer;
 	struct trace_array_cpu *data;
 	struct stack_frame frame;
 	struct trace_array *tr;
+	const void __user *fp;
 	int is_user;
 	int i;
 
@@ -77,21 +82,26 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 
 	trace_special(tr, data, 0, current->pid, regs->ip);
 
-	frame_pointer = (void __user *)regs->bp;
+	fp = (void __user *)regs->bp;
 
 	for (i = 0; i < sample_max_depth; i++) {
-		if (!copy_stack_frame(frame_pointer, &frame))
+		frame.next_fp = 0;
+		frame.return_address = 0;
+		if (!copy_stack_frame(fp, &frame))
 			break;
-		if ((unsigned long)frame_pointer < regs->sp)
+		if ((unsigned long)fp < regs->sp)
 			break;
 
 		trace_special(tr, data, 1, frame.return_address,
-			      (unsigned long)frame_pointer);
-		frame_pointer = frame.next_fp;
+			      (unsigned long)fp);
+		fp = frame.next_fp;
 	}
 
 	trace_special(tr, data, 2, current->pid, i);
 
+	/*
+	 * Special trace entry if we overflow the max depth:
+	 */
 	if (i == sample_max_depth)
 		trace_special(tr, data, -1, -1, -1);
 }
-- 
cgit v1.2.3


From d618b3e6e50970a6248ac857653fdd49bcd3c045 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:49 +0200
Subject: ftrace: sysprof updates

make the sample period configurable.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c         |  3 ++
 kernel/trace/trace.h         |  2 ++
 kernel/trace/trace_sysprof.c | 70 ++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 73 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3271916ff033..95b7c48a9a1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2800,6 +2800,9 @@ static __init void tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs "
 			   "'dyn_ftrace_total_info' entry\n");
 #endif
+#ifdef CONFIG_SYSPROF_TRACER
+	init_tracer_sysprof_debugfs(d_tracer);
+#endif
 }
 
 static int trace_alloc_page(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b2198bc830ae..b7f85d9c80d7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -188,6 +188,8 @@ struct trace_iterator {
 void tracing_reset(struct trace_array_cpu *data);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *tracing_init_dentry(void);
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
+
 void ftrace(struct trace_array *tr,
 			    struct trace_array_cpu *data,
 			    unsigned long ip,
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f9a09fe705b0..19406236b67b 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -20,11 +20,12 @@ static struct trace_array	*sysprof_trace;
 static int __read_mostly	tracer_enabled;
 
 /*
- * 10 msecs for now:
+ * 1 msec sample interval by default:
  */
-static const unsigned long sample_period = 1000000;
+static unsigned long sample_period = 1000000;
 static const unsigned int sample_max_depth = 512;
 
+static DEFINE_MUTEX(sample_timer_lock);
 /*
  * Per CPU hrtimers that do the profiling:
  */
@@ -166,15 +167,19 @@ static notrace void stack_reset(struct trace_array *tr)
 
 static notrace void start_stack_trace(struct trace_array *tr)
 {
+	mutex_lock(&sample_timer_lock);
 	stack_reset(tr);
 	start_stack_timers();
 	tracer_enabled = 1;
+	mutex_unlock(&sample_timer_lock);
 }
 
 static notrace void stop_stack_trace(struct trace_array *tr)
 {
+	mutex_lock(&sample_timer_lock);
 	stop_stack_timers();
 	tracer_enabled = 0;
+	mutex_unlock(&sample_timer_lock);
 }
 
 static notrace void stack_trace_init(struct trace_array *tr)
@@ -216,3 +221,64 @@ __init static int init_stack_trace(void)
 	return register_tracer(&stack_trace);
 }
 device_initcall(init_stack_trace);
+
+#define MAX_LONG_DIGITS 22
+
+static ssize_t
+sysprof_sample_read(struct file *filp, char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	char buf[MAX_LONG_DIGITS];
+	int r;
+
+	r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+sysprof_sample_write(struct file *filp, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[MAX_LONG_DIGITS];
+	unsigned long val;
+
+	if (cnt > MAX_LONG_DIGITS-1)
+		cnt = MAX_LONG_DIGITS-1;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	val = simple_strtoul(buf, NULL, 10);
+	/*
+	 * Enforce a minimum sample period of 100 usecs:
+	 */
+	if (val < 100)
+		val = 100;
+
+	mutex_lock(&sample_timer_lock);
+	stop_stack_timers();
+	sample_period = val * 1000;
+	start_stack_timers();
+	mutex_unlock(&sample_timer_lock);
+
+	return cnt;
+}
+
+static struct file_operations sysprof_sample_fops = {
+	.read		= sysprof_sample_read,
+	.write		= sysprof_sample_write,
+};
+
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
+{
+	struct dentry *entry;
+
+	entry = debugfs_create_file("sysprof_sample_period", 0644,
+			d_tracer, NULL, &sysprof_sample_fops);
+	if (entry)
+		return;
+	pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n");
+}
-- 
cgit v1.2.3


From ada6b835067dc022f11cdae1c313a3710d3d977c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 23 May 2008 23:50:41 +0200
Subject: ftrace: remove notrace

Remove the notrace annotations. The build logic takes care of that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 19406236b67b..3b1e4ba9180d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -155,7 +155,7 @@ static void stop_stack_timers(void)
 		stop_stack_timer(cpu);
 }
 
-static notrace void stack_reset(struct trace_array *tr)
+static void stack_reset(struct trace_array *tr)
 {
 	int cpu;
 
@@ -165,7 +165,7 @@ static notrace void stack_reset(struct trace_array *tr)
 		tracing_reset(tr->data[cpu]);
 }
 
-static notrace void start_stack_trace(struct trace_array *tr)
+static void start_stack_trace(struct trace_array *tr)
 {
 	mutex_lock(&sample_timer_lock);
 	stack_reset(tr);
@@ -174,7 +174,7 @@ static notrace void start_stack_trace(struct trace_array *tr)
 	mutex_unlock(&sample_timer_lock);
 }
 
-static notrace void stop_stack_trace(struct trace_array *tr)
+static void stop_stack_trace(struct trace_array *tr)
 {
 	mutex_lock(&sample_timer_lock);
 	stop_stack_timers();
@@ -182,7 +182,7 @@ static notrace void stop_stack_trace(struct trace_array *tr)
 	mutex_unlock(&sample_timer_lock);
 }
 
-static notrace void stack_trace_init(struct trace_array *tr)
+static void stack_trace_init(struct trace_array *tr)
 {
 	sysprof_trace = tr;
 
@@ -190,7 +190,7 @@ static notrace void stack_trace_init(struct trace_array *tr)
 		start_stack_trace(tr);
 }
 
-static notrace void stack_trace_reset(struct trace_array *tr)
+static void stack_trace_reset(struct trace_array *tr)
 {
 	if (tr->ctrl)
 		stop_stack_trace(tr);
-- 
cgit v1.2.3


From 9caee613d3b860ae81b79370eeae9ac967c07536 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 23 May 2008 23:55:54 +0200
Subject: ftrace: fix __trace_special()

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 3b1e4ba9180d..76dd953eeccd 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -81,7 +81,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 
 	}
 
-	trace_special(tr, data, 0, current->pid, regs->ip);
+	__trace_special(tr, data, 0, current->pid, regs->ip);
 
 	fp = (void __user *)regs->bp;
 
@@ -93,18 +93,18 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 		if ((unsigned long)fp < regs->sp)
 			break;
 
-		trace_special(tr, data, 1, frame.return_address,
+		__trace_special(tr, data, 1, frame.return_address,
 			      (unsigned long)fp);
 		fp = frame.next_fp;
 	}
 
-	trace_special(tr, data, 2, current->pid, i);
+	__trace_special(tr, data, 2, current->pid, i);
 
 	/*
 	 * Special trace entry if we overflow the max depth:
 	 */
 	if (i == sample_max_depth)
-		trace_special(tr, data, -1, -1, -1);
+		__trace_special(tr, data, -1, -1, -1);
 }
 
 static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
-- 
cgit v1.2.3


From 5fc4511c756860149b81aead6eca5bdf5c438ea7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 23 May 2008 23:58:21 +0200
Subject: ftrace: make it more available in the Kconfig

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9b49526ac0b5..e101c9a85f0f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,7 +77,7 @@ config PREEMPT_TRACER
 
 config SYSPROF_TRACER
 	bool "Sysprof Tracer"
-	depends on DEBUG_KERNEL && X86
+	depends on DEBUG_KERNEL
 	select TRACING
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
-- 
cgit v1.2.3


From cd2134b1dda92fd450e6a1e12b1c7960dd6a2178 Mon Sep 17 00:00:00 2001
From: Soeren Sandmann Pedersen <sandmann@redhat.com>
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: sysprof: kernel trace

add kernel backtracing to the sysprof tracer.

change the format of the data, so that type=0 means
beginning of stack trace, 1 means kernel address, 2 means user
address, and 3 means end of trace.

EIP addresses are no longer distinguished from return addresses,
mostly because sysprof userspace doesn't make use of it. It may be
worthwhile adding this back in though, just in case it becomes
interesting.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 89 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 80 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 76dd953eeccd..ebcb66d054cc 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -14,6 +14,8 @@
 #include <linux/irq.h>
 #include <linux/fs.h>
 
+#include <asm/stacktrace.h>
+
 #include "trace.h"
 
 static struct trace_array	*sysprof_trace;
@@ -52,6 +54,77 @@ static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 	return ret;
 }
 
+struct backtrace_info {
+	struct trace_array_cpu	*data;
+	struct trace_array	*tr;
+	int			pos;
+};
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	/* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+	/* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+	/* Don't bother with IRQ stacks for now */
+	return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+	struct backtrace_info *info = data;
+
+	if (info->pos < sample_max_depth && reliable) {
+		__trace_special(info->tr, info->data, 1, addr, 0);
+
+		info->pos++;
+	}
+}
+
+const static struct stacktrace_ops backtrace_ops = {
+	.warning		= backtrace_warning,
+	.warning_symbol		= backtrace_warning_symbol,
+	.stack			= backtrace_stack,
+	.address		= backtrace_address,
+};
+
+static struct pt_regs *
+trace_kernel(struct pt_regs *regs, struct trace_array *tr,
+	     struct trace_array_cpu *data)
+{
+	struct backtrace_info info;
+	unsigned long bp;
+	char *user_stack;
+	char *stack;
+
+	info.tr = tr;
+	info.data = data;
+	info.pos = 1;
+
+	__trace_special(info.tr, info.data, 1, regs->ip, 0);
+
+	stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+	bp = regs->bp;
+#else
+	bp = 0;
+#endif
+
+	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
+
+	/* Now trace the user stack */
+	user_stack = ((char *)current->thread.sp0 - sizeof(struct pt_regs));
+
+	return (struct pt_regs *)user_stack;
+}
+
 static void timer_notify(struct pt_regs *regs, int cpu)
 {
 	struct trace_array_cpu *data;
@@ -74,17 +147,15 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 	if (is_user && current->state != TASK_RUNNING)
 		return;
 
-	if (!is_user) {
-		/* kernel */
-		ftrace(tr, data, current->pid, 1, 0);
-		return;
+	__trace_special(tr, data, 0, 0, current->pid);
 
-	}
-
-	__trace_special(tr, data, 0, current->pid, regs->ip);
+	if (!is_user)
+		regs = trace_kernel(regs, tr, data);
 
 	fp = (void __user *)regs->bp;
 
+	__trace_special(tr, data, 2, regs->ip, 0);
+
 	for (i = 0; i < sample_max_depth; i++) {
 		frame.next_fp = 0;
 		frame.return_address = 0;
@@ -93,12 +164,12 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 		if ((unsigned long)fp < regs->sp)
 			break;
 
-		__trace_special(tr, data, 1, frame.return_address,
+		__trace_special(tr, data, 2, frame.return_address,
 			      (unsigned long)fp);
 		fp = frame.next_fp;
 	}
 
-	__trace_special(tr, data, 2, current->pid, i);
+	__trace_special(tr, data, 3, current->pid, i);
 
 	/*
 	 * Special trace entry if we overflow the max depth:
-- 
cgit v1.2.3


From 8a9e94c1fbfdac45a3b6811b880777c4116aa309 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:54 +0200
Subject: sysprof: update copyrights

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index ebcb66d054cc..fe23d6dba7f1 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -1,9 +1,9 @@
 /*
  * trace stack traces
  *
+ * Copyright (C) 2004-2008, Soeren Sandmann
  * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
  * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
- * Copyright (C) 2004, 2005, Soeren Sandmann
  */
 #include <linux/kallsyms.h>
 #include <linux/debugfs.h>
-- 
cgit v1.2.3


From cf3271a73b612a03da00681ecd9bfefab37c74c9 Mon Sep 17 00:00:00 2001
From: Soeren Sandmann <sandmann@daimi.au.dk>
Date: Mon, 12 May 2008 05:28:50 +0200
Subject: ftrace/sysprof: don't trace the user stack if we are a kernel thread.

Check that current->mm is non-NULL before attempting to trace the user
stack.

Also take depth of the kernel stack into account when comparing
against sample_max_depth.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sysprof.c | 50 +++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index fe23d6dba7f1..2301e1e7c606 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -95,13 +95,12 @@ const static struct stacktrace_ops backtrace_ops = {
 	.address		= backtrace_address,
 };
 
-static struct pt_regs *
+static int
 trace_kernel(struct pt_regs *regs, struct trace_array *tr,
 	     struct trace_array_cpu *data)
 {
 	struct backtrace_info info;
 	unsigned long bp;
-	char *user_stack;
 	char *stack;
 
 	info.tr = tr;
@@ -119,10 +118,7 @@ trace_kernel(struct pt_regs *regs, struct trace_array *tr,
 
 	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
 
-	/* Now trace the user stack */
-	user_stack = ((char *)current->thread.sp0 - sizeof(struct pt_regs));
-
-	return (struct pt_regs *)user_stack;
+	return info.pos;
 }
 
 static void timer_notify(struct pt_regs *regs, int cpu)
@@ -150,32 +146,44 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 	__trace_special(tr, data, 0, 0, current->pid);
 
 	if (!is_user)
-		regs = trace_kernel(regs, tr, data);
+		i = trace_kernel(regs, tr, data);
+	else
+		i = 0;
 
-	fp = (void __user *)regs->bp;
+	/*
+	 * Trace user stack if we are not a kernel thread
+	 */
+	if (current->mm && i < sample_max_depth) {
+		regs = (struct pt_regs *)current->thread.sp0 - 1;
 
-	__trace_special(tr, data, 2, regs->ip, 0);
+		fp = (void __user *)regs->bp;
 
-	for (i = 0; i < sample_max_depth; i++) {
-		frame.next_fp = 0;
-		frame.return_address = 0;
-		if (!copy_stack_frame(fp, &frame))
-			break;
-		if ((unsigned long)fp < regs->sp)
-			break;
+		__trace_special(tr, data, 2, regs->ip, 0);
 
-		__trace_special(tr, data, 2, frame.return_address,
-			      (unsigned long)fp);
-		fp = frame.next_fp;
-	}
+		while (i < sample_max_depth) {
+			frame.next_fp = 0;
+			frame.return_address = 0;
+			if (!copy_stack_frame(fp, &frame))
+				break;
+			if ((unsigned long)fp < regs->sp)
+				break;
 
-	__trace_special(tr, data, 3, current->pid, i);
+			__trace_special(tr, data, 2, frame.return_address,
+					(unsigned long)fp);
+			fp = frame.next_fp;
+
+			i++;
+		}
+
+	}
 
 	/*
 	 * Special trace entry if we overflow the max depth:
 	 */
 	if (i == sample_max_depth)
 		__trace_special(tr, data, -1, -1, -1);
+
+	__trace_special(tr, data, 3, current->pid, i);
 }
 
 static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
-- 
cgit v1.2.3


From bfeeeeb991cf75081e6c2f74d44ae5da05b50a94 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: stacktrace: don't crash on invalid stack trace structs

This patch makes the stacktrace printout code \warn when the entries
pointer is unset rather than crashing when trying to access it in an
attempt to make it a bit more robust.

I was saving a stacktrace into an skb and forgot to copy it across skb
copies... I have since fixed the code, but it would have been easier
had the kernel not crashed in an interrupt.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/stacktrace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index b71816e47a30..0914d0cbc83c 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -13,6 +13,9 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
 {
 	int i, j;
 
+	if (WARN_ON(!trace->entries))
+		return;
+
 	for (i = 0; i < trace->nr_entries; i++) {
 		unsigned long ip = trace->entries[i];
 
-- 
cgit v1.2.3


From f984b51e0779a6dd30feedc41404013ca54e5d05 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:20:57 +0200
Subject: ftrace: add mmiotrace plugin

On Sat, 22 Mar 2008 13:07:47 +0100
Ingo Molnar <mingo@elte.hu> wrote:

> > > i'd suggest the following: pull x86.git and sched-devel.git into a
> > > single tree [the two will combine without rejects]. Then try to add a
> > > kernel/tracing/trace_mmiotrace.c ftrace plugin. The trace_sysprof.c
> > > plugin might be a good example.
> >
> > I did this and now I have mmiotrace enabled/disabled via the tracing
> > framework (what do we call this, since ftrace is one of the tracers?).
>
> cool! could you send the patches for that? (even if they are not fully
> functional yet)

Patch attached in the end. Nice to see how much code disappeared. I tried
to mark all the features I had to break with XXX-comments.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Makefile          |  1 +
 kernel/trace/trace_mmiotrace.c | 84 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 kernel/trace/trace_mmiotrace.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d9efbbfa2bdf..c44a7dce9086 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -18,5 +18,6 @@ obj-$(CONFIG_FTRACE) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 000000000000..e4dd03cc5aa6
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,84 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define DEBUG 1
+
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+
+#include "trace.h"
+
+extern void
+__trace_special(void *__tr, void *__data,
+		unsigned long arg1, unsigned long arg2, unsigned long arg3);
+
+static struct trace_array *mmio_trace_array;
+
+
+static void mmio_trace_init(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	mmio_trace_array = tr;
+	if (tr->ctrl)
+		enable_mmiotrace();
+}
+
+static void mmio_trace_reset(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	if (tr->ctrl)
+		disable_mmiotrace();
+}
+
+static void mmio_trace_ctrl_update(struct trace_array *tr)
+{
+	pr_debug("in %s\n", __func__);
+	if (tr->ctrl)
+		enable_mmiotrace();
+	else
+		disable_mmiotrace();
+}
+
+static struct tracer mmio_tracer __read_mostly =
+{
+	.name		= "mmiotrace",
+	.init		= mmio_trace_init,
+	.reset		= mmio_trace_reset,
+	.ctrl_update	= mmio_trace_ctrl_update,
+};
+
+__init static int init_mmio_trace(void)
+{
+	int ret = init_mmiotrace();
+	if (ret)
+		return ret;
+	return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+
+void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg)
+{
+	struct trace_array *tr = mmio_trace_array;
+	struct trace_array_cpu *data = tr->data[smp_processor_id()];
+
+	if (!current || current->pid == 0) {
+		/*
+		 * XXX: This is a problem. We need to able to record, no
+		 * matter what. tracing_generic_entry_update() would crash.
+		 */
+		static unsigned limit;
+		if (limit++ < 12)
+			pr_err("Error in %s: no current.\n", __func__);
+		return;
+	}
+	if (!tr || !data) {
+		static unsigned limit;
+		if (limit++ < 12)
+			pr_err("%s: no tr or data\n", __func__);
+		return;
+	}
+	__trace_special(tr, data, type, addr, arg);
+}
-- 
cgit v1.2.3


From bd8ac686c73c7e925fcfe0b02dc4e7b947127864 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:20:57 +0200
Subject: ftrace: mmiotrace, updates

here is a patch that makes mmiotrace work almost well within the tracing
framework. The patch applies on top of my previous patch. I have my own
output formatting in place now.

Summary of changes:
- fix the NULL dereference that was due to not calling tracing_reset()
- add print_line() callback into struct tracer
- implement print_line() for mmiotrace, producing up-to-spec text
- add my output header, but that is not really called in the right place
- rewrote the main structs in mmiotrace
- added two new trace entry types: TRACE_MMIO_RW and TRACE_MMIO_MAP
- made some functions in trace.c non-static
- check current==NULL in tracing_generic_entry_update()
- fix(?) comparison in trace_seq_printf()

Things seem to work fine except a few issues. Markers (text lines injected
into mmiotrace log) are missing, I did not feel hacking them in before we
have variable length entries. My output header is printed only for 'trace'
file, but not 'trace_pipe'. For some reason, despite my quick fix,
iter->trace is NULL in print_trace_line() when called from 'trace_pipe'
file, which means I don't get proper output formatting.

I only tried by loading nouveau.ko, which just detects the card, and that
is traced fine. I didn't try further. Map, two reads and unmap. Works
perfectly.

I am missing the information about overflows, I'd prefer to have a
counter for lost events. I didn't try, but I guess currently there is no
way of knowning when it overflows?

So, not too far from being fully operational, it seems :-)
And looking at the diffstat, there also is some 700-900 lines of user space
code that just became obsolete.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c           |  34 ++++++++++
 kernel/trace/trace.h           |  14 ++++
 kernel/trace/trace_mmiotrace.c | 151 +++++++++++++++++++++++++++++++++--------
 3 files changed, 171 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3271916ff033..d14fe49e9638 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -831,6 +831,40 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 		trace_function(tr, data, ip, parent_ip, flags);
 }
 
+#ifdef CONFIG_MMIOTRACE
+void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
+						struct mmiotrace_rw *rw)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&data->lock, irq_flags);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_MMIO_RW;
+	entry->mmiorw		= *rw;
+	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	trace_wake_up();
+}
+
+void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
+						struct mmiotrace_map *map)
+{
+	struct trace_entry *entry;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&data->lock, irq_flags);
+	entry			= tracing_get_trace_entry(tr, data);
+	tracing_generic_entry_update(entry, 0);
+	entry->type		= TRACE_MMIO_MAP;
+	entry->mmiomap		= *map;
+	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	trace_wake_up();
+}
+#endif
+
 void __trace_stack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c460e85e94ed..0ef9ef74c806 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -5,6 +5,7 @@
 #include <asm/atomic.h>
 #include <linux/sched.h>
 #include <linux/clocksource.h>
+#include <linux/mmiotrace.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
@@ -14,6 +15,8 @@ enum trace_type {
 	TRACE_WAKE,
 	TRACE_STACK,
 	TRACE_SPECIAL,
+	TRACE_MMIO_RW,
+	TRACE_MMIO_MAP,
 
 	__TRACE_LAST_TYPE
 };
@@ -75,6 +78,8 @@ struct trace_entry {
 		struct ctx_switch_entry		ctx;
 		struct special_entry		special;
 		struct stack_entry		stack;
+		struct mmiotrace_rw		mmiorw;
+		struct mmiotrace_map		mmiomap;
 	};
 };
 
@@ -255,6 +260,15 @@ extern unsigned long ftrace_update_tot_cnt;
 extern int DYN_FTRACE_TEST_NAME(void);
 #endif
 
+#ifdef CONFIG_MMIOTRACE
+extern void __trace_mmiotrace_rw(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct mmiotrace_rw *rw);
+extern void __trace_mmiotrace_map(struct trace_array *tr,
+				struct trace_array_cpu *data,
+				struct mmiotrace_map *map);
+#endif
+
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 #ifdef CONFIG_FTRACE
 extern int trace_selftest_startup_function(struct tracer *trace,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index e4dd03cc5aa6..3a12b1ad0c63 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -11,19 +11,26 @@
 
 #include "trace.h"
 
-extern void
-__trace_special(void *__tr, void *__data,
-		unsigned long arg1, unsigned long arg2, unsigned long arg3);
-
 static struct trace_array *mmio_trace_array;
 
+static void mmio_reset_data(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr->data[cpu]);
+}
 
 static void mmio_trace_init(struct trace_array *tr)
 {
 	pr_debug("in %s\n", __func__);
 	mmio_trace_array = tr;
-	if (tr->ctrl)
+	if (tr->ctrl) {
+		mmio_reset_data(tr);
 		enable_mmiotrace();
+	}
 }
 
 static void mmio_trace_reset(struct trace_array *tr)
@@ -31,15 +38,110 @@ static void mmio_trace_reset(struct trace_array *tr)
 	pr_debug("in %s\n", __func__);
 	if (tr->ctrl)
 		disable_mmiotrace();
+	mmio_reset_data(tr);
+	mmio_trace_array = NULL;
 }
 
 static void mmio_trace_ctrl_update(struct trace_array *tr)
 {
 	pr_debug("in %s\n", __func__);
-	if (tr->ctrl)
+	if (tr->ctrl) {
+		mmio_reset_data(tr);
 		enable_mmiotrace();
-	else
+	} else {
 		disable_mmiotrace();
+	}
+}
+
+/* XXX: This is not called for trace_pipe file! */
+void mmio_print_header(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	trace_seq_printf(s, "VERSION broken 20070824\n");
+	/* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */
+}
+
+static int mmio_print_rw(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct mmiotrace_rw *rw	= &entry->mmiorw;
+	struct trace_seq *s	= &iter->seq;
+	unsigned long long t	= ns2usecs(entry->t);
+	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned secs		= (unsigned long)t;
+	int ret = 1;
+
+	switch (entry->mmiorw.opcode) {
+	case MMIO_READ:
+		ret = trace_seq_printf(s,
+			"R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n",
+			rw->width, secs, usec_rem, rw->map_id, rw->phys,
+			rw->value, rw->pc, entry->pid);
+		break;
+	case MMIO_WRITE:
+		ret = trace_seq_printf(s,
+			"W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n",
+			rw->width, secs, usec_rem, rw->map_id, rw->phys,
+			rw->value, rw->pc, entry->pid);
+		break;
+	case MMIO_UNKNOWN_OP:
+		ret = trace_seq_printf(s,
+			"UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n",
+			secs, usec_rem, rw->map_id, rw->phys,
+			(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
+			(rw->value >> 0) & 0xff, rw->pc, entry->pid);
+		break;
+	default:
+		ret = trace_seq_printf(s, "rw what?\n");
+		break;
+	}
+	if (ret)
+		return 1;
+	return 0;
+}
+
+static int mmio_print_map(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+	struct mmiotrace_map *m	= &entry->mmiomap;
+	struct trace_seq *s	= &iter->seq;
+	unsigned long long t	= ns2usecs(entry->t);
+	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned secs		= (unsigned long)t;
+	int ret = 1;
+
+	switch (entry->mmiorw.opcode) {
+	case MMIO_PROBE:
+		ret = trace_seq_printf(s,
+			"MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n",
+			secs, usec_rem, m->map_id, m->phys, m->virt, m->len,
+			0UL, entry->pid);
+		break;
+	case MMIO_UNPROBE:
+		ret = trace_seq_printf(s,
+			"UNMAP %lu.%06lu %d 0x%lx %d\n",
+			secs, usec_rem, m->map_id, 0UL, entry->pid);
+		break;
+	default:
+		ret = trace_seq_printf(s, "map what?\n");
+		break;
+	}
+	if (ret)
+		return 1;
+	return 0;
+}
+
+/* return 0 to abort printing without consuming current entry in pipe mode */
+static int mmio_print_line(struct trace_iterator *iter)
+{
+	switch (iter->ent->type) {
+	case TRACE_MMIO_RW:
+		return mmio_print_rw(iter);
+	case TRACE_MMIO_MAP:
+		return mmio_print_map(iter);
+	default:
+		return 1; /* ignore unknown entries */
+	}
 }
 
 static struct tracer mmio_tracer __read_mostly =
@@ -47,38 +149,31 @@ static struct tracer mmio_tracer __read_mostly =
 	.name		= "mmiotrace",
 	.init		= mmio_trace_init,
 	.reset		= mmio_trace_reset,
+	.open		= mmio_print_header,
 	.ctrl_update	= mmio_trace_ctrl_update,
+	.print_line	= mmio_print_line,
 };
 
 __init static int init_mmio_trace(void)
 {
-	int ret = init_mmiotrace();
-	if (ret)
-		return ret;
 	return register_tracer(&mmio_tracer);
 }
 device_initcall(init_mmio_trace);
 
-void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg)
+void mmio_trace_rw(struct mmiotrace_rw *rw)
 {
 	struct trace_array *tr = mmio_trace_array;
 	struct trace_array_cpu *data = tr->data[smp_processor_id()];
+	__trace_mmiotrace_rw(tr, data, rw);
+}
 
-	if (!current || current->pid == 0) {
-		/*
-		 * XXX: This is a problem. We need to able to record, no
-		 * matter what. tracing_generic_entry_update() would crash.
-		 */
-		static unsigned limit;
-		if (limit++ < 12)
-			pr_err("Error in %s: no current.\n", __func__);
-		return;
-	}
-	if (!tr || !data) {
-		static unsigned limit;
-		if (limit++ < 12)
-			pr_err("%s: no tr or data\n", __func__);
-		return;
-	}
-	__trace_special(tr, data, type, addr, arg);
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+	struct trace_array *tr = mmio_trace_array;
+	struct trace_array_cpu *data;
+
+	preempt_disable();
+	data = tr->data[smp_processor_id()];
+	__trace_mmiotrace_map(tr, data, map);
+	preempt_enable();
 }
-- 
cgit v1.2.3


From 138295373ccf7625fcb0218dfea114837983bc39 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:20:58 +0200
Subject: ftrace: mmiotrace update, #2

another weekend, another patch. This should apply on top of my previous patch
from March 23rd.

Summary of changes:
- Print PCI device list in output header
- work around recursive probe hits on SMP
- refactor dis/arm_kmmio_fault_page() and add check for page levels
- remove un/reference_kmmio(), the die notifier hook is registered
permanently into the list
- explicitly check for single stepping in die notifier callback

I have tested this version on my UP Athlon64 desktop with Nouveau, and
SMP Core 2 Duo laptop with the proprietary nvidia driver. Both systems
are 64-bit. One previously unknown bug crept into daylight: the ftrace
framework's output routines print the first entry last after buffer has
wrapped around.

The most important regressions compared to non-ftrace mmiotrace at this
time are:
- failure of trace_pipe file
- illegal lines in output file
- unaware of losing data due to buffer full

Personally I'd like to see these three solved before submitting to
mainline. Other issues may come up once we know when we lose events.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_mmiotrace.c | 47 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 3a12b1ad0c63..361472b5788c 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -8,6 +8,7 @@
 
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
+#include <linux/pci.h>
 
 #include "trace.h"
 
@@ -53,12 +54,52 @@ static void mmio_trace_ctrl_update(struct trace_array *tr)
 	}
 }
 
+static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+	int ret = 0;
+	int i;
+	resource_size_t start, end;
+	const struct pci_driver *drv = pci_dev_driver(dev);
+
+	/* XXX: incomplete checks for trace_seq_printf() return value */
+	ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+				dev->bus->number, dev->devfn,
+				dev->vendor, dev->device, dev->irq);
+	/*
+	 * XXX: is pci_resource_to_user() appropriate, since we are
+	 * supposed to interpret the __ioremap() phys_addr argument based on
+	 * these printed values?
+	 */
+	for (i = 0; i < 7; i++) {
+		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		ret += trace_seq_printf(s, " %llx",
+			(unsigned long long)(start |
+			(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+	}
+	for (i = 0; i < 7; i++) {
+		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+		ret += trace_seq_printf(s, " %llx",
+			dev->resource[i].start < dev->resource[i].end ?
+			(unsigned long long)(end - start) + 1 : 0);
+	}
+	if (drv)
+		ret += trace_seq_printf(s, " %s\n", drv->name);
+	else
+		ret += trace_seq_printf(s, " \n");
+	return ret;
+}
+
 /* XXX: This is not called for trace_pipe file! */
-void mmio_print_header(struct trace_iterator *iter)
+static void mmio_print_header(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
-	trace_seq_printf(s, "VERSION broken 20070824\n");
-	/* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */
+	struct pci_dev *dev = NULL;
+
+	trace_seq_printf(s, "VERSION 20070824\n");
+
+	for_each_pci_dev(dev)
+		mmio_print_pcidev(s, dev);
+	/* XXX: return value? What if header is very long? */
 }
 
 static int mmio_print_rw(struct trace_iterator *iter)
-- 
cgit v1.2.3


From 801a175bf601f9a9d5e86e92dee9adeeb6625da8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:20:58 +0200
Subject: mmiotrace: ftrace fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d14fe49e9638..4dcc4e85c5d6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -838,12 +838,16 @@ void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
 	struct trace_entry *entry;
 	unsigned long irq_flags;
 
-	spin_lock_irqsave(&data->lock, irq_flags);
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, 0);
 	entry->type		= TRACE_MMIO_RW;
 	entry->mmiorw		= *rw;
-	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
 
 	trace_wake_up();
 }
@@ -854,12 +858,16 @@ void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
 	struct trace_entry *entry;
 	unsigned long irq_flags;
 
-	spin_lock_irqsave(&data->lock, irq_flags);
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&data->lock);
+
 	entry			= tracing_get_trace_entry(tr, data);
 	tracing_generic_entry_update(entry, 0);
 	entry->type		= TRACE_MMIO_MAP;
 	entry->mmiomap		= *map;
-	spin_unlock_irqrestore(&data->lock, irq_flags);
+
+	__raw_spin_unlock(&data->lock);
+	raw_local_irq_restore(irq_flags);
 
 	trace_wake_up();
 }
-- 
cgit v1.2.3


From 736ca61fa81874b3fee205a593251b1869d0bcf1 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:20:59 +0200
Subject: x86 mmiotrace: Do not print bogus pid

Non-zero pid indicates the MMIO access originated in user space.
We do not catch that kind of accesses yet, so always print zero for now.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_mmiotrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 361472b5788c..79be4a18ec1e 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -117,20 +117,20 @@ static int mmio_print_rw(struct trace_iterator *iter)
 		ret = trace_seq_printf(s,
 			"R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n",
 			rw->width, secs, usec_rem, rw->map_id, rw->phys,
-			rw->value, rw->pc, entry->pid);
+			rw->value, rw->pc, 0);
 		break;
 	case MMIO_WRITE:
 		ret = trace_seq_printf(s,
 			"W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n",
 			rw->width, secs, usec_rem, rw->map_id, rw->phys,
-			rw->value, rw->pc, entry->pid);
+			rw->value, rw->pc, 0);
 		break;
 	case MMIO_UNKNOWN_OP:
 		ret = trace_seq_printf(s,
 			"UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n",
 			secs, usec_rem, rw->map_id, rw->phys,
 			(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
-			(rw->value >> 0) & 0xff, rw->pc, entry->pid);
+			(rw->value >> 0) & 0xff, rw->pc, 0);
 		break;
 	default:
 		ret = trace_seq_printf(s, "rw what?\n");
-- 
cgit v1.2.3


From d0a7e8ca5b996d36219e6fc002907291c8ee677b Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:21:02 +0200
Subject: mmiotrace: print header using the read hook.

Now the header is printed only for `trace_pipe' file.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_mmiotrace.c | 60 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 79be4a18ec1e..6d2edbdde939 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,6 +12,10 @@
 
 #include "trace.h"
 
+struct header_iter {
+	struct pci_dev *dev;
+};
+
 static struct trace_array *mmio_trace_array;
 
 static void mmio_reset_data(struct trace_array *tr)
@@ -89,17 +93,57 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
 	return ret;
 }
 
-/* XXX: This is not called for trace_pipe file! */
-static void mmio_print_header(struct trace_iterator *iter)
+static void destroy_header_iter(struct header_iter *hiter)
+{
+	if (!hiter)
+		return;
+	pci_dev_put(hiter->dev);
+	kfree(hiter);
+}
+
+static void mmio_pipe_open(struct trace_iterator *iter)
 {
+	struct header_iter *hiter;
 	struct trace_seq *s = &iter->seq;
-	struct pci_dev *dev = NULL;
 
 	trace_seq_printf(s, "VERSION 20070824\n");
 
-	for_each_pci_dev(dev)
-		mmio_print_pcidev(s, dev);
-	/* XXX: return value? What if header is very long? */
+	hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+	if (!hiter)
+		return;
+
+	hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
+	iter->private = hiter;
+}
+
+/* XXX: This is not called when the pipe is closed! */
+static void mmio_close(struct trace_iterator *iter)
+{
+	struct header_iter *hiter = iter->private;
+	destroy_header_iter(hiter);
+	iter->private = NULL;
+}
+
+static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
+				char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	ssize_t ret;
+	struct header_iter *hiter = iter->private;
+	struct trace_seq *s = &iter->seq;
+
+	if (!hiter)
+		return 0;
+
+	mmio_print_pcidev(s, hiter->dev);
+	hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
+
+	if (!hiter->dev) {
+		destroy_header_iter(hiter);
+		iter->private = NULL;
+	}
+
+	ret = trace_seq_to_user(s, ubuf, cnt);
+	return (ret == -EBUSY) ? 0 : ret;
 }
 
 static int mmio_print_rw(struct trace_iterator *iter)
@@ -190,7 +234,9 @@ static struct tracer mmio_tracer __read_mostly =
 	.name		= "mmiotrace",
 	.init		= mmio_trace_init,
 	.reset		= mmio_trace_reset,
-	.open		= mmio_print_header,
+	.pipe_open	= mmio_pipe_open,
+	.close		= mmio_close,
+	.read		= mmio_read,
 	.ctrl_update	= mmio_trace_ctrl_update,
 	.print_line	= mmio_print_line,
 };
-- 
cgit v1.2.3


From 2039238b79b51a50f8477f53f33750e1c3fc146a Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:21:02 +0200
Subject: mmiotrace: print overrun counts

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_mmiotrace.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 6d2edbdde939..d0f649a2203d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -17,11 +17,13 @@ struct header_iter {
 };
 
 static struct trace_array *mmio_trace_array;
+static bool overrun_detected;
 
 static void mmio_reset_data(struct trace_array *tr)
 {
 	int cpu;
 
+	overrun_detected = false;
 	tr->time_start = ftrace_now(tr->cpu);
 
 	for_each_online_cpu(cpu)
@@ -124,12 +126,34 @@ static void mmio_close(struct trace_iterator *iter)
 	iter->private = NULL;
 }
 
+static unsigned long count_overruns(struct trace_iterator *iter)
+{
+	int cpu;
+	unsigned long cnt = 0;
+	for_each_online_cpu(cpu) {
+		cnt += iter->overrun[cpu];
+		iter->overrun[cpu] = 0;
+	}
+	return cnt;
+}
+
 static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
 				char __user *ubuf, size_t cnt, loff_t *ppos)
 {
 	ssize_t ret;
 	struct header_iter *hiter = iter->private;
 	struct trace_seq *s = &iter->seq;
+	unsigned long n;
+
+	n = count_overruns(iter);
+	if (n) {
+		/* XXX: This is later than where events were lost. */
+		trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
+		if (!overrun_detected)
+			pr_warning("mmiotrace has lost events.\n");
+		overrun_detected = true;
+		goto print_out;
+	}
 
 	if (!hiter)
 		return 0;
@@ -142,6 +166,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
 		iter->private = NULL;
 	}
 
+print_out:
 	ret = trace_seq_to_user(s, ubuf, cnt);
 	return (ret == -EBUSY) ? 0 : ret;
 }
-- 
cgit v1.2.3


From e0fd5c2fa188311667267c02a702ae699a9fc2bd Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:21:02 +0200
Subject: mmiotrace: do not print bogus pid for maps either

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_mmiotrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d0f649a2203d..3c1dacdc2d85 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -225,12 +225,12 @@ static int mmio_print_map(struct trace_iterator *iter)
 		ret = trace_seq_printf(s,
 			"MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n",
 			secs, usec_rem, m->map_id, m->phys, m->virt, m->len,
-			0UL, entry->pid);
+			0UL, 0);
 		break;
 	case MMIO_UNPROBE:
 		ret = trace_seq_printf(s,
 			"UNMAP %lu.%06lu %d 0x%lx %d\n",
-			secs, usec_rem, m->map_id, 0UL, entry->pid);
+			secs, usec_rem, m->map_id, 0UL, 0);
 		break;
 	default:
 		ret = trace_seq_printf(s, "map what?\n");
-- 
cgit v1.2.3


From dee310d0adf41019aca476052ac3085ff286d9be Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 12 May 2008 21:21:03 +0200
Subject: x86 mmiotrace: use resource_size_t for phys addresses

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_mmiotrace.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 3c1dacdc2d85..b13dc19dcbb4 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -184,20 +184,23 @@ static int mmio_print_rw(struct trace_iterator *iter)
 	switch (entry->mmiorw.opcode) {
 	case MMIO_READ:
 		ret = trace_seq_printf(s,
-			"R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n",
-			rw->width, secs, usec_rem, rw->map_id, rw->phys,
+			"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			rw->width, secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
 			rw->value, rw->pc, 0);
 		break;
 	case MMIO_WRITE:
 		ret = trace_seq_printf(s,
-			"W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n",
-			rw->width, secs, usec_rem, rw->map_id, rw->phys,
+			"W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			rw->width, secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
 			rw->value, rw->pc, 0);
 		break;
 	case MMIO_UNKNOWN_OP:
 		ret = trace_seq_printf(s,
-			"UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n",
-			secs, usec_rem, rw->map_id, rw->phys,
+			"UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+			secs, usec_rem, rw->map_id,
+			(unsigned long long)rw->phys,
 			(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
 			(rw->value >> 0) & 0xff, rw->pc, 0);
 		break;
@@ -223,8 +226,9 @@ static int mmio_print_map(struct trace_iterator *iter)
 	switch (entry->mmiorw.opcode) {
 	case MMIO_PROBE:
 		ret = trace_seq_printf(s,
-			"MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n",
-			secs, usec_rem, m->map_id, m->phys, m->virt, m->len,
+			"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+			secs, usec_rem, m->map_id,
+			(unsigned long long)m->phys, m->virt, m->len,
 			0UL, 0);
 		break;
 	case MMIO_UNPROBE:
-- 
cgit v1.2.3


From 4d2df795f0c3eb91f97a666f47716121a2f166ed Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 24 May 2008 15:00:46 +0200
Subject: sysprof: make it depend on X86

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e101c9a85f0f..263e9e6bbd60 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,7 +77,7 @@ config PREEMPT_TRACER
 
 config SYSPROF_TRACER
 	bool "Sysprof Tracer"
-	depends on DEBUG_KERNEL
+	depends on X86
 	select TRACING
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
-- 
cgit v1.2.3


From 81d50bb254ed53a0da45a65988e4e1fa08e8a541 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Thu, 15 May 2008 19:42:49 -0700
Subject: posix-timers: print RT watchdog message

It's useful to detect which process is killed by RT watchdog.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/posix-cpu-timers.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index f1525ad06cb3..c42a03aef36f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1037,6 +1037,9 @@ static void check_thread_timers(struct task_struct *tsk,
 				sig->rlim[RLIMIT_RTTIME].rlim_cur +=
 								USEC_PER_SEC;
 			}
+			printk(KERN_INFO
+				"RT Watchdog Timeout: %s[%d]\n",
+				tsk->comm, task_pid_nr(tsk));
 			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
 		}
 	}
-- 
cgit v1.2.3


From 7f6f3a39d258adf51f0fb1fe0dab52272a1c61a4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 24 May 2008 23:12:18 +0200
Subject: namespacecheck: fix kernel printk.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/printk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..b620e3d96136 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -231,7 +231,7 @@ static inline void boot_delay_msec(void)
 /*
  * Return the number of unread characters in the log buffer.
  */
-int log_buf_get_len(void)
+static int log_buf_get_len(void)
 {
 	return logged_chars;
 }
@@ -270,7 +270,7 @@ int log_buf_copy(char *dest, int idx, int len)
 /*
  * Extract a single character from the log buffer.
  */
-int log_buf_read(int idx)
+static int log_buf_read(int idx)
 {
 	char ret;
 
-- 
cgit v1.2.3


From 42fdfa238a23643226910acf922ea930b3286032 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 24 May 2008 23:14:51 +0200
Subject: namespacecheck: more kernel/printk.c fixes

[ Stephen Rothwell <sfr@canb.auug.org.au>: build fix ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/printk.c | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index b620e3d96136..55d16e57499a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -267,19 +267,6 @@ int log_buf_copy(char *dest, int idx, int len)
 	return ret;
 }
 
-/*
- * Extract a single character from the log buffer.
- */
-static int log_buf_read(int idx)
-{
-	char ret;
-
-	if (log_buf_copy(&ret, idx, 1) == 1)
-		return ret;
-	else
-		return -1;
-}
-
 /*
  * Commands to do_syslog:
  *
-- 
cgit v1.2.3


From 3b8945e8d40645eecdb7d2357ca531f9b4dd9f71 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: printk: clean up recursion check related static variables

Make printk_recursion_bug_msg static and drop printk prefix from recursion
variables.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/printk.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 55d16e57499a..8b42f87e311d 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -652,16 +652,14 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
 	spin_unlock(&logbuf_lock);
 	return retval;
 }
-
-const char printk_recursion_bug_msg [] =
-			KERN_CRIT "BUG: recent printk recursion!\n";
-static int printk_recursion_bug;
+static const char recursion_bug_msg [] =
+		KERN_CRIT "BUG: recent printk recursion!\n";
+static int recursion_bug;
+static int log_level_unknown = 1;
+static char printk_buf[1024];
 
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
-	static int log_level_unknown = 1;
-	static char printk_buf[1024];
-
 	unsigned long flags;
 	int printed_len = 0;
 	int this_cpu;
@@ -686,7 +684,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		 * it can be printed at the next appropriate moment:
 		 */
 		if (!oops_in_progress) {
-			printk_recursion_bug = 1;
+			recursion_bug = 1;
 			goto out_restore_irqs;
 		}
 		zap_locks();
@@ -696,10 +694,10 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	spin_lock(&logbuf_lock);
 	printk_cpu = this_cpu;
 
-	if (printk_recursion_bug) {
-		printk_recursion_bug = 0;
-		strcpy(printk_buf, printk_recursion_bug_msg);
-		printed_len = sizeof(printk_recursion_bug_msg);
+	if (recursion_bug) {
+		recursion_bug = 0;
+		strcpy(printk_buf, recursion_bug_msg);
+		printed_len = sizeof(recursion_bug_msg);
 	}
 	/* Emit the output into the temporary buffer */
 	printed_len += vscnprintf(printk_buf + printed_len,
-- 
cgit v1.2.3


From cd3a1b8562d28490b334a61d5eb05df3d722d91e Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: printk: don't prefer unsuited consoles on registration

console election: If some console happens to be registered first which does
not provide a tty binding (!console->device), it prevents that more suited
consoles which are registered later on can enter the candidate pool for
console_device().  This is observable with KGDB's console which may already
be registered (and exploited!) during early debugger connections, that is
before any regular console registration.

This patch fixes the issue by postponing the final, automated
preferred_console selection until someone with a non-NULL device handler
comes around.

Signed-off-by: Jan Kiszka <jan.kiszka@web.de>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/printk.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 8b42f87e311d..7d5556152234 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1157,8 +1157,11 @@ void register_console(struct console *console)
 			console->index = 0;
 		if (console->setup == NULL ||
 		    console->setup(console, NULL) == 0) {
-			console->flags |= CON_ENABLED | CON_CONSDEV;
-			preferred_console = 0;
+			console->flags |= CON_ENABLED;
+			if (console->device) {
+				console->flags |= CON_CONSDEV;
+				preferred_console = 0;
+			}
 		}
 	}
 
-- 
cgit v1.2.3


From ac60ad7413ca8208094609a3b88ed9b1ed012fbc Mon Sep 17 00:00:00 2001
From: Nick Andrew <nick@nick-andrew.net>
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: printk: refactor processing of line severity tokens

Restructure the logic of vprintk() so the processing of the leading
3 characters of each input line is in one place, regardless whether
printk_time is enabled. This makes the code smaller and easier to
understand.

size reduction in kernel/printk.o:

   text	   data	    bss	    dec	    hex	filename
   6157	    397	1049804	1056358	 101e66	printk.o.before
   6117	    397	1049804	1056318	 101e3e	printk.o.after

and some style uncleanlinesses removed as well as a side-effect:

 Before:
    total: 19 errors, 22 warnings, 1340 lines checked
 After:
    total: 17 errors, 22 warnings, 1333 lines checked

Signed-off-by: Nick Andrew <nick@nick-andrew.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/printk.c | 61 +++++++++++++++++++++++++--------------------------------
 1 file changed, 27 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 7d5556152234..98ca1b76277f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -655,13 +655,13 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
 static const char recursion_bug_msg [] =
 		KERN_CRIT "BUG: recent printk recursion!\n";
 static int recursion_bug;
-static int log_level_unknown = 1;
+	static int new_text_line = 1;
 static char printk_buf[1024];
 
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
-	unsigned long flags;
 	int printed_len = 0;
+	unsigned long flags;
 	int this_cpu;
 	char *p;
 
@@ -703,61 +703,54 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	printed_len += vscnprintf(printk_buf + printed_len,
 				  sizeof(printk_buf) - printed_len, fmt, args);
 
+
 	/*
 	 * Copy the output into log_buf.  If the caller didn't provide
 	 * appropriate log level tags, we insert them here
 	 */
 	for (p = printk_buf; *p; p++) {
-		if (log_level_unknown) {
-                        /* log_level_unknown signals the start of a new line */
+		if (new_text_line) {
+			int current_log_level = default_message_loglevel;
+			/* If a token, set current_log_level and skip over */
+			if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
+			    p[2] == '>') {
+				current_log_level = p[1] - '0';
+				p += 3;
+				printed_len -= 3;
+			}
+
+			/* Always output the token */
+			emit_log_char('<');
+			emit_log_char(current_log_level + '0');
+			emit_log_char('>');
+			printed_len += 3;
+			new_text_line = 0;
+
 			if (printk_time) {
-				int loglev_char;
+				/* Follow the token with the time */
 				char tbuf[50], *tp;
 				unsigned tlen;
 				unsigned long long t;
 				unsigned long nanosec_rem;
 
-				/*
-				 * force the log level token to be
-				 * before the time output.
-				 */
-				if (p[0] == '<' && p[1] >='0' &&
-				   p[1] <= '7' && p[2] == '>') {
-					loglev_char = p[1];
-					p += 3;
-					printed_len -= 3;
-				} else {
-					loglev_char = default_message_loglevel
-						+ '0';
-				}
 				t = cpu_clock(printk_cpu);
 				nanosec_rem = do_div(t, 1000000000);
-				tlen = sprintf(tbuf,
-						"<%c>[%5lu.%06lu] ",
-						loglev_char,
-						(unsigned long)t,
-						nanosec_rem/1000);
+				tlen = sprintf(tbuf, "[%5lu.%06lu] ",
+						(unsigned long) t,
+						nanosec_rem / 1000);
 
 				for (tp = tbuf; tp < tbuf + tlen; tp++)
 					emit_log_char(*tp);
 				printed_len += tlen;
-			} else {
-				if (p[0] != '<' || p[1] < '0' ||
-				   p[1] > '7' || p[2] != '>') {
-					emit_log_char('<');
-					emit_log_char(default_message_loglevel
-						+ '0');
-					emit_log_char('>');
-					printed_len += 3;
-				}
 			}
-			log_level_unknown = 0;
+
 			if (!*p)
 				break;
 		}
+
 		emit_log_char(*p);
 		if (*p == '\n')
-			log_level_unknown = 1;
+			new_text_line = 1;
 	}
 
 	/*
-- 
cgit v1.2.3


From 091593080533a752ce66d12858d8c4105c8e1793 Mon Sep 17 00:00:00 2001
From: Nick Andrew <nick@nick-andrew.net>
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: printk: remember the message level for multi-line output

printk(KERN_ALERT "Danger Will Robinson!\nAlien Approaching!\n");

At present this will result in one message at ALERT level and one
at the current default message loglevel (e.g. WARNING). This is
non-intuitive.

Modify vprintk() to remember the message loglevel each time it
is specified and use it for subsequent lines of output which do
not specify one, within the same call to printk.

Signed-off-by: Nick Andrew <nick@nick-andrew.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 98ca1b76277f..475fc22a2857 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -661,6 +661,7 @@ static char printk_buf[1024];
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
 	int printed_len = 0;
+	int current_log_level = default_message_loglevel;
 	unsigned long flags;
 	int this_cpu;
 	char *p;
@@ -710,7 +711,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	 */
 	for (p = printk_buf; *p; p++) {
 		if (new_text_line) {
-			int current_log_level = default_message_loglevel;
 			/* If a token, set current_log_level and skip over */
 			if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
 			    p[2] == '>') {
-- 
cgit v1.2.3


From 9c44bc03fff44ff04237a7d92e35304a0e50c331 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:04 +0200
Subject: softlockup: allow panic on lockup

allow users to configure the softlockup detector to generate a panic
instead of a warning message.

high-availability systems might opt for this strict method (combined
with panic_timeout= boot option/sysctl), instead of generating
softlockup warnings ad infinitum.

also, automated tests work better if the system reboots reliably (into
a safe kernel) in case of a lockup.

The full spectrum of configurability is supported: boot option, sysctl
option and Kconfig option.

it's default-disabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/softlockup.c | 21 +++++++++++++++++++++
 kernel/sysctl.c     | 11 +++++++++++
 2 files changed, 32 insertions(+)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 01b6522fd92b..78e0ad21cb0c 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -27,6 +27,21 @@ static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 static int __read_mostly did_panic;
 unsigned long __read_mostly softlockup_thresh = 60;
 
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * soft-lockup occurs:
+ */
+unsigned int __read_mostly softlockup_panic =
+				CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
+
+static int __init softlockup_panic_setup(char *str)
+{
+	softlockup_panic = simple_strtoul(str, NULL, 0);
+
+	return 1;
+}
+__setup("softlockup_panic=", softlockup_panic_setup);
+
 static int
 softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
 {
@@ -120,6 +135,9 @@ void softlockup_tick(void)
 	else
 		dump_stack();
 	spin_unlock(&print_lock);
+
+	if (softlockup_panic)
+		panic("softlockup: hung tasks");
 }
 
 /*
@@ -172,6 +190,9 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
 
 	t->last_switch_timestamp = now;
 	touch_nmi_watchdog();
+
+	if (softlockup_panic)
+		panic("softlockup: blocked tasks");
 }
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..2d3b388c402d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -727,6 +727,17 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "softlockup_panic",
+		.data		= &softlockup_panic,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "softlockup_thresh",
-- 
cgit v1.2.3


From 9383d9679056e6cc4e7ff70f31da945a268238f4 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: softlockup: fix softlockup_thresh unaligned access and disable
 detection at runtime

Fix unaligned access errors when setting softlockup_thresh on
64 bit platforms.

Allow softlockup detection to be disabled by setting
softlockup_thresh <= 0.

Detect that boot time softlockup detection has been disabled
earlier in softlockup_tick.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/softlockup.c | 12 ++++++++++--
 kernel/sysctl.c     |  9 +++++----
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 78e0ad21cb0c..a3a0b239b7f7 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
 static int __read_mostly did_panic;
-unsigned long __read_mostly softlockup_thresh = 60;
+int __read_mostly softlockup_thresh = 60;
 
 /*
  * Should we panic (and reboot, if panic_timeout= is set) when a
@@ -94,6 +94,14 @@ void softlockup_tick(void)
 	struct pt_regs *regs = get_irq_regs();
 	unsigned long now;
 
+	/* Is detection switched off? */
+	if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
+		/* Be sure we don't false trigger if switched back on */
+		if (touch_timestamp)
+			per_cpu(touch_timestamp, this_cpu) = 0;
+		return;
+	}
+
 	if (touch_timestamp == 0) {
 		touch_softlockup_watchdog();
 		return;
@@ -104,7 +112,7 @@ void softlockup_tick(void)
 	/* report at most once a second */
 	if ((print_timestamp >= touch_timestamp &&
 			print_timestamp < (touch_timestamp + 1)) ||
-			did_panic || !per_cpu(watchdog_task, this_cpu)) {
+			did_panic) {
 		return;
 	}
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d3b388c402d..31c19a79738d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -84,12 +84,13 @@ extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 
 /* Constants used for minimum and  maximum */
-#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
+#ifdef CONFIG_HIGHMEM
 static int one = 1;
 #endif
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
+static int neg_one = -1;
 #endif
 
 #ifdef CONFIG_MMU
@@ -742,11 +743,11 @@ static struct ctl_table kern_table[] = {
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "softlockup_thresh",
 		.data		= &softlockup_thresh,
-		.maxlen		= sizeof(unsigned long),
+		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= &proc_dointvec_minmax,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &one,
+		.extra1		= &neg_one,
 		.extra2		= &sixty,
 	},
 	{
-- 
cgit v1.2.3


From 1c4cd6dd1d0fd3057bb6b8c87460049497889d1b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: softlockup: fix softlockup_thresh fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 31c19a79738d..a829dc8d7a9e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -84,7 +84,7 @@ extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 
 /* Constants used for minimum and  maximum */
-#ifdef CONFIG_HIGHMEM
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
 static int one = 1;
 #endif
 
-- 
cgit v1.2.3


From 962cf36c5bf6d2840b8d66ee9a606fae2f540bbd Mon Sep 17 00:00:00 2001
From: "Carlos R. Mafra" <crmafra2@gmail.com>
Date: Thu, 15 May 2008 11:15:37 -0300
Subject: Remove argument from open_softirq which is always NULL

As git-grep shows, open_softirq() is always called with the last argument
being NULL

block/blk-core.c:       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
kernel/hrtimer.c:       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
kernel/rcuclassic.c:    open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
kernel/rcupreempt.c:    open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
kernel/sched.c: open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
kernel/softirq.c:       open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
kernel/softirq.c:       open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
kernel/timer.c: open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
net/core/dev.c: open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
net/core/dev.c: open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);

This observation has already been made by Matthew Wilcox in June 2002
(http://www.cs.helsinki.fi/linux/linux-kernel/2002-25/0687.html)

"I notice that none of the current softirq routines use the data element
passed to them."

and the situation hasn't changed since them. So it appears we can safely
remove that extra argument to save 128 (54) bytes of kernel data (text).

Signed-off-by: Carlos R. Mafra <crmafra@ift.unesp.br>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c    | 2 +-
 kernel/rcuclassic.c | 2 +-
 kernel/rcupreempt.c | 2 +-
 kernel/sched.c      | 2 +-
 kernel/softirq.c    | 7 +++----
 kernel/timer.c      | 2 +-
 6 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..861b4088092a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1669,7 +1669,7 @@ void __init hrtimers_init(void)
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
 #ifdef CONFIG_HIGH_RES_TIMERS
-	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL);
+	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
 #endif
 }
 
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..f6e01f3ae9c6 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -529,7 +529,7 @@ static void __cpuinit rcu_online_cpu(int cpu)
 
 	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
 	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..9dd827db359f 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1125,7 +1125,7 @@ void __init __rcu_init(void)
 	for_each_online_cpu(cpu)
 		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,	(void *)(long) cpu);
 
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
 /*
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..56ea3a203a5a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8154,7 +8154,7 @@ void __init sched_init(void)
 #endif
 
 #ifdef CONFIG_SMP
-	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
+	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..059256874e9b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,9 +347,8 @@ void raise_softirq(unsigned int nr)
 	local_irq_restore(flags);
 }
 
-void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
+void open_softirq(int nr, void (*action)(struct softirq_action *))
 {
-	softirq_vec[nr].data = data;
 	softirq_vec[nr].action = action;
 }
 
@@ -503,8 +502,8 @@ void __init softirq_init(void)
 			&per_cpu(tasklet_hi_vec, cpu).head;
 	}
 
-	open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL);
-	open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
+	open_softirq(TASKLET_SOFTIRQ, tasklet_action);
+	open_softirq(HI_SOFTIRQ, tasklet_hi_action);
 }
 
 static int ksoftirqd(void * __bind_cpu)
diff --git a/kernel/timer.c b/kernel/timer.c
index ceacc6626572..b4da888497fa 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1502,7 +1502,7 @@ void __init init_timers(void)
 
 	BUG_ON(err == NOTIFY_BAD);
 	register_cpu_notifier(&timers_nb);
-	open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
+	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
 
 /**
-- 
cgit v1.2.3


From c6531cce6e6e4b99bcda46b6268d6f2d9e30aea4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: sched: do not trace sched_clock

The tracer uses sched_clock, so do not trace it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e2e985eeee78..6590a828138f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -884,12 +884,12 @@ static unsigned long long __cpu_clock(int cpu)
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
  */
-unsigned long long cpu_clock(int cpu)
+unsigned long long notrace cpu_clock(int cpu)
 {
 	unsigned long long prev_cpu_time, time, delta_time;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
 	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
 	delta_time = time-prev_cpu_time;
@@ -898,7 +898,7 @@ unsigned long long cpu_clock(int cpu)
 		time = __sync_cpu_clock(time, cpu);
 		per_cpu(prev_cpu_time, cpu) = time;
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return time;
 }
-- 
cgit v1.2.3


From 19384c0314342222b18d4c7f09cdce1ca74dfd2a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 22 May 2008 00:22:16 -0400
Subject: ftrace: limit use of check pages

The check_pages function is called often enough that it can cause problems
with trace outputs or even bringing the system to a halt.

This patch limits the check_pages to the places that are most likely to
have problems. The check is made at the flip between the global array and
the max save array, as well as when the size of the buffers changes and
the self tests.

This patch also removes the BUG_ON from check_pages and replaces it with
a WARN_ON and disabling of the tracer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: pq@iki.fi
Cc: proski@gnu.org
Cc: sandmann@redhat.com
Cc: a.p.zijlstra@chello.nl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c          | 32 +++++++++++++++++++++++---------
 kernel/trace/trace_selftest.c |  1 +
 2 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3271916ff033..0567f51bbea4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -249,24 +249,32 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tracing_record_cmdline(current);
 }
 
+#define CHECK_COND(cond)			\
+	if (unlikely(cond)) {			\
+		tracing_disabled = 1;		\
+		WARN_ON(1);			\
+		return -1;			\
+	}
+
 /**
  * check_pages - integrity check of trace buffers
  *
  * As a safty measure we check to make sure the data pages have not
- * been corrupted. TODO: configure to disable this because it adds
- * a bit of overhead.
+ * been corrupted.
  */
-void check_pages(struct trace_array_cpu *data)
+int check_pages(struct trace_array_cpu *data)
 {
 	struct page *page, *tmp;
 
-	BUG_ON(data->trace_pages.next->prev != &data->trace_pages);
-	BUG_ON(data->trace_pages.prev->next != &data->trace_pages);
+	CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
+	CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
 
 	list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
-		BUG_ON(page->lru.next->prev != &page->lru);
-		BUG_ON(page->lru.prev->next != &page->lru);
+		CHECK_COND(page->lru.next->prev != &page->lru);
+		CHECK_COND(page->lru.prev->next != &page->lru);
 	}
+
+	return 0;
 }
 
 /**
@@ -280,7 +288,6 @@ void *head_page(struct trace_array_cpu *data)
 {
 	struct page *page;
 
-	check_pages(data);
 	if (list_empty(&data->trace_pages))
 		return NULL;
 
@@ -2566,7 +2573,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 {
 	unsigned long val;
 	char buf[64];
-	int ret;
+	int i, ret;
 
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
@@ -2635,8 +2642,15 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 			trace_free_page();
 	}
 
+	/* check integrity */
+	for_each_tracing_cpu(i)
+		check_pages(global_trace.data[i]);
+
 	filp->f_pos += cnt;
 
+	/* If check pages failed, return ENOMEM */
+	if (tracing_disabled)
+		cnt = -ENOMEM;
  out:
 	max_tr.entries = global_trace.entries;
 	mutex_unlock(&trace_types_lock);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 3877dd9102f1..18c5423bc977 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -28,6 +28,7 @@ trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
 	page = list_entry(data->trace_pages.next, struct page, lru);
 	entries = page_address(page);
 
+	check_pages(data);
 	if (head_page(data) != entries)
 		goto failed;
 
-- 
cgit v1.2.3


From 4902f8849da6d2805bd291551a6dfd48f1b4f604 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 22 May 2008 00:22:18 -0400
Subject: ftrace: move ftrace_special to trace.c

Move the ftrace_special out of sched_switch to trace.c.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: pq@iki.fi
Cc: proski@gnu.org
Cc: sandmann@redhat.com
Cc: a.p.zijlstra@chello.nl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c              | 27 +++++++++++++++++++++++++--
 kernel/trace/trace_sched_switch.c | 24 ------------------------
 2 files changed, 25 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0567f51bbea4..583fe24903d3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -941,6 +941,30 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	trace_wake_up();
 }
 
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+
+	if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
+		return;
+
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		__trace_special(tr, data, arg1, arg2, arg3);
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
 #ifdef CONFIG_FTRACE
 static void
 function_trace_call(unsigned long ip, unsigned long parent_ip)
@@ -2941,8 +2965,6 @@ __init static int tracer_alloc_buffers(void)
 	int ret = -ENOMEM;
 	int i;
 
-	global_trace.ctrl = tracer_enabled;
-
 	/* TODO: make the number of buffers hot pluggable with CPUS */
 	tracing_nr_buffers = num_possible_cpus();
 	tracing_buffer_mask = cpu_possible_map;
@@ -3012,6 +3034,7 @@ __init static int tracer_alloc_buffers(void)
 	current_trace = &no_tracer;
 
 	/* All seems OK, enable tracing */
+	global_trace.ctrl = tracer_enabled;
 	tracing_disabled = 0;
 
 	return 0;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index d25ffa5eaf2b..798ec0dc863c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -125,30 +125,6 @@ wake_up_callback(void *probe_data, void *call_data,
 	wakeup_func(probe_data, __rq, task, curr);
 }
 
-void
-ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
-{
-	struct trace_array *tr = ctx_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	long disabled;
-	int cpu;
-
-	if (!tracer_enabled)
-		return;
-
-	local_irq_save(flags);
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1))
-		__trace_special(tr, data, arg1, arg2, arg3);
-
-	atomic_dec(&data->disabled);
-	local_irq_restore(flags);
-}
-
 static void sched_switch_reset(struct trace_array *tr)
 {
 	int cpu;
-- 
cgit v1.2.3


From 7e18d8e701b6798a5df11e0a16881a60ab1018b6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 22 May 2008 00:22:19 -0400
Subject: ftrace: add function tracing to wake up tracing

This patch adds function tracing to the functions that are called
on the CPU of the task being traced.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: pq@iki.fi
Cc: proski@gnu.org
Cc: sandmann@redhat.com
Cc: a.p.zijlstra@chello.nl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_sched_wakeup.c | 67 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5d2fb48e47f8..bf7e91caef57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -30,6 +30,69 @@ static DEFINE_SPINLOCK(wakeup_lock);
 
 static void __wakeup_reset(struct trace_array *tr);
 
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = wakeup_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int cpu;
+
+	if (likely(!wakeup_task))
+		return;
+
+	resched = need_resched();
+	preempt_disable_notrace();
+
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	if (unlikely(!wakeup_task))
+		goto unlock;
+
+	/*
+	 * The task can't disappear because it needs to
+	 * wake up first, and we have the wakeup_lock.
+	 */
+	if (task_cpu(wakeup_task) != cpu)
+		goto unlock;
+
+	trace_function(tr, data, ip, parent_ip, flags);
+
+ unlock:
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+
+ out:
+	atomic_dec(&data->disabled);
+
+	/*
+	 * To prevent recursion from the scheduler, if the
+	 * resched flag was set before we entered, then
+	 * don't reschedule.
+	 */
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = wakeup_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
 /*
  * Should this new latency be reported/recorded?
  */
@@ -73,7 +136,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
 	if (next != wakeup_task)
 		return;
 
-	/* The task we are waitng for is waking up */
+	/* The task we are waiting for is waking up */
 	data = tr->data[wakeup_cpu];
 
 	/* disable local data, not wakeup_cpu data */
@@ -290,6 +353,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
 	smp_wmb();
 
 	tracer_enabled = 1;
+	register_ftrace_function(&trace_ops);
 
 	return;
 fail_deprobe_wake_new:
@@ -305,6 +369,7 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
 	marker_probe_unregister("kernel_sched_schedule",
 				sched_switch_callback,
 				&wakeup_trace);
-- 
cgit v1.2.3


From da89a7a2536c46e76a1a4351a70a8b8417e5fed1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 22 May 2008 00:22:20 -0400
Subject: ftrace: remove printks from irqsoff trace

Printing out new max latencies was fine for the old RT tracer. But for
mainline it is a bit messy. We also need to test if the run queue
is locked before we can do the print. This means that we may not be
printing out latencies if the run queue is locked on another CPU.
This produces inconsistencies in the output.

This patch simply removes the print altogether.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: pq@iki.fi
Cc: proski@gnu.org
Cc: sandmann@redhat.com
Cc: a.p.zijlstra@chello.nl
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace_irqsoff.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 761f3ec66c50..421d6fe3650e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -165,22 +165,6 @@ check_critical_timing(struct trace_array *tr,
 
 	update_max_tr_single(tr, current, cpu);
 
-	if (!runqueue_is_locked()) {
-		if (tracing_thresh) {
-			printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical"
-			       " section violates %lu us threshold.\n",
-			       current->comm, current->pid,
-			       raw_smp_processor_id(),
-			       latency, nsecs_to_usecs(tracing_thresh));
-		} else {
-			printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us"
-			       " maximum-latency critical section.\n",
-			       current->comm, current->pid,
-			       raw_smp_processor_id(),
-			       latency);
-		}
-	}
-
 	max_sequence++;
 
 out_unlock:
-- 
cgit v1.2.3


From 41c52c0db9607e59f90da7da5309489fa06e887f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 22 May 2008 11:46:33 -0400
Subject: ftrace: set_ftrace_notrace feature

While debugging latencies in the RT kernel, I found that it would be nice
to be able to filter away functions from the trace than just to filter
on functions.

I added a new interface to the debugfs tracing directory called

  set_ftrace_notrace

When dynamic frace is enabled, this lets you filter away functions that will
not be recorded in the trace. It is similar to adding 'notrace' to those
functions but by doing it without recompiling the kernel.

Here's how set_ftrace_filter and set_ftrace_notrace interact. Remember, if
set_ftrace_filter is set, it removes all functions from the trace execpt for
those listed in the set_ftrace_filter. set_ftrace_notrace will prevent those
functions from being traced.

If you were to set one function in both set_ftrace_filter and
set_ftrace_notrace and that function was the same, then you would end up
with an empty trace.

the set of functions to trace is:

  set_ftrace_filter == empty then

     all functions not in set_ftrace_notrace

  else

     set of the set_ftrace_filter and not in set of set_ftrace_notrace.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 170 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 130 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 89bd9a6f52ec..2552454609cf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -170,7 +170,7 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
 
 static DEFINE_SPINLOCK(ftrace_shutdown_lock);
 static DEFINE_MUTEX(ftraced_lock);
-static DEFINE_MUTEX(ftrace_filter_lock);
+static DEFINE_MUTEX(ftrace_regex_lock);
 
 struct ftrace_page {
 	struct ftrace_page	*next;
@@ -337,13 +337,12 @@ static void
 __ftrace_replace_code(struct dyn_ftrace *rec,
 		      unsigned char *old, unsigned char *new, int enable)
 {
-	unsigned long ip;
+	unsigned long ip, fl;
 	int failed;
 
 	ip = rec->ip;
 
 	if (ftrace_filtered && enable) {
-		unsigned long fl;
 		/*
 		 * If filtering is on:
 		 *
@@ -356,13 +355,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 		 * If this record is not set to be filtered
 		 * and it is not enabled do nothing.
 		 *
+		 * If this record is set not to trace then
+		 * do nothing.
+		 *
 		 * If this record is not set to be filtered and
 		 * it is enabled, disable it.
 		 */
 		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
 
 		if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
-		    (fl == 0))
+		    (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE))
 			return;
 
 		/*
@@ -380,9 +382,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 		}
 	} else {
 
-		if (enable)
+		if (enable) {
+			/*
+			 * If this record is set not to trace and is
+			 * not enabled, do nothing.
+			 */
+			fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
+			if (fl == FTRACE_FL_NOTRACE)
+				return;
+
 			new = ftrace_call_replace(ip, FTRACE_ADDR);
-		else
+		} else
 			old = ftrace_call_replace(ip, FTRACE_ADDR);
 
 		if (enable) {
@@ -721,6 +731,7 @@ static int __init ftrace_dyn_table_alloc(void)
 enum {
 	FTRACE_ITER_FILTER	= (1 << 0),
 	FTRACE_ITER_CONT	= (1 << 1),
+	FTRACE_ITER_NOTRACE	= (1 << 2),
 };
 
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -754,7 +765,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		rec = &iter->pg->records[iter->idx++];
 		if ((rec->flags & FTRACE_FL_FAILED) ||
 		    ((iter->flags & FTRACE_ITER_FILTER) &&
-		     !(rec->flags & FTRACE_FL_FILTER))) {
+		     !(rec->flags & FTRACE_FL_FILTER)) ||
+		    ((iter->flags & FTRACE_ITER_NOTRACE) &&
+		     !(rec->flags & FTRACE_FL_NOTRACE))) {
 			rec = NULL;
 			goto retry;
 		}
@@ -847,22 +860,24 @@ int ftrace_avail_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static void ftrace_filter_reset(void)
+static void ftrace_filter_reset(int enable)
 {
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
+	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
 	unsigned i;
 
 	/* keep kstop machine from running */
 	preempt_disable();
-	ftrace_filtered = 0;
+	if (enable)
+		ftrace_filtered = 0;
 	pg = ftrace_pages_start;
 	while (pg) {
 		for (i = 0; i < pg->index; i++) {
 			rec = &pg->records[i];
 			if (rec->flags & FTRACE_FL_FAILED)
 				continue;
-			rec->flags &= ~FTRACE_FL_FILTER;
+			rec->flags &= ~type;
 		}
 		pg = pg->next;
 	}
@@ -870,7 +885,7 @@ static void ftrace_filter_reset(void)
 }
 
 static int
-ftrace_filter_open(struct inode *inode, struct file *file)
+ftrace_regex_open(struct inode *inode, struct file *file, int enable)
 {
 	struct ftrace_iterator *iter;
 	int ret = 0;
@@ -882,15 +897,16 @@ ftrace_filter_open(struct inode *inode, struct file *file)
 	if (!iter)
 		return -ENOMEM;
 
-	mutex_lock(&ftrace_filter_lock);
+	mutex_lock(&ftrace_regex_lock);
 	if ((file->f_mode & FMODE_WRITE) &&
 	    !(file->f_flags & O_APPEND))
-		ftrace_filter_reset();
+		ftrace_filter_reset(enable);
 
 	if (file->f_mode & FMODE_READ) {
 		iter->pg = ftrace_pages_start;
 		iter->pos = -1;
-		iter->flags = FTRACE_ITER_FILTER;
+		iter->flags = enable ? FTRACE_ITER_FILTER :
+			FTRACE_ITER_NOTRACE;
 
 		ret = seq_open(file, &show_ftrace_seq_ops);
 		if (!ret) {
@@ -900,13 +916,25 @@ ftrace_filter_open(struct inode *inode, struct file *file)
 			kfree(iter);
 	} else
 		file->private_data = iter;
-	mutex_unlock(&ftrace_filter_lock);
+	mutex_unlock(&ftrace_regex_lock);
 
 	return ret;
 }
 
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(inode, file, 1);
+}
+
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(inode, file, 0);
+}
+
 static ssize_t
-ftrace_filter_read(struct file *file, char __user *ubuf,
+ftrace_regex_read(struct file *file, char __user *ubuf,
 		       size_t cnt, loff_t *ppos)
 {
 	if (file->f_mode & FMODE_READ)
@@ -916,7 +944,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf,
 }
 
 static loff_t
-ftrace_filter_lseek(struct file *file, loff_t offset, int origin)
+ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
 {
 	loff_t ret;
 
@@ -936,13 +964,14 @@ enum {
 };
 
 static void
-ftrace_match(unsigned char *buff, int len)
+ftrace_match(unsigned char *buff, int len, int enable)
 {
 	char str[KSYM_SYMBOL_LEN];
 	char *search = NULL;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	int type = MATCH_FULL;
+	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
 	unsigned i, match = 0, search_len = 0;
 
 	for (i = 0; i < len; i++) {
@@ -966,7 +995,8 @@ ftrace_match(unsigned char *buff, int len)
 
 	/* keep kstop machine from running */
 	preempt_disable();
-	ftrace_filtered = 1;
+	if (enable)
+		ftrace_filtered = 1;
 	pg = ftrace_pages_start;
 	while (pg) {
 		for (i = 0; i < pg->index; i++) {
@@ -997,7 +1027,7 @@ ftrace_match(unsigned char *buff, int len)
 				break;
 			}
 			if (matched)
-				rec->flags |= FTRACE_FL_FILTER;
+				rec->flags |= flag;
 		}
 		pg = pg->next;
 	}
@@ -1005,8 +1035,8 @@ ftrace_match(unsigned char *buff, int len)
 }
 
 static ssize_t
-ftrace_filter_write(struct file *file, const char __user *ubuf,
-		    size_t cnt, loff_t *ppos)
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos, int enable)
 {
 	struct ftrace_iterator *iter;
 	char ch;
@@ -1016,7 +1046,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
 	if (!cnt || cnt < 0)
 		return 0;
 
-	mutex_lock(&ftrace_filter_lock);
+	mutex_lock(&ftrace_regex_lock);
 
 	if (file->f_mode & FMODE_READ) {
 		struct seq_file *m = file->private_data;
@@ -1045,7 +1075,6 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
 			cnt--;
 		}
 
-
 		if (isspace(ch)) {
 			file->f_pos += read;
 			ret = read;
@@ -1072,7 +1101,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
 	if (isspace(ch)) {
 		iter->filtered++;
 		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match(iter->buffer, iter->buffer_idx);
+		ftrace_match(iter->buffer, iter->buffer_idx, enable);
 		iter->buffer_idx = 0;
 	} else
 		iter->flags |= FTRACE_ITER_CONT;
@@ -1082,11 +1111,39 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
 
 	ret = read;
  out:
-	mutex_unlock(&ftrace_filter_lock);
+	mutex_unlock(&ftrace_regex_lock);
 
 	return ret;
 }
 
+static ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+		    size_t cnt, loff_t *ppos)
+{
+	return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
+}
+
+static ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
+}
+
+static void
+ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+{
+	if (unlikely(ftrace_disabled))
+		return;
+
+	mutex_lock(&ftrace_regex_lock);
+	if (reset)
+		ftrace_filter_reset(enable);
+	if (buf)
+		ftrace_match(buf, len, enable);
+	mutex_unlock(&ftrace_regex_lock);
+}
+
 /**
  * ftrace_set_filter - set a function to filter on in ftrace
  * @buf - the string that holds the function filter text.
@@ -1098,24 +1155,31 @@ ftrace_filter_write(struct file *file, const char __user *ubuf,
  */
 void ftrace_set_filter(unsigned char *buf, int len, int reset)
 {
-	if (unlikely(ftrace_disabled))
-		return;
+	ftrace_set_regex(buf, len, reset, 1);
+}
 
-	mutex_lock(&ftrace_filter_lock);
-	if (reset)
-		ftrace_filter_reset();
-	if (buf)
-		ftrace_match(buf, len);
-	mutex_unlock(&ftrace_filter_lock);
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace Filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and reset is set, all functions will be enabled
+ * for tracing.
+ */
+void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+{
+	ftrace_set_regex(buf, len, reset, 0);
 }
 
 static int
-ftrace_filter_release(struct inode *inode, struct file *file)
+ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
 	struct ftrace_iterator *iter;
 
-	mutex_lock(&ftrace_filter_lock);
+	mutex_lock(&ftrace_regex_lock);
 	if (file->f_mode & FMODE_READ) {
 		iter = m->private;
 
@@ -1126,7 +1190,7 @@ ftrace_filter_release(struct inode *inode, struct file *file)
 	if (iter->buffer_idx) {
 		iter->filtered++;
 		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match(iter->buffer, iter->buffer_idx);
+		ftrace_match(iter->buffer, iter->buffer_idx, enable);
 	}
 
 	mutex_lock(&ftrace_sysctl_lock);
@@ -1137,10 +1201,22 @@ ftrace_filter_release(struct inode *inode, struct file *file)
 	mutex_unlock(&ftrace_sysctl_lock);
 
 	kfree(iter);
-	mutex_unlock(&ftrace_filter_lock);
+	mutex_unlock(&ftrace_regex_lock);
 	return 0;
 }
 
+static int
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_release(inode, file, 1);
+}
+
+static int
+ftrace_notrace_release(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_release(inode, file, 0);
+}
+
 static struct file_operations ftrace_avail_fops = {
 	.open = ftrace_avail_open,
 	.read = seq_read,
@@ -1150,12 +1226,20 @@ static struct file_operations ftrace_avail_fops = {
 
 static struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
-	.read = ftrace_filter_read,
+	.read = ftrace_regex_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_filter_lseek,
+	.llseek = ftrace_regex_lseek,
 	.release = ftrace_filter_release,
 };
 
+static struct file_operations ftrace_notrace_fops = {
+	.open = ftrace_notrace_open,
+	.read = ftrace_regex_read,
+	.write = ftrace_notrace_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_notrace_release,
+};
+
 /**
  * ftrace_force_update - force an update to all recording ftrace functions
  *
@@ -1239,6 +1323,12 @@ static __init int ftrace_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'set_ftrace_filter' entry\n");
+
+	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+				    NULL, &ftrace_notrace_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_ftrace_notrace' entry\n");
 	return 0;
 }
 
-- 
cgit v1.2.3


From 41bc8144d02028133bcd1d545023c6f49e8b2411 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 22 May 2008 11:49:22 -0400
Subject: ftrace: fix up cmdline recording

The new work with converting the trace hooks over to markers broke the
command line recording of ftrace. This patch fixes it again.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/trace.c              |  3 ---
 kernel/trace/trace.h              |  4 ++--
 kernel/trace/trace_functions.c    |  4 ++--
 kernel/trace/trace_sched_switch.c | 21 +++++++++++++++------
 4 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 583fe24903d3..0feae23d9893 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -652,9 +652,6 @@ static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
 static int cmdline_idx;
 static DEFINE_SPINLOCK(trace_cmdline_lock);
 
-/* trace in all context switches */
-atomic_t trace_record_cmdline_enabled __read_mostly;
-
 /* temporary disable recording */
 atomic_t trace_record_cmdline_disabled __read_mostly;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c460e85e94ed..6b8bd8800d04 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -218,6 +218,8 @@ void trace_function(struct trace_array *tr,
 
 void tracing_start_function_trace(void);
 void tracing_stop_function_trace(void);
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
@@ -226,8 +228,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 extern unsigned long tracing_max_latency;
 extern unsigned long tracing_thresh;
 
-extern atomic_t trace_record_cmdline_enabled;
-
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0a084656d7cf..7ee7dcd76b7d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -29,14 +29,14 @@ static void function_reset(struct trace_array *tr)
 static void start_function_trace(struct trace_array *tr)
 {
 	function_reset(tr);
-	atomic_inc(&trace_record_cmdline_enabled);
+	tracing_start_cmdline_record();
 	tracing_start_function_trace();
 }
 
 static void stop_function_trace(struct trace_array *tr)
 {
 	tracing_stop_function_trace();
-	atomic_dec(&trace_record_cmdline_enabled);
+	tracing_stop_cmdline_record();
 }
 
 static void function_trace_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 798ec0dc863c..c16935d3bc5c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -29,6 +29,9 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev,
 	long disabled;
 	int cpu;
 
+	tracing_record_cmdline(prev);
+	tracing_record_cmdline(next);
+
 	if (!tracer_enabled)
 		return;
 
@@ -63,8 +66,6 @@ sched_switch_callback(void *probe_data, void *call_data,
 	prev = va_arg(*args, typeof(prev));
 	next = va_arg(*args, typeof(next));
 
-	tracing_record_cmdline(prev);
-
 	/*
 	 * If tracer_switch_func only points to the local
 	 * switch func, it still needs the ptr passed to it.
@@ -213,18 +214,26 @@ void tracing_stop_sched_switch(void)
 		tracing_sched_unregister();
 }
 
+void tracing_start_cmdline_record(void)
+{
+	tracing_start_sched_switch();
+}
+
+void tracing_stop_cmdline_record(void)
+{
+	tracing_stop_sched_switch();
+}
+
 static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
-	atomic_inc(&trace_record_cmdline_enabled);
 	tracer_enabled = 1;
-	tracing_start_sched_switch();
+	tracing_start_cmdline_record();
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
-	tracing_stop_sched_switch();
-	atomic_dec(&trace_record_cmdline_enabled);
+	tracing_stop_cmdline_record();
 	tracer_enabled = 0;
 }
 
-- 
cgit v1.2.3


From ffdaa3582b6b39d625d585d07e329ffdc925e971 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sat, 24 May 2008 23:45:02 +0530
Subject: ftrace: safe traversal of ftrace_hash hlist

Hi Steven,

I noticed that concurrent instances of ftrace_record_ip()
have a race between ftrace_hash list traversal during
ftrace_ip_in_hash() (before acquiring ftrace_shutdown_lock)
and ftrace_add_hash(). If it's so then this should fix it.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Cc: rostedt@goodmis.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2552454609cf..9b7c54f8a62f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -201,7 +201,7 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key)
 	struct hlist_node *t;
 	int found = 0;
 
-	hlist_for_each_entry(p, t, &ftrace_hash[key], node) {
+	hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
 		if (p->ip == ip) {
 			found = 1;
 			break;
@@ -214,7 +214,7 @@ ftrace_ip_in_hash(unsigned long ip, unsigned long key)
 static inline void
 ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
 {
-	hlist_add_head(&node->node, &ftrace_hash[key]);
+	hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
 }
 
 static void ftrace_free_rec(struct dyn_ftrace *rec)
-- 
cgit v1.2.3


From 492a7ea5bcf263ee02a9eb6a3ab0222a1946fade Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sun, 25 May 2008 00:10:04 +0530
Subject: ftrace: fix updating of ftrace_update_cnt

Hi Ingo/Steven,

Ftrace currently maintains an update count which includes false updates,
i.e, updates which failed. If anything, such failures should be tracked
by some separate variable, but this patch provides a minimal fix.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Cc: rostedt@goodmis.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/trace/ftrace.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9b7c54f8a62f..1843edc098a6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -453,7 +453,7 @@ static void ftrace_shutdown_replenish(void)
 	ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
 }
 
-static void
+static int
 ftrace_code_disable(struct dyn_ftrace *rec)
 {
 	unsigned long ip;
@@ -469,7 +469,9 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 	if (failed) {
 		rec->flags |= FTRACE_FL_FAILED;
 		ftrace_free_rec(rec);
+		return 0;
 	}
+	return 1;
 }
 
 static int __ftrace_modify_code(void *data)
@@ -617,8 +619,8 @@ static int __ftrace_update_code(void *ignore)
 
 		/* all CPUS are stopped, we are safe to modify code */
 		hlist_for_each_entry(p, t, &head, node) {
-			ftrace_code_disable(p);
-			ftrace_update_cnt++;
+			if (ftrace_code_disable(p))
+				ftrace_update_cnt++;
 		}
 
 	}
-- 
cgit v1.2.3


From d031476408ae0f5196e3c579f519dfdefb099b67 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 23 May 2008 14:41:19 +0100
Subject: hrtimer: remove warning in hres_timers_resume

hres_timers_resume() warns if there appears to be more than one cpu
online.  This warning makes sense when the suspend/resume mechanism
offlines all cpus but one during the suspend/resume process.

However, Xen suspend does not need to offline the other cpus; it
merely keeps them tied up in stop_machine() while the virtual machine
is suspended.  The warning hres_timers_resume issues is therefore
spurious.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: xen-devel <xen-devel@lists.xensource.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..493c4b8b8cba 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -632,8 +632,6 @@ void clock_was_set(void)
  */
 void hres_timers_resume(void)
 {
-	WARN_ON_ONCE(num_online_cpus() > 1);
-
 	/* Retrigger the CPU local events: */
 	retrigger_next_event(NULL);
 }
-- 
cgit v1.2.3


From 900cfa46191a7d87cf1891924cb90499287fd235 Mon Sep 17 00:00:00 2001
From: "Carlos R. Mafra" <crmafra2@gmail.com>
Date: Thu, 22 May 2008 19:25:11 -0300
Subject: hrtimer: Remove unused variables in ktime_divns()

The variables dns and inc are not used, remove them.

Signed-off-by: Carlos R. Mafra <crmafra@gmail.com>
Cc: tglx@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 493c4b8b8cba..635739d219da 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -300,11 +300,10 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns);
  */
 u64 ktime_divns(const ktime_t kt, s64 div)
 {
-	u64 dclc, inc, dns;
+	u64 dclc;
 	int sft = 0;
 
-	dclc = dns = ktime_to_ns(kt);
-	inc = div;
+	dclc = ktime_to_ns(kt);
 	/* Make sure the divisor is less than 2^32: */
 	while (div >> 32) {
 		sft++;
-- 
cgit v1.2.3


From 9e124fe16ff24746d6de5a2ad685266d7bce0e08 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Mon, 26 May 2008 23:31:07 +0100
Subject: xen: Enable console tty by default in domU if it's not a dummy

Without console= arguments on the kernel command line, the first
console to register becomes enabled and the preferred console (the one
behind /dev/console).  This is normally tty (assuming
CONFIG_VT_CONSOLE is enabled, which it commonly is).

This is okay as long tty is a useful console.  But unless we have the
PV framebuffer, and it is enabled for this domain, tty0 in domU is
merely a dummy.  In that case, we want the preferred console to be the
Xen console hvc0, and we want it without having to fiddle with the
kernel command line.  Commit b8c2d3dfbc117dff26058fbac316b8acfc2cb5f7
did that for us.

Since we now have the PV framebuffer, we want to enable and prefer tty
again, but only when PVFB is enabled.  But even then we still want to
enable the Xen console as well.

Problem: when tty registers, we can't yet know whether the PVFB is
enabled.  By the time we can know (xenstore is up), the console setup
game is over.

Solution: enable console tty by default, but keep hvc as the preferred
console.  Change the preferred console to tty when PVFB probes
successfully, unless we've been given console kernel parameters.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/printk.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..028ed75d4864 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -121,6 +121,8 @@ struct console_cmdline
 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
 static int selected_console = -1;
 static int preferred_console = -1;
+int console_set_on_cmdline;
+EXPORT_SYMBOL(console_set_on_cmdline);
 
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
@@ -890,6 +892,7 @@ static int __init console_setup(char *str)
 	*s = 0;
 
 	__add_preferred_console(buf, idx, options, brl_options);
+	console_set_on_cmdline = 1;
 	return 1;
 }
 __setup("console=", console_setup);
-- 
cgit v1.2.3


From 02ff375590ac4140d88afc76505df1ad45c6af59 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 15:43:53 +0200
Subject: softlockup: fix false positives on nohz if CPU is 100% idle for more
 than 60 seconds

Fix (probably theoretical only) rq->clock update bug:
in tick_nohz_update_jiffies() [which is called on all irq
entry on all cpus where the irq entry hits an idle cpu] we
call touch_softlockup_watchdog() before we update jiffies.
That works fine most of the time when idle timeouts are within
60 seconds. But when an idle timeout is beyond 60 seconds,
jiffies is updated with a jump of more than 60 seconds,
which causes a jump in cpu-clock of more than 60 seconds,
triggering a false positive.

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..28abad66fc8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -133,8 +133,6 @@ void tick_nohz_update_jiffies(void)
 	if (!ts->tick_stopped)
 		return;
 
-	touch_softlockup_watchdog();
-
 	cpu_clear(cpu, nohz_cpu_mask);
 	now = ktime_get();
 	ts->idle_waketime = now;
@@ -142,6 +140,8 @@ void tick_nohz_update_jiffies(void)
 	local_irq_save(flags);
 	tick_do_update_jiffies64(now);
 	local_irq_restore(flags);
+
+	touch_softlockup_watchdog();
 }
 
 void tick_nohz_stop_idle(int cpu)
-- 
cgit v1.2.3


From 7a14ce1d8c1d3a6118d406e64eaf9aa70375e085 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 15:43:53 +0200
Subject: nohz: reduce jiffies polling overhead

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..cb75394ed00e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -48,6 +48,13 @@ static void tick_do_update_jiffies64(ktime_t now)
 	unsigned long ticks = 0;
 	ktime_t delta;
 
+	/*
+	 * Do a quick check without holding xtime_lock:
+	 */
+	delta = ktime_sub(now, last_jiffies_update);
+	if (delta.tv64 < tick_period.tv64)
+		return;
+
 	/* Reevalute with xtime_lock held */
 	write_seqlock(&xtime_lock);
 
-- 
cgit v1.2.3


From 8c2238eaaf0f774ca0f8d9daad7a616429bbb7f1 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Tue, 27 May 2008 12:23:29 -0500
Subject: softlockup: fix NMI hangs due to lock race - 2.6.26-rc regression

The touch_nmi_watchdog() routine on x86 ultimately calls
touch_softlockup_watchdog().  The problem is that to touch the
softlockup watchdog, the cpu_clock code has to be called which could
involve multiple cpu locks and can lead to a hard hang if one of the
locks is held by a processor that is not going to return anytime soon
(such as could be the case with kgdb or perhaps even with some other
kind of exception).

This patch causes the public version of the
touch_softlockup_watchdog() to defer the cpu clock access to a later
point.

The test case for this problem is to use the following kernel config
options:

CONFIG_KGDB_TESTS=y
CONFIG_KGDB_TESTS_ON_BOOT=y
CONFIG_KGDB_TESTS_BOOT_STRING="V1F100I100000"

It should be noted that kgdb test suite and these options were not
available until 2.6.26-rc2, so it was necessary to patch the kgdb
test suite during the bisection.

I would consider this patch a regression fix because the problem first
appeared in commit 27ec4407790d075c325e1f4da0a19c56953cce23 when some
logic was added to try to periodically sync the clocks.  It was
possible to work around this particular problem by simply not
performing the sync anytime the system was in a critical context.
This was ok until commit 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd,
which added config option CONFIG_HAVE_UNSTABLE_SCHED_CLOCK and some
multi-cpu locks to sync the clocks.  It became clear that accessing
this code from an nmi was the source of the lockups.  Avoiding the
access to the low level clock code from an code inside the NMI
processing also fixed the problem with the 27ec44... commit.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index a3a0b239b7f7..6b682d86bddf 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -64,12 +64,17 @@ static unsigned long get_timestamp(int this_cpu)
 	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
-void touch_softlockup_watchdog(void)
+static void __touch_softlockup_watchdog(void)
 {
 	int this_cpu = raw_smp_processor_id();
 
 	__raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
 }
+
+void touch_softlockup_watchdog(void)
+{
+	__raw_get_cpu_var(touch_timestamp) = 0;
+}
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
 void touch_all_softlockup_watchdogs(void)
@@ -103,7 +108,7 @@ void softlockup_tick(void)
 	}
 
 	if (touch_timestamp == 0) {
-		touch_softlockup_watchdog();
+		__touch_softlockup_watchdog();
 		return;
 	}
 
@@ -118,7 +123,7 @@ void softlockup_tick(void)
 
 	/* do not print during early bootup: */
 	if (unlikely(system_state != SYSTEM_RUNNING)) {
-		touch_softlockup_watchdog();
+		__touch_softlockup_watchdog();
 		return;
 	}
 
@@ -243,7 +248,7 @@ static int watchdog(void *__bind_cpu)
 	sched_setscheduler(current, SCHED_FIFO, &param);
 
 	/* initialize timestamp */
-	touch_softlockup_watchdog();
+	__touch_softlockup_watchdog();
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	/*
@@ -252,7 +257,7 @@ static int watchdog(void *__bind_cpu)
 	 * debug-printout triggers in softlockup_tick().
 	 */
 	while (!kthread_should_stop()) {
-		touch_softlockup_watchdog();
+		__touch_softlockup_watchdog();
 		schedule();
 
 		if (kthread_should_stop())
-- 
cgit v1.2.3


From 76094a2cf46e4ab776055d4086615b884408568c Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Wed, 28 May 2008 00:03:18 +0530
Subject: ftrace: distinguish kretprobe'd functions in trace logs

Tracing functions via ftrace which have a kretprobe installed on them, can produce misleading output in their trace logs. E.g, consider the correct trace of the following sequence:

do_IRQ()
{
~
  irq_enter();
~
}

Trace log (sample):
<idle>-0     [00] 4154504455.781616: irq_enter <- do_IRQ

But if irq_enter() has a kretprobe installed on it, the return value stored on the stack at each invocation is modified to divert the return to a kprobe trampoline function called kretprobe_trampoline(). So with this the trace would (currently) look like:

<idle>-0     [00] 4154504455.781616: irq_enter <- kretprobe_trampoline

Now this is quite misleading to the end user, as it suggests something that didn't actually happen. So just to avoid such misinterpretations, the inlined patch aims to output such a log as:

<idle>-0     [00] 4154504455.781616: irq_enter <- [unknown/kretprobe'd]

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0feae23d9893..12f5e817380e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -27,6 +27,7 @@
 #include <linux/poll.h>
 #include <linux/gfp.h>
 #include <linux/fs.h>
+#include <linux/kprobes.h>
 #include <linux/writeback.h>
 
 #include <linux/stacktrace.h>
@@ -1199,6 +1200,20 @@ static void s_stop(struct seq_file *m, void *p)
 	mutex_unlock(&trace_types_lock);
 }
 
+#define KRETPROBE_MSG "[unknown/kretprobe'd]"
+
+#ifdef CONFIG_KRETPROBES
+static inline int kretprobed(unsigned long addr)
+{
+	return addr == (unsigned long)kretprobe_trampoline;
+}
+#else
+static inline int kretprobed(unsigned long addr)
+{
+	return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
 static int
 seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
 {
@@ -1434,7 +1449,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	case TRACE_FN:
 		seq_print_ip_sym(s, entry->fn.ip, sym_flags);
 		trace_seq_puts(s, " (");
-		seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+		if (kretprobed(entry->fn.parent_ip))
+			trace_seq_puts(s, KRETPROBE_MSG);
+		else
+			seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
 		trace_seq_puts(s, ")\n");
 		break;
 	case TRACE_CTX:
@@ -1514,8 +1532,11 @@ static int print_trace_fmt(struct trace_iterator *iter)
 			ret = trace_seq_printf(s, " <-");
 			if (!ret)
 				return 0;
-			ret = seq_print_ip_sym(s, entry->fn.parent_ip,
-					       sym_flags);
+			if (kretprobed(entry->fn.parent_ip))
+				ret = trace_seq_puts(s, KRETPROBE_MSG);
+			else
+				ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+						       sym_flags);
 			if (!ret)
 				return 0;
 		}
-- 
cgit v1.2.3


From ad90c0e3ce8d20d6873b57e36181ef6d7a0097fe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 27 May 2008 20:48:37 -0400
Subject: ftrace: user update and disable dynamic ftrace daemon

In dynamic ftrace, the mcount function starts off pointing to a stub
function that just returns.

On start up, the call to the stub is modified to point to a "record_ip"
function. The job of the record_ip function is to add the function to
a pre-allocated hash list. If the function is already there, it simply is
ignored, otherwise it is added to the list.

Later, a ftraced daemon wakes up and calls kstop_machine if any functions
have been recorded, and changes the calls to the recorded functions to
a simple nop.  If no functions were recorded, the daemon goes back to sleep.

The daemon wakes up once a second to see if it needs to update any newly
recorded functions into nops.  Usually it does not, but if a lot of code
has been executed for the first time in the kernel, the ftraced daemon
will call kstop_machine to update those into nops.

The problem currently is that there's no way to stop the daemon from doing
this, and it can cause unneeded latencies (800us which for some is bothersome).

This patch adds a new file /debugfs/tracing/ftraced_enabled. If the daemon
is active, reading this will return "enabled\n" and "disabled\n" when the
daemon is not running. To disable the daemon, the user can echo "0" or
"disable" into this file, and "1" or "enable" to re-enable the daemon.

Since the daemon is used to convert the functions into nops to increase
the performance of the system, I also added that anytime something is
written into the ftraced_enabled file, kstop_machine will run if there
are new functions that have been detected that need to be converted.

This way the user can disable the daemon but still be able to control the
conversion of the mcount calls to nops by simply,

  "echo 0 > /debugfs/tracing/ftraced_enabled"

when they need to do more conversions.

To see the number of converted functions:

  "cat /debugfs/tracing/dyn_ftrace_total_info"

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 157 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 110 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1843edc098a6..f762f5a2d331 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -151,8 +151,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 static struct task_struct *ftraced_task;
-static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters);
-static unsigned long ftraced_iteration_counter;
 
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
@@ -189,6 +187,7 @@ static struct ftrace_page	*ftrace_pages;
 
 static int ftraced_trigger;
 static int ftraced_suspend;
+static int ftraced_stop;
 
 static int ftrace_record_suspend;
 
@@ -474,14 +473,21 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 	return 1;
 }
 
+static int __ftrace_update_code(void *ignore);
+
 static int __ftrace_modify_code(void *data)
 {
 	unsigned long addr;
 	int *command = data;
 
-	if (*command & FTRACE_ENABLE_CALLS)
+	if (*command & FTRACE_ENABLE_CALLS) {
+		/*
+		 * Update any recorded ips now that we have the
+		 * machine stopped
+		 */
+		__ftrace_update_code(NULL);
 		ftrace_replace_code(1);
-	else if (*command & FTRACE_DISABLE_CALLS)
+	} else if (*command & FTRACE_DISABLE_CALLS)
 		ftrace_replace_code(0);
 
 	if (*command & FTRACE_UPDATE_TRACE_FUNC)
@@ -503,6 +509,25 @@ static void ftrace_run_update_code(int command)
 	stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
 }
 
+void ftrace_disable_daemon(void)
+{
+	/* Stop the daemon from calling kstop_machine */
+	mutex_lock(&ftraced_lock);
+	ftraced_stop = 1;
+	mutex_unlock(&ftraced_lock);
+
+	ftrace_force_update();
+}
+
+void ftrace_enable_daemon(void)
+{
+	mutex_lock(&ftraced_lock);
+	ftraced_stop = 0;
+	mutex_unlock(&ftraced_lock);
+
+	ftrace_force_update();
+}
+
 static ftrace_func_t saved_ftrace_func;
 
 static void ftrace_startup(void)
@@ -603,6 +628,7 @@ static int __ftrace_update_code(void *ignore)
 	int i;
 
 	/* Don't be recording funcs now */
+	ftrace_record_suspend++;
 	save_ftrace_enabled = ftrace_enabled;
 	ftrace_enabled = 0;
 
@@ -628,18 +654,23 @@ static int __ftrace_update_code(void *ignore)
 	stop = ftrace_now(raw_smp_processor_id());
 	ftrace_update_time = stop - start;
 	ftrace_update_tot_cnt += ftrace_update_cnt;
+	ftraced_trigger = 0;
 
 	ftrace_enabled = save_ftrace_enabled;
+	ftrace_record_suspend--;
 
 	return 0;
 }
 
-static void ftrace_update_code(void)
+static int ftrace_update_code(void)
 {
-	if (unlikely(ftrace_disabled))
-		return;
+	if (unlikely(ftrace_disabled) ||
+	    !ftrace_enabled || !ftraced_trigger)
+		return 0;
 
 	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+
+	return 1;
 }
 
 static int ftraced(void *ignore)
@@ -658,14 +689,13 @@ static int ftraced(void *ignore)
 
 		mutex_lock(&ftrace_sysctl_lock);
 		mutex_lock(&ftraced_lock);
-		if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) {
-			ftrace_record_suspend++;
-			ftrace_update_code();
+		if (!ftraced_suspend && !ftraced_stop &&
+		    ftrace_update_code()) {
 			usecs = nsecs_to_usecs(ftrace_update_time);
 			if (ftrace_update_tot_cnt > 100000) {
 				ftrace_update_tot_cnt = 0;
 				pr_info("hm, dftrace overflow: %lu change%s"
-					 " (%lu total) in %lu usec%s\n",
+					" (%lu total) in %lu usec%s\n",
 					ftrace_update_cnt,
 					ftrace_update_cnt != 1 ? "s" : "",
 					ftrace_update_tot_cnt,
@@ -673,15 +703,10 @@ static int ftraced(void *ignore)
 				ftrace_disabled = 1;
 				WARN_ON_ONCE(1);
 			}
-			ftraced_trigger = 0;
-			ftrace_record_suspend--;
 		}
-		ftraced_iteration_counter++;
 		mutex_unlock(&ftraced_lock);
 		mutex_unlock(&ftrace_sysctl_lock);
 
-		wake_up_interruptible(&ftraced_waiters);
-
 		ftrace_shutdown_replenish();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -1219,6 +1244,55 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
 	return ftrace_regex_release(inode, file, 0);
 }
 
+static ssize_t
+ftraced_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	/* don't worry about races */
+	char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
+	int r = strlen(buf);
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+ftraced_write(struct file *filp, const char __user *ubuf,
+		      size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	if (strncmp(buf, "enable", 6) == 0)
+		val = 1;
+	else if (strncmp(buf, "disable", 7) == 0)
+		val = 0;
+	else {
+		buf[cnt] = 0;
+
+		ret = strict_strtoul(buf, 10, &val);
+		if (ret < 0)
+			return ret;
+
+		val = !!val;
+	}
+
+	if (val)
+		ftrace_enable_daemon();
+	else
+		ftrace_disable_daemon();
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
 static struct file_operations ftrace_avail_fops = {
 	.open = ftrace_avail_open,
 	.read = seq_read,
@@ -1242,51 +1316,34 @@ static struct file_operations ftrace_notrace_fops = {
 	.release = ftrace_notrace_release,
 };
 
+static struct file_operations ftraced_fops = {
+	.open = tracing_open_generic,
+	.read = ftraced_read,
+	.write = ftraced_write,
+};
+
 /**
  * ftrace_force_update - force an update to all recording ftrace functions
- *
- * The ftrace dynamic update daemon only wakes up once a second.
- * There may be cases where an update needs to be done immediately
- * for tests or internal kernel tracing to begin. This function
- * wakes the daemon to do an update and will not return until the
- * update is complete.
  */
 int ftrace_force_update(void)
 {
-	unsigned long last_counter;
-	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
 
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
+	mutex_lock(&ftrace_sysctl_lock);
 	mutex_lock(&ftraced_lock);
-	last_counter = ftraced_iteration_counter;
-
-	set_current_state(TASK_INTERRUPTIBLE);
-	add_wait_queue(&ftraced_waiters, &wait);
 
-	if (unlikely(!ftraced_task)) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	do {
-		mutex_unlock(&ftraced_lock);
-		wake_up_process(ftraced_task);
-		schedule();
-		mutex_lock(&ftraced_lock);
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-		set_current_state(TASK_INTERRUPTIBLE);
-	} while (last_counter == ftraced_iteration_counter);
+	/*
+	 * If ftraced_trigger is not set, then there is nothing
+	 * to update.
+	 */
+	if (ftraced_trigger && !ftrace_update_code())
+		ret = -EBUSY;
 
- out:
 	mutex_unlock(&ftraced_lock);
-	remove_wait_queue(&ftraced_waiters, &wait);
-	set_current_state(TASK_RUNNING);
+	mutex_unlock(&ftrace_sysctl_lock);
 
 	return ret;
 }
@@ -1331,6 +1388,12 @@ static __init int ftrace_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'set_ftrace_notrace' entry\n");
+
+	entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
+				    NULL, &ftraced_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'ftraced_enabled' entry\n");
 	return 0;
 }
 
-- 
cgit v1.2.3


From 18404756765c713a0be4eb1082920c04822ce588 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Thu, 29 May 2008 11:02:52 -0700
Subject: genirq: Expose default irq affinity mask (take 3)

Current IRQ affinity interface does not provide a way to set affinity
for the IRQs that will be allocated/activated in the future.
This patch creates /proc/irq/default_smp_affinity that lets users set
default affinity mask for the newly allocated IRQs. Changing the default
does not affect affinity masks for the currently active IRQs, they
have to be changed explicitly.

Updated based on Paul J's comments and added some more documentation.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Cc: pj@sgi.com
Cc: a.p.zijlstra@chello.nl
Cc: tglx@linutronix.de
Cc: rdunlap@xenotime.net
Cc: mingo@elte.hu
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 28 +++++++++++++++++++++++--
 kernel/irq/proc.c   | 59 +++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46d6611a33bb..469814e9b9ee 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,8 @@
 
 #ifdef CONFIG_SMP
 
+cpumask_t irq_default_affinity = CPU_MASK_ALL;
+
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *	@irq: interrupt number to wait for
@@ -95,6 +97,27 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 	return 0;
 }
 
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+/*
+ * Generic version of the affinity autoselector.
+ */
+int irq_select_affinity(unsigned int irq)
+{
+	cpumask_t mask;
+
+	if (!irq_can_set_affinity(irq))
+		return 0;
+
+	cpus_and(mask, cpu_online_map, irq_default_affinity);
+
+	irq_desc[irq].affinity = mask;
+	irq_desc[irq].chip->set_affinity(irq, mask);
+
+	set_balance_irq_affinity(irq, mask);
+	return 0;
+}
+#endif
+
 #endif
 
 /**
@@ -382,6 +405,9 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		} else
 			/* Undo nested disables: */
 			desc->depth = 1;
+
+		/* Set default affinity mask once everything is setup */
+		irq_select_affinity(irq);
 	}
 	/* Reset broken irq detection when installing new handler */
 	desc->irq_count = 0;
@@ -571,8 +597,6 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	action->next = NULL;
 	action->dev_id = dev_id;
 
-	select_smp_affinity(irq);
-
 #ifdef CONFIG_DEBUG_SHIRQ
 	if (irqflags & IRQF_SHARED) {
 		/*
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c2f2ccb0549a..6c6d35d68ee9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -44,7 +44,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 				   unsigned long count, void *data)
 {
 	unsigned int irq = (int)(long)data, full_count = count, err;
-	cpumask_t new_value, tmp;
+	cpumask_t new_value;
 
 	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
 	    irq_balancing_disabled(irq))
@@ -62,17 +62,51 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	 * way to make the system unusable accidentally :-) At least
 	 * one online CPU still has to be targeted.
 	 */
-	cpus_and(tmp, new_value, cpu_online_map);
-	if (cpus_empty(tmp))
+	if (!cpus_intersects(new_value, cpu_online_map))
 		/* Special case for empty set - allow the architecture
 		   code to set default SMP affinity. */
-		return select_smp_affinity(irq) ? -EINVAL : full_count;
+		return irq_select_affinity(irq) ? -EINVAL : full_count;
 
 	irq_set_affinity(irq, new_value);
 
 	return full_count;
 }
 
+static int default_affinity_read(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	int len = cpumask_scnprintf(page, count, irq_default_affinity);
+	if (count - len < 2)
+		return -EINVAL;
+	len += sprintf(page + len, "\n");
+	return len;
+}
+
+static int default_affinity_write(struct file *file, const char __user *buffer,
+				   unsigned long count, void *data)
+{
+	unsigned int full_count = count, err;
+	cpumask_t new_value;
+
+	err = cpumask_parse_user(buffer, count, new_value);
+	if (err)
+		return err;
+
+	if (!is_affinity_mask_valid(new_value))
+		return -EINVAL;
+
+	/*
+	 * Do not allow disabling IRQs completely - it's a too easy
+	 * way to make the system unusable accidentally :-) At least
+	 * one online CPU still has to be targeted.
+	 */
+	if (!cpus_intersects(new_value, cpu_online_map))
+		return -EINVAL;
+
+	irq_default_affinity = new_value;
+
+	return full_count;
+}
 #endif
 
 static int irq_spurious_read(char *page, char **start, off_t off,
@@ -171,6 +205,21 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 		remove_proc_entry(action->dir->name, irq_desc[irq].dir);
 }
 
+void register_default_affinity_proc(void)
+{
+#ifdef CONFIG_SMP
+	struct proc_dir_entry *entry;
+
+	/* create /proc/irq/default_smp_affinity */
+	entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
+	if (entry) {
+		entry->data = NULL;
+		entry->read_proc  = default_affinity_read;
+		entry->write_proc = default_affinity_write;
+	}
+#endif
+}
+
 void init_irq_proc(void)
 {
 	int i;
@@ -180,6 +229,8 @@ void init_irq_proc(void)
 	if (!root_irq_dir)
 		return;
 
+	register_default_affinity_proc();
+
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-- 
cgit v1.2.3


From 45c01e824991b2dd0a332e19efc4901acb31209f Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: sched: prioritize non-migratable tasks over migratable ones

Dmitry Adamushko pointed out a known flaw in the rt-balancing algorithm
that could allow suboptimal balancing if a non-migratable task gets
queued behind a running migratable one.  It is discussed in this thread:

http://lkml.org/lkml/2008/4/22/296

This issue has been further exacerbated by a recent checkin to
sched-devel (git-id 5eee63a5ebc19a870ac40055c0be49457f3a89a3).

>From a pure priority standpoint, the run-queue is doing the "right"
thing. Using Dmitry's nomenclature, if T0 is on cpu1 first, and T1
wakes up at equal or lower priority (affined only to cpu1) later, it
*should* wait for T0 to finish.  However, in reality that is likely
suboptimal from a system perspective if there are other cores that
could allow T0 and T1 to run concurrently.  Since T1 can not migrate,
the only choice for higher concurrency is to try to move T0.  This is
not something we addessed in the recent rt-balancing re-work.

This patch tries to enhance the balancing algorithm by accomodating this
scenario.  It accomplishes this by incorporating the migratability of a
task into its priority calculation.  Within a numerical tsk->prio, a
non-migratable task is logically higher than a migratable one.  We
maintain this by introducing a new per-priority queue (xqueue, or
exclusive-queue) for holding non-migratable tasks.  The scheduler will
draw from the xqueue over the standard shared-queue (squeue) when
available.

There are several details for utilizing this properly.

1) During task-wake-up, we not only need to check if the priority
   preempts the current task, but we also need to check for this
   non-migratable condition.  Therefore, if a non-migratable task wakes
   up and sees an equal priority migratable task already running, it
   will attempt to preempt it *if* there is a likelyhood that the
   current task will find an immediate home.

2) Tasks only get this non-migratable "priority boost" on wake-up.  Any
   requeuing will result in the non-migratable task being queued to the
   end of the shared queue.  This is an attempt to prevent the system
   from being completely unfair to migratable tasks during things like
   SCHED_RR timeslicing.

I am sure this patch introduces potentially "odd" behavior if you
concoct a scenario where a bunch of non-migratable threads could starve
migratable ones given the right pattern.  I am not yet convinced that
this is a problem since we are talking about tasks of equal RT priority
anyway, and there never is much in the way of guarantees against
starvation under that scenario anyway. (e.g. you could come up with a
similar scenario with a specific timing environment verses an affinity
environment).  I can be convinced otherwise, but for now I think this is
"ok".

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Dmitry Adamushko <dmitry.adamushko@gmail.com>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c    |  6 +++--
 kernel/sched_rt.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 72 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index bfb8ad8ed171..7178b8c2351c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -151,7 +151,8 @@ static inline int task_has_rt_policy(struct task_struct *p)
  */
 struct rt_prio_array {
 	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
-	struct list_head queue[MAX_RT_PRIO];
+	struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */
+	struct list_head squeue[MAX_RT_PRIO];  /* shared queue */
 };
 
 struct rt_bandwidth {
@@ -7542,7 +7543,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 	array = &rt_rq->active;
 	for (i = 0; i < MAX_RT_PRIO; i++) {
-		INIT_LIST_HEAD(array->queue + i);
+		INIT_LIST_HEAD(array->xqueue + i);
+		INIT_LIST_HEAD(array->squeue + i);
 		__clear_bit(i, array->bitmap);
 	}
 	/* delimiter for bitsearch: */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3432d573205d..fefed39fafd8 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -458,7 +458,13 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
-	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	if (rt_se->nr_cpus_allowed == 1)
+		list_add_tail(&rt_se->run_list,
+			      array->xqueue + rt_se_prio(rt_se));
+	else
+		list_add_tail(&rt_se->run_list,
+			      array->squeue + rt_se_prio(rt_se));
+
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
 	inc_rt_tasks(rt_se, rt_rq);
@@ -470,7 +476,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 	struct rt_prio_array *array = &rt_rq->active;
 
 	list_del_init(&rt_se->run_list);
-	if (list_empty(array->queue + rt_se_prio(rt_se)))
+	if (list_empty(array->squeue + rt_se_prio(rt_se))
+	    && list_empty(array->xqueue + rt_se_prio(rt_se)))
 		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 
 	dec_rt_tasks(rt_se, rt_rq);
@@ -537,13 +544,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
+ *
+ * Note: We always enqueue the task to the shared-queue, regardless of its
+ * previous position w.r.t. exclusive vs shared.  This is so that exclusive RR
+ * tasks fairly round-robin with all tasks on the runqueue, not just other
+ * exclusive tasks.
  */
 static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
 	struct rt_prio_array *array = &rt_rq->active;
 
-	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	list_del_init(&rt_se->run_list);
+	list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se));
 }
 
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -601,13 +614,46 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 }
 #endif /* CONFIG_SMP */
 
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+						   struct rt_rq *rt_rq);
+
 /*
  * Preempt the current task with a newly woken task if needed:
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 {
-	if (p->prio < rq->curr->prio)
+	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
+		return;
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * If:
+	 *
+	 * - the newly woken task is of equal priority to the current task
+	 * - the newly woken task is non-migratable while current is migratable
+	 * - current will be preempted on the next reschedule
+	 *
+	 * we should check to see if current can readily move to a different
+	 * cpu.  If so, we will reschedule to allow the push logic to try
+	 * to move current somewhere else, making room for our non-migratable
+	 * task.
+	 */
+	if((p->prio == rq->curr->prio)
+	   && p->rt.nr_cpus_allowed == 1
+	   && rq->curr->rt.nr_cpus_allowed != 1
+	   && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) {
+		cpumask_t mask;
+
+		if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+			/*
+			 * There appears to be other cpus that can accept
+			 * current, so lets reschedule to try and push it away
+			 */
+			resched_task(rq->curr);
+	}
+#endif
 }
 
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -621,8 +667,15 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	idx = sched_find_first_bit(array->bitmap);
 	BUG_ON(idx >= MAX_RT_PRIO);
 
-	queue = array->queue + idx;
-	next = list_entry(queue->next, struct sched_rt_entity, run_list);
+	queue = array->xqueue + idx;
+	if (!list_empty(queue))
+		next = list_entry(queue->next, struct sched_rt_entity,
+				  run_list);
+	else {
+		queue = array->squeue + idx;
+		next = list_entry(queue->next, struct sched_rt_entity,
+				  run_list);
+	}
 
 	return next;
 }
@@ -692,7 +745,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 			continue;
 		if (next && next->prio < idx)
 			continue;
-		list_for_each_entry(rt_se, array->queue + idx, run_list) {
+		list_for_each_entry(rt_se, array->squeue + idx, run_list) {
 			struct task_struct *p = rt_task_of(rt_se);
 			if (pick_rt_task(rq, p, cpu)) {
 				next = p;
@@ -1146,6 +1199,14 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 		}
 
 		update_rt_migration(rq);
+
+		if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1))
+			/*
+			 * If either the new or old weight is a "1", we need
+			 * to requeue to properly move between shared and
+			 * exclusive queues.
+			 */
+			requeue_task_rt(rq, p);
 	}
 
 	p->cpus_allowed    = *new_mask;
-- 
cgit v1.2.3


From f333fdc9098b71e2687e4e9b6349fcb352960d66 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 12 May 2008 21:20:55 +0200
Subject: sched: make !hrtick faster

it is safe to ignore timers and flags when the feature is disabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7178b8c2351c..aa960b84b881 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4134,7 +4134,7 @@ asmlinkage void __sched schedule(void)
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
-	int cpu;
+	int cpu, hrtick = sched_feat(HRTICK);
 
 need_resched:
 	preempt_disable();
@@ -4149,7 +4149,8 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
-	hrtick_clear(rq);
+	if (hrtick)
+		hrtick_clear(rq);
 
 	/*
 	 * Do the rq-clock update outside the rq lock:
@@ -4197,7 +4198,8 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	hrtick_set(rq);
+	if (hrtick)
+		hrtick_set(rq);
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-- 
cgit v1.2.3


From 6e0534f278199f1e3dd1049b9bc19a7a5b87ada1 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 12 May 2008 21:21:01 +0200
Subject: sched: use a 2-d bitmap for searching lowest-pri CPU

The current code use a linear algorithm which causes scaling issues
on larger SMP machines.  This patch replaces that algorithm with a
2-dimensional bitmap to reduce latencies in the wake-up path.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile       |   1 +
 kernel/sched.c        |   7 ++
 kernel/sched_cpupri.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched_cpupri.h |  36 +++++++++++
 kernel/sched_rt.c     |  98 ++++++----------------------
 5 files changed, 239 insertions(+), 77 deletions(-)
 create mode 100644 kernel/sched_cpupri.c
 create mode 100644 kernel/sched_cpupri.h

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..ecdd2d335639 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_SMP) += sched_cpupri.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/sched.c b/kernel/sched.c
index aa960b84b881..8a1257b65560 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -74,6 +74,8 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
+#include "sched_cpupri.h"
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -450,6 +452,9 @@ struct root_domain {
 	 */
 	cpumask_t rto_mask;
 	atomic_t rto_count;
+#ifdef CONFIG_SMP
+	struct cpupri cpupri;
+#endif
 };
 
 /*
@@ -6392,6 +6397,8 @@ static void init_rootdomain(struct root_domain *rd)
 
 	cpus_clear(rd->span);
 	cpus_clear(rd->online);
+
+	cpupri_init(&rd->cpupri);
 }
 
 static void init_defrootdomain(void)
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
+/*
+ *  kernel/sched_cpupri.c
+ *
+ *  CPU priority management
+ *
+ *  Copyright (C) 2007-2008 Novell
+ *
+ *  Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ *  This code tracks the priority of each CPU so that global migration
+ *  decisions are easy to calculate.  Each CPU can be in a state as follows:
+ *
+ *                 (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ *  going from the lowest priority to the highest.  CPUs in the INVALID state
+ *  are not eligible for routing.  The system maintains this state with
+ *  a 2 dimensional bitmap (the first for priority class, the second for cpus
+ *  in that class).  Therefore a typical application without affinity
+ *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ *  searches).  For tasks with affinity restrictions, the algorithm has a
+ *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ *  yields the worst case search is fairly contrived.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; version 2
+ *  of the License.
+ */
+
+#include "sched_cpupri.h"
+
+/* Convert between a 140 based task->prio, and our 102 based cpupri */
+static int convert_prio(int prio)
+{
+	int cpupri;
+
+	if (prio == CPUPRI_INVALID)
+		cpupri = CPUPRI_INVALID;
+	else if (prio == MAX_PRIO)
+		cpupri = CPUPRI_IDLE;
+	else if (prio >= MAX_RT_PRIO)
+		cpupri = CPUPRI_NORMAL;
+	else
+		cpupri = MAX_RT_PRIO - prio + 1;
+
+	return cpupri;
+}
+
+#define for_each_cpupri_active(array, idx)                    \
+  for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES);     \
+       idx < CPUPRI_NR_PRIORITIES;                            \
+       idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invokation.  By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times.  While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+		cpumask_t *lowest_mask)
+{
+	int                  idx      = 0;
+	int                  task_pri = convert_prio(p->prio);
+
+	for_each_cpupri_active(cp->pri_active, idx) {
+		struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+		cpumask_t mask;
+
+		if (idx >= task_pri)
+			break;
+
+		cpus_and(mask, p->cpus_allowed, vec->mask);
+
+		if (cpus_empty(mask))
+			continue;
+
+		*lowest_mask = mask;
+		return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @pri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+	int                 *currpri = &cp->cpu_to_pri[cpu];
+	int                  oldpri  = *currpri;
+	unsigned long        flags;
+
+	newpri = convert_prio(newpri);
+
+	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+	if (newpri == oldpri)
+		return;
+
+	/*
+	 * If the cpu was currently mapped to a different value, we
+	 * first need to unmap the old value
+	 */
+	if (likely(oldpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		vec->count--;
+		if (!vec->count)
+			clear_bit(oldpri, cp->pri_active);
+		cpu_clear(cpu, vec->mask);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
+
+	if (likely(newpri != CPUPRI_INVALID)) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+		spin_lock_irqsave(&vec->lock, flags);
+
+		cpu_set(cpu, vec->mask);
+		vec->count++;
+		if (vec->count == 1)
+			set_bit(newpri, cp->pri_active);
+
+		spin_unlock_irqrestore(&vec->lock, flags);
+	}
+
+	*currpri = newpri;
+}
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Returns: (void)
+ */
+void cpupri_init(struct cpupri *cp)
+{
+	int i;
+
+	memset(cp, 0, sizeof(*cp));
+
+	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+		spin_lock_init(&vec->lock);
+		vec->count = 0;
+		cpus_clear(vec->mask);
+	}
+
+	for_each_possible_cpu(i)
+		cp->cpu_to_pri[i] = CPUPRI_INVALID;
+}
+
+
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..0b6a3d110fac
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO
+#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE     0
+#define CPUPRI_NORMAL   1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+	spinlock_t lock;
+	int        count;
+	cpumask_t  mask;
+};
+
+struct cpupri {
+	struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+	long              pri_active[CPUPRI_NR_PRI_WORDS];
+	int               cpu_to_pri[NR_CPUS];
+};
+
+#ifdef CONFIG_SMP
+int  cpupri_find(struct cpupri *cp,
+		 struct task_struct *p, cpumask_t *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+void cpupri_init(struct cpupri *cp);
+#else
+#define cpupri_set(cp, cpu, pri) do { } while (0)
+#define cpupri_init() do { } while (0)
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fefed39fafd8..44b06d75416e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -391,8 +391,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
 		rt_rq->highest_prio = rt_se_prio(rt_se);
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se));
+	}
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1) {
@@ -416,6 +419,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+#ifdef CONFIG_SMP
+	int highest_prio = rt_rq->highest_prio;
+#endif
+
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
 	WARN_ON(!rt_rq->rt_nr_running);
 	rt_rq->rt_nr_running--;
@@ -439,6 +446,11 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		rq->rt.rt_nr_migratory--;
 	}
 
+	if (rt_rq->highest_prio != highest_prio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+	}
+
 	update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -763,73 +775,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
 
-static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
-{
-	int       lowest_prio = -1;
-	int       lowest_cpu  = -1;
-	int       count       = 0;
-	int       cpu;
-
-	cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
-
-	/*
-	 * Scan each rq for the lowest prio.
-	 */
-	for_each_cpu_mask(cpu, *lowest_mask) {
-		struct rq *rq = cpu_rq(cpu);
-
-		/* We look for lowest RT prio or non-rt CPU */
-		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-			/*
-			 * if we already found a low RT queue
-			 * and now we found this non-rt queue
-			 * clear the mask and set our bit.
-			 * Otherwise just return the queue as is
-			 * and the count==1 will cause the algorithm
-			 * to use the first bit found.
-			 */
-			if (lowest_cpu != -1) {
-				cpus_clear(*lowest_mask);
-				cpu_set(rq->cpu, *lowest_mask);
-			}
-			return 1;
-		}
-
-		/* no locking for now */
-		if ((rq->rt.highest_prio > task->prio)
-		    && (rq->rt.highest_prio >= lowest_prio)) {
-			if (rq->rt.highest_prio > lowest_prio) {
-				/* new low - clear old data */
-				lowest_prio = rq->rt.highest_prio;
-				lowest_cpu = cpu;
-				count = 0;
-			}
-			count++;
-		} else
-			cpu_clear(cpu, *lowest_mask);
-	}
-
-	/*
-	 * Clear out all the set bits that represent
-	 * runqueues that were of higher prio than
-	 * the lowest_prio.
-	 */
-	if (lowest_cpu > 0) {
-		/*
-		 * Perhaps we could add another cpumask op to
-		 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
-		 * Then that could be optimized to use memset and such.
-		 */
-		for_each_cpu_mask(cpu, *lowest_mask) {
-			if (cpu >= lowest_cpu)
-				break;
-			cpu_clear(cpu, *lowest_mask);
-		}
-	}
-
-	return count;
-}
-
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
 	int first;
@@ -851,17 +796,12 @@ static int find_lowest_rq(struct task_struct *task)
 	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
-	int count    = find_lowest_cpus(task, lowest_mask);
 
-	if (!count)
-		return -1; /* No targets found */
+	if (task->rt.nr_cpus_allowed == 1)
+		return -1; /* No other targets possible */
 
-	/*
-	 * There is no sense in performing an optimal search if only one
-	 * target is found.
-	 */
-	if (count == 1)
-		return first_cpu(*lowest_mask);
+	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+		return -1; /* No targets found */
 
 	/*
 	 * At this point we have built a mask of cpus representing the
@@ -1218,6 +1158,8 @@ static void join_domain_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_set_overload(rq);
+
+	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
 }
 
 /* Assumes rq->lock is held */
@@ -1225,6 +1167,8 @@ static void leave_domain_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
+
+	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
 }
 
 /*
-- 
cgit v1.2.3


From 6d299f1b53b84e2665f402d9bcc494800aba6386 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: sched: fix SCHED_OTHER balance iterator to include all tasks

The currently logic inadvertently skips the last task on the run-queue,
resulting in missed balance opportunities.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: David Bahi <dbahi@novell.com>
CC: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched_fair.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..1fe4c65a8170 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1275,23 +1275,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 	struct task_struct *p = NULL;
 	struct sched_entity *se;
 
-	if (next == &cfs_rq->tasks)
-		return NULL;
-
-	/* Skip over entities that are not tasks */
-	do {
+	while (next != &cfs_rq->tasks) {
 		se = list_entry(next, struct sched_entity, group_node);
 		next = next->next;
-	} while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-	if (next == &cfs_rq->tasks)
-		return NULL;
+		/* Skip over entities that are not tasks */
+		if (entity_is_task(se)) {
+			p = task_of(se);
+			break;
+		}
+	}
 
 	cfs_rq->balance_iterator = next;
-
-	if (entity_is_task(se))
-		p = task_of(se);
-
 	return p;
 }
 
-- 
cgit v1.2.3


From d07355f5def74d060333563b36ab51b89fd44cdd Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Mon, 12 May 2008 21:21:15 +0200
Subject: sched: check for SD_SERIALIZE atomically in rebalance_domains()

Nothing really serious here, mainly just a matter of nit-picking :-/

From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
For CONFIG_SCHED_DEBUG && CONFIG_SYSCT configs, sd->flags can be altered
while being manipulated in rebalance_domains(). Let's do an atomic check.
We rely here on the atomicity of read/write accesses for aligned words.

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8a1257b65560..90329f1f8941 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3668,6 +3668,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
+	int need_serialize;
 	cpumask_t tmp;
 
 	for_each_domain(cpu, sd) {
@@ -3685,8 +3686,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (interval > HZ*NR_CPUS/10)
 			interval = HZ*NR_CPUS/10;
 
+		need_serialize = sd->flags & SD_SERIALIZE;
 
-		if (sd->flags & SD_SERIALIZE) {
+		if (need_serialize) {
 			if (!spin_trylock(&balancing))
 				goto out;
 		}
@@ -3702,7 +3704,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 			}
 			sd->last_balance = jiffies;
 		}
-		if (sd->flags & SD_SERIALIZE)
+		if (need_serialize)
 			spin_unlock(&balancing);
 out:
 		if (time_after(next_balance, sd->last_balance + interval)) {
-- 
cgit v1.2.3


From f7dcd80bbc8e7032443e6539ea1b830364f82200 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 24 May 2008 23:20:38 +0200
Subject: namespacecheck: fixes in kernel/sched.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 90329f1f8941..02a5eeedcb94 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1633,7 +1633,7 @@ inline int task_curr(const struct task_struct *p)
 }
 
 /* Used instead of source_load when we know the type == 0 */
-unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(const int cpu)
 {
 	return cpu_rq(cpu)->load.weight;
 }
-- 
cgit v1.2.3


From 81d41d7ece23a1c3b4bcd1604026d3a06cc4dc79 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Sun, 11 May 2008 05:55:33 +0530
Subject: sched: fix defined-but-unused warning

Fix this warning, which appears with !CONFIG_SMP:
kernel/sched.c:1216: warning: `init_hrtick' defined but not used

Signed-off-by: Rabin Vincent <rabin@rab.in>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 02a5eeedcb94..f3faec52c5ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1130,6 +1130,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+#ifdef CONFIG_SMP
 static void hotplug_hrtick_disable(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -1185,6 +1186,7 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
+#endif /* CONFIG_SMP */
 
 static void init_rq_hrtick(struct rq *rq)
 {
-- 
cgit v1.2.3


From e21f5b153b9b4a6775d7d41964e372e13a9178ab Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 23 May 2008 09:05:58 -0700
Subject: sched: print module list in the "scheduling while atomic" warning

For the normal WARN_ON() etc we added a print-the-modules-list already,
which is very useful to figure out candidates for certain types of bugs.

This patch adds the same print to the "scheduling while atomic" BUG warning,
for the same reason: when we get here it's very useful to see which modules
are loaded, to narrow down the candidate code list.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: mingo@elte.hu
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f3faec52c5ab..84a360670b9d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4070,6 +4070,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 		prev->comm, prev->pid, preempt_count());
 
 	debug_show_held_locks(prev);
+	print_modules();
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 
-- 
cgit v1.2.3


From 6d6bc0ad867c46896d0994bb039e7550ecb9b51d Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Fri, 30 May 2008 14:23:45 +0200
Subject: sched: add comments for ifdefs in sched.c

make sched.c easier to read.

Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 76 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 84a360670b9d..ef4e25604bbe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -292,15 +292,15 @@ struct task_group root_task_group;
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-#endif
-#else
+#endif /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 #define root_task_group init_task_group
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -310,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
-#else
+#else /* !CONFIG_USER_SCHED */
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
-#endif
+#endif /* CONFIG_USER_SCHED */
 
 /*
  * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
@@ -1316,15 +1316,15 @@ void wake_up_idle_cpu(int cpu)
 	if (!tsk_is_polling(rq->idle))
 		smp_send_reschedule(cpu);
 }
-#endif
+#endif /* CONFIG_NO_HZ */
 
-#else
+#else /* !CONFIG_SMP */
 static void __resched_task(struct task_struct *p, int tif_bit)
 {
 	assert_spin_locked(&task_rq(p)->lock);
 	set_tsk_thread_flag(p, tif_bit);
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #if BITS_PER_LONG == 32
 # define WMULT_CONST	(~0UL)
@@ -2129,7 +2129,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 			}
 		}
 	}
-#endif
+#endif /* CONFIG_SCHEDSTATS */
 
 out_activate:
 #endif /* CONFIG_SMP */
@@ -2329,7 +2329,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 		notifier->ops->sched_out(notifier, next);
 }
 
-#else
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
 
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
@@ -2341,7 +2341,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 {
 }
 
-#endif
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
 
 /**
  * prepare_task_switch - prepare to switch tasks
@@ -6300,9 +6300,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 	}
 	kfree(groupmask);
 }
-#else
+#else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
 
 static int sd_degenerate(struct sched_domain *sd)
 {
@@ -6598,7 +6598,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
 		cpus_or(*span, *span, *nodemask);
 	}
 }
-#endif
+#endif /* CONFIG_NUMA */
 
 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
@@ -6617,7 +6617,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 		*sg = &per_cpu(sched_group_cpus, cpu);
 	return cpu;
 }
-#endif
+#endif /* CONFIG_SCHED_SMT */
 
 /*
  * multi-core sched-domains:
@@ -6625,7 +6625,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
-#endif
+#endif /* CONFIG_SCHED_MC */
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
@@ -6727,7 +6727,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		sg = sg->next;
 	} while (sg != group_head);
 }
-#endif
+#endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
@@ -6764,11 +6764,11 @@ next_sg:
 		sched_group_nodes_bycpu[cpu] = NULL;
 	}
 }
-#else
+#else /* !CONFIG_NUMA */
 static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 {
 }
-#endif
+#endif /* CONFIG_NUMA */
 
 /*
  * Initialize sched groups cpu_power.
@@ -7459,7 +7459,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 #endif
 	return err;
 }
-#endif
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
 /*
  * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7677,8 +7677,8 @@ void __init sched_init(void)
 
 		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 		init_task_group.rt_se = (struct sched_rt_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
@@ -7692,8 +7692,8 @@ void __init sched_init(void)
 
 		root_task_group.rt_rq = (struct rt_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
 	}
 
 #ifdef CONFIG_SMP
@@ -7709,8 +7709,8 @@ void __init sched_init(void)
 #ifdef CONFIG_USER_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), RUNTIME_INF);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_GROUP_SCHED
 	list_add(&init_task_group.list, &task_groups);
@@ -7720,8 +7720,8 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.children);
 	init_task_group.parent = &root_task_group;
 	list_add(&init_task_group.siblings, &root_task_group.children);
-#endif
-#endif
+#endif /* CONFIG_USER_SCHED */
+#endif /* CONFIG_GROUP_SCHED */
 
 	for_each_possible_cpu(i) {
 		struct rq *rq;
@@ -8040,7 +8040,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 	list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
 }
-#else
+#else /* !CONFG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8058,7 +8058,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static void free_rt_sched_group(struct task_group *tg)
@@ -8129,7 +8129,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 {
 	list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
 }
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
 }
@@ -8147,7 +8147,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
 {
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_GROUP_SCHED
 static void free_sched_group(struct task_group *tg)
@@ -8258,7 +8258,7 @@ void sched_move_task(struct task_struct *tsk)
 
 	task_rq_unlock(rq, &flags);
 }
-#endif
+#endif /* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -8508,7 +8508,7 @@ static int sched_rt_global_constraints(void)
 
 	return ret;
 }
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
@@ -8526,7 +8526,7 @@ static int sched_rt_global_constraints(void)
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 int sched_rt_handler(struct ctl_table *table, int write,
 		struct file *filp, void __user *buffer, size_t *lenp,
@@ -8634,7 +8634,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
 
 	return (u64) tg->shares;
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8658,7 +8658,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
 {
 	return sched_group_rt_period(cgroup_tg(cgrp));
 }
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-- 
cgit v1.2.3


From 099f98c8a1f13501a98afbfff4756395a610581c Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Thu, 29 May 2008 20:56:32 +0530
Subject: sched: print the sd->level in sched_domain_debug code

While printing out the visual representation of the sched-domains, print
the level (MC, SMT, CPU, NODE, ... ) of each of the sched_domains.

Credit: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ef4e25604bbe..dc0be113f41d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6197,6 +6197,28 @@ void __init migration_init(void)
 
 #ifdef CONFIG_SCHED_DEBUG
 
+static inline const char *sd_level_to_string(enum sched_domain_level lvl)
+{
+	switch (lvl) {
+	case SD_LV_NONE:
+			return "NONE";
+	case SD_LV_SIBLING:
+			return "SIBLING";
+	case SD_LV_MC:
+			return "MC";
+	case SD_LV_CPU:
+			return "CPU";
+	case SD_LV_NODE:
+			return "NODE";
+	case SD_LV_ALLNODES:
+			return "ALLNODES";
+	case SD_LV_MAX:
+			return "MAX";
+
+	}
+	return "MAX";
+}
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  cpumask_t *groupmask)
 {
@@ -6216,7 +6238,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %s\n", str);
+	printk(KERN_CONT "span %s level %s\n",
+		str, sd_level_to_string(sd->level));
 
 	if (!cpu_isset(cpu, sd->span)) {
 		printk(KERN_ERR "ERROR: domain->span does not contain "
-- 
cgit v1.2.3


From 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Wed, 4 Jun 2008 15:04:05 -0400
Subject: sched: fix cpupri hotplug support

The RT folks over at RedHat found an issue w.r.t. hotplug support which
was traced to problems with the cpupri infrastructure in the scheduler:

https://bugzilla.redhat.com/show_bug.cgi?id=449676

This bug affects 23-rt12+, 24-rtX, 25-rtX, and sched-devel.  This patch
applies to 25.4-rt4, though it should trivially apply to most cpupri enabled
kernels mentioned above.

It turned out that the issue was that offline cpus could get inadvertently
registered with cpupri so that they were erroneously selected during
migration decisions.  The end result would be an OOPS as the offline cpu
had tasks routed to it.

This patch generalizes the old join/leave domain interface into an
online/offline interface, and adjusts the root-domain/hotplug code to
utilize it.

I was able to easily reproduce the issue prior to this patch, and am no
longer able to reproduce it after this patch.  I can offline cpus
indefinately and everything seems to be in working order.

Thanks to Arnaldo (acme), Thomas, and Peter for doing the legwork to point
me in the right direction.  Also thank you to Peter for reviewing the
early iterations of this patch.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c    | 54 ++++++++++++++++++++++++++++++++++++++++--------------
 kernel/sched_rt.c | 24 ++++++++++++++++++------
 2 files changed, 58 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dc0be113f41d..f0ed81b71282 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -529,6 +529,7 @@ struct rq {
 	int push_cpu;
 	/* cpu of this runqueue: */
 	int cpu;
+	int online;
 
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
@@ -1498,6 +1499,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #endif
 
 #define sched_class_highest (&rt_sched_class)
+#define for_each_class(class) \
+   for (class = sched_class_highest; class; class = class->next)
 
 static inline void inc_load(struct rq *rq, const struct task_struct *p)
 {
@@ -6065,6 +6068,36 @@ static void unregister_sched_domain_sysctl(void)
 }
 #endif
 
+static void set_rq_online(struct rq *rq)
+{
+	if (!rq->online) {
+		const struct sched_class *class;
+
+		cpu_set(rq->cpu, rq->rd->online);
+		rq->online = 1;
+
+		for_each_class(class) {
+			if (class->rq_online)
+				class->rq_online(rq);
+		}
+	}
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+	if (rq->online) {
+		const struct sched_class *class;
+
+		for_each_class(class) {
+			if (class->rq_offline)
+				class->rq_offline(rq);
+		}
+
+		cpu_clear(rq->cpu, rq->rd->online);
+		rq->online = 0;
+	}
+}
+
 /*
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
@@ -6102,7 +6135,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpu_isset(cpu, rq->rd->span));
-			cpu_set(cpu, rq->rd->online);
+
+			set_rq_online(rq);
 		}
 		spin_unlock_irqrestore(&rq->lock, flags);
 		break;
@@ -6163,7 +6197,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpu_isset(cpu, rq->rd->span));
-			cpu_clear(cpu, rq->rd->online);
+			set_rq_offline(rq);
 		}
 		spin_unlock_irqrestore(&rq->lock, flags);
 		break;
@@ -6385,20 +6419,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
 	unsigned long flags;
-	const struct sched_class *class;
 
 	spin_lock_irqsave(&rq->lock, flags);
 
 	if (rq->rd) {
 		struct root_domain *old_rd = rq->rd;
 
-		for (class = sched_class_highest; class; class = class->next) {
-			if (class->leave_domain)
-				class->leave_domain(rq);
-		}
+		if (cpu_isset(rq->cpu, old_rd->online))
+			set_rq_offline(rq);
 
 		cpu_clear(rq->cpu, old_rd->span);
-		cpu_clear(rq->cpu, old_rd->online);
 
 		if (atomic_dec_and_test(&old_rd->refcount))
 			kfree(old_rd);
@@ -6409,12 +6439,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 	cpu_set(rq->cpu, rd->span);
 	if (cpu_isset(rq->cpu, cpu_online_map))
-		cpu_set(rq->cpu, rd->online);
-
-	for (class = sched_class_highest; class; class = class->next) {
-		if (class->join_domain)
-			class->join_domain(rq);
-	}
+		set_rq_online(rq);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -7824,6 +7849,7 @@ void __init sched_init(void)
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
 		rq->cpu = i;
+		rq->online = 0;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq_attach_root(rq, &def_root_domain);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 44b06d75416e..e4821593d4de 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
 
 static inline void rt_set_overload(struct rq *rq)
 {
+	if (!rq->online)
+		return;
+
 	cpu_set(rq->cpu, rq->rd->rto_mask);
 	/*
 	 * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
 
 static inline void rt_clear_overload(struct rq *rq)
 {
+	if (!rq->online)
+		return;
+
 	/* the order here really doesn't matter */
 	atomic_dec(&rq->rd->rto_count);
 	cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -394,7 +400,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
 		struct rq *rq = rq_of_rt_rq(rt_rq);
 		rt_rq->highest_prio = rt_se_prio(rt_se);
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se));
+
+		if (rq->online)
+			cpupri_set(&rq->rd->cpupri, rq->cpu,
+				   rt_se_prio(rt_se));
 	}
 #endif
 #ifdef CONFIG_SMP
@@ -448,7 +457,10 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 
 	if (rt_rq->highest_prio != highest_prio) {
 		struct rq *rq = rq_of_rt_rq(rt_rq);
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+
+		if (rq->online)
+			cpupri_set(&rq->rd->cpupri, rq->cpu,
+				   rt_rq->highest_prio);
 	}
 
 	update_rt_migration(rq_of_rt_rq(rt_rq));
@@ -1154,7 +1166,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 }
 
 /* Assumes rq->lock is held */
-static void join_domain_rt(struct rq *rq)
+static void rq_online_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_set_overload(rq);
@@ -1163,7 +1175,7 @@ static void join_domain_rt(struct rq *rq)
 }
 
 /* Assumes rq->lock is held */
-static void leave_domain_rt(struct rq *rq)
+static void rq_offline_rt(struct rq *rq)
 {
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
@@ -1331,8 +1343,8 @@ static const struct sched_class rt_sched_class = {
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
 	.set_cpus_allowed       = set_cpus_allowed_rt,
-	.join_domain            = join_domain_rt,
-	.leave_domain           = leave_domain_rt,
+	.rq_online              = rq_online_rt,
+	.rq_offline             = rq_offline_rt,
 	.pre_schedule		= pre_schedule_rt,
 	.post_schedule		= post_schedule_rt,
 	.task_wake_up		= task_wake_up_rt,
-- 
cgit v1.2.3


From 709d4b0c60f990bccf3e10ba7c6da407ad65c97f Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Wed, 4 Jun 2008 15:04:10 -0400
Subject: sched: fix cpupri priocount

A rounding error was pointed out by Peter Zijlstra which would result
in the structure holding priorities to be off by one.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched_cpupri.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 0b6a3d110fac..6b38355e2676 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,7 @@
 #include <linux/sched.h>
 
 #define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO
-#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG
+#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG
 
 #define CPUPRI_INVALID -1
 #define CPUPRI_IDLE     0
-- 
cgit v1.2.3


From e539d8fcd11af811db70707d47ea436d5621d0da Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 5 Jun 2008 10:28:00 +0200
Subject: sched: fix the cpuprio count really

Peter pointed out that the last version of the "fix" was still one off
under certain circumstances. Use BITS_TO_LONG instead to get an
accurate result.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched_cpupri.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 6b38355e2676..f25811b0f931 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -3,8 +3,8 @@
 
 #include <linux/sched.h>
 
-#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO
-#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG
+#define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRI_WORDS	BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
 
 #define CPUPRI_INVALID -1
 #define CPUPRI_IDLE     0
-- 
cgit v1.2.3


From 1100ac91b6af02d8639d518fad5b434b1bf44ed6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Jun 2008 12:25:37 +0200
Subject: sched: fix cpuprio build bug

this patch was not built on !SMP:

 kernel/sched_rt.c: In function 'inc_rt_tasks':
 kernel/sched_rt.c:404: error: 'struct rq' has no member named 'online'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index e4821593d4de..eaa606071d51 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -399,16 +399,19 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
 		struct rq *rq = rq_of_rt_rq(rt_rq);
-		rt_rq->highest_prio = rt_se_prio(rt_se);
 
+		rt_rq->highest_prio = rt_se_prio(rt_se);
+#ifdef CONFIG_SMP
 		if (rq->online)
 			cpupri_set(&rq->rd->cpupri, rq->cpu,
 				   rt_se_prio(rt_se));
+#endif
 	}
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1) {
 		struct rq *rq = rq_of_rt_rq(rt_rq);
+
 		rq->rt.rt_nr_migratory++;
 	}
 
-- 
cgit v1.2.3


From 5c8e1ed1d204a6770ca2854cd3b3597070fe7e5a Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Thu, 29 May 2008 11:17:01 -0700
Subject: sched: CPU hotplug events must not destroy scheduler domains created
 by the cpusets

First issue is not related to the cpusets. We're simply leaking doms_cur.
It's allocated in arch_init_sched_domains() which is called for every
hotplug event. So we just keep reallocation doms_cur without freeing it.
I introduced free_sched_domains() function that cleans things up.

Second issue is that sched domains created by the cpusets are
completely destroyed by the CPU hotplug events. For all CPU hotplug
events scheduler attaches all CPUs to the NULL domain and then puts
them all into the single domain thereby destroying domains created
by the cpusets (partition_sched_domains).
The solution is simple, when cpusets are enabled scheduler should not
create default domain and instead let cpusets do that. Which is
exactly what the patch does.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Cc: pj@sgi.com
Cc: menage@google.com
Cc: rostedt@goodmis.org
Cc: mingo@elte.hu
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpuset.c |  6 ++++++
 kernel/sched.c  | 22 ++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 86ea9e34e326..6090d18b58a9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1886,6 +1886,12 @@ static void common_cpu_mem_hotplug_unplug(void)
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 	scan_for_empty_cpusets(&top_cpuset);
 
+	/*
+	 * Scheduler destroys domains on hotplug events.
+	 * Rebuild them based on the current settings.
+	 */
+	rebuild_sched_domains();
+
 	cgroup_unlock();
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index f0ed81b71282..1ddb0a8c7976 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7292,6 +7292,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 {
 }
 
+/*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+	ndoms_cur = 0;
+	if (doms_cur != &fallback_doms)
+		kfree(doms_cur);
+	doms_cur = &fallback_doms;
+}
+
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -7439,6 +7451,7 @@ int arch_reinit_sched_domains(void)
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	detach_destroy_domains(&cpu_online_map);
+	free_sched_domains();
 	err = arch_init_sched_domains(&cpu_online_map);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
@@ -7524,6 +7537,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		detach_destroy_domains(&cpu_online_map);
+		free_sched_domains();
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
@@ -7542,8 +7556,16 @@ static int update_sched_domains(struct notifier_block *nfb,
 		return NOTIFY_DONE;
 	}
 
+#ifndef CONFIG_CPUSETS
+	/*
+	 * Create default domain partitioning if cpusets are disabled.
+	 * Otherwise we let cpusets rebuild the domains based on the
+	 * current setup.
+	 */
+
 	/* The hotplug lock is already held by cpu_up/cpu_down */
 	arch_init_sched_domains(&cpu_online_map);
+#endif
 
 	return NOTIFY_OK;
 }
-- 
cgit v1.2.3


From 68f4f1ec08e3d95730a2693b99df8260aa0d06ae Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Thu, 29 May 2008 11:17:02 -0700
Subject: sched: Move cpu masks from kernel/sched.c into kernel/cpu.c

kernel/cpu.c seems a more logical place for those maps since they do not really
have much to do with the scheduler these days.

kernel/cpu.c is now built for the UP kernel too, but it does not affect the size
the kernel sections.

$ size vmlinux

before
   text       data        bss        dec        hex    filename
3313797     307060     310352    3931209     3bfc49    vmlinux

after
   text       data        bss        dec        hex    filename
3313797     307060     310352    3931209     3bfc49    vmlinux

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Cc: pj@sgi.com
Cc: menage@google.com
Cc: rostedt@goodmis.org
Cc: mingo@elte.hu
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/Makefile |  4 ++--
 kernel/cpu.c    | 24 ++++++++++++++++++++++++
 kernel/sched.c  | 18 ------------------
 3 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index ecdd2d335639..6c55301112e0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
-	    exit.o itimer.o time.o softirq.o resource.o \
+	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
@@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..b11f06dc149a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 
+/*
+ * Represents all cpu's present in the system
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new cpu's are detected in the system via any platform specific
+ * method, such as ACPI for e.g.
+ */
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+
+/*
+ * Represents all cpu's that are currently online.
+ */
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
+
+#else /* CONFIG_SMP */
+
 /* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
@@ -403,3 +425,5 @@ out:
 	cpu_maps_update_done();
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
+
+#endif /* CONFIG_SMP */
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ddb0a8c7976..f36f549e5744 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5080,24 +5080,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
 	return sched_setaffinity(pid, &new_mask);
 }
 
-/*
- * Represents all cpu's present in the system
- * In systems capable of hotplug, this map could dynamically grow
- * as new cpu's are detected in the system via any platform specific
- * method, such as ACPI for e.g.
- */
-
-cpumask_t cpu_present_map __read_mostly;
-EXPORT_SYMBOL(cpu_present_map);
-
-#ifndef CONFIG_SMP
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-#endif
-
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
 {
 	struct task_struct *p;
-- 
cgit v1.2.3


From e958b3600484533ff801920290468adc8135f89d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 4 Jun 2008 23:22:32 +0200
Subject: sched: move weighted_cpuload into #ifdef CONFIG_SMP section

weighted_cpuload is only used on SMP. move it into the CONFIG_SMP
section.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f36f549e5744..727bdef76161 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1637,12 +1637,6 @@ inline int task_curr(const struct task_struct *p)
 	return cpu_curr(task_cpu(p)) == p;
 }
 
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
-	return cpu_rq(cpu)->load.weight;
-}
-
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	set_task_rq(p, cpu);
@@ -1671,6 +1665,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 
 #ifdef CONFIG_SMP
 
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->load.weight;
+}
+
 /*
  * Is this task likely cache-hot:
  */
-- 
cgit v1.2.3


From 6ad36762d7a88d747f6fed95194b4f7ff5da8df4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 8 Jun 2008 21:20:42 +0400
Subject: __mutex_lock_common: use signal_pending_state()

Change __mutex_lock_common() to use signal_pending_state() for the sake of
the code re-use.

This adds 7 bytes to kernel/mutex.o, but afaics only because gcc isn't smart
enough.

(btw, uninlining of __mutex_lock_common() shrinks .text from 2722 to 1542,
 perhaps it is worth doing).

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index d046a345d365..bcdc9ac8ef60 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -165,10 +165,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * got a signal? (This code gets eliminated in the
 		 * TASK_UNINTERRUPTIBLE case.)
 		 */
-		if (unlikely((state == TASK_INTERRUPTIBLE &&
-					signal_pending(task)) ||
-			      (state == TASK_KILLABLE &&
-					fatal_signal_pending(task)))) {
+		if (unlikely(signal_pending_state(state, task))) {
 			mutex_remove_waiter(lock, &waiter,
 					    task_thread_info(task));
 			mutex_release(&lock->dep_map, 1, ip);
-- 
cgit v1.2.3


From 0eb967012ea15e6e8cfab483d9fa37bc602d400c Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sun, 1 Jun 2008 21:47:30 +0530
Subject: ftrace: prevent freeing of all failed updates

Prevent freeing of records which cause problems and correspond to function from
core kernel text. A new flag, FTRACE_FL_CONVERTED is used to mark a record
as "converted". All other records are patched lazily to NOPs. Failed records
now also remain on frace_hash table. Each invocation of ftrace_record_ip now
checks whether the traced function has ever been recorded (including past
failures) and doesn't re-record it again.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 46 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f762f5a2d331..ec54cb7d69d6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -216,6 +216,12 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
 	hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
 }
 
+/* called from kstop_machine */
+static inline void ftrace_del_hash(struct dyn_ftrace *node)
+{
+	hlist_del(&node->node);
+}
+
 static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
 	/* no locking, only called from kstop_machine */
@@ -332,12 +338,11 @@ ftrace_record_ip(unsigned long ip)
 #define FTRACE_ADDR ((long)(ftrace_caller))
 #define MCOUNT_ADDR ((long)(mcount))
 
-static void
+static int
 __ftrace_replace_code(struct dyn_ftrace *rec,
 		      unsigned char *old, unsigned char *new, int enable)
 {
 	unsigned long ip, fl;
-	int failed;
 
 	ip = rec->ip;
 
@@ -364,7 +369,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 
 		if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
 		    (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE))
-			return;
+			return 0;
 
 		/*
 		 * If it is enabled disable it,
@@ -388,7 +393,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 			 */
 			fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
 			if (fl == FTRACE_FL_NOTRACE)
-				return;
+				return 0;
 
 			new = ftrace_call_replace(ip, FTRACE_ADDR);
 		} else
@@ -396,34 +401,24 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 
 		if (enable) {
 			if (rec->flags & FTRACE_FL_ENABLED)
-				return;
+				return 0;
 			rec->flags |= FTRACE_FL_ENABLED;
 		} else {
 			if (!(rec->flags & FTRACE_FL_ENABLED))
-				return;
+				return 0;
 			rec->flags &= ~FTRACE_FL_ENABLED;
 		}
 	}
 
-	failed = ftrace_modify_code(ip, old, new);
-	if (failed) {
-		unsigned long key;
-		/* It is possible that the function hasn't been converted yet */
-		key = hash_long(ip, FTRACE_HASHBITS);
-		if (!ftrace_ip_in_hash(ip, key)) {
-			rec->flags |= FTRACE_FL_FAILED;
-			ftrace_free_rec(rec);
-		}
-
-	}
+	return ftrace_modify_code(ip, old, new);
 }
 
 static void ftrace_replace_code(int enable)
 {
+	int i, failed;
 	unsigned char *new = NULL, *old = NULL;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
-	int i;
 
 	if (enable)
 		old = ftrace_nop_replace();
@@ -438,7 +433,15 @@ static void ftrace_replace_code(int enable)
 			if (rec->flags & FTRACE_FL_FAILED)
 				continue;
 
-			__ftrace_replace_code(rec, old, new, enable);
+			failed = __ftrace_replace_code(rec, old, new, enable);
+			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+				rec->flags |= FTRACE_FL_FAILED;
+				if ((system_state == SYSTEM_BOOTING) ||
+				    !kernel_text_address(rec->ip)) {
+					ftrace_del_hash(rec);
+					ftrace_free_rec(rec);
+				}
+			}
 		}
 	}
 }
@@ -467,7 +470,6 @@ ftrace_code_disable(struct dyn_ftrace *rec)
 	failed = ftrace_modify_code(ip, call, nop);
 	if (failed) {
 		rec->flags |= FTRACE_FL_FAILED;
-		ftrace_free_rec(rec);
 		return 0;
 	}
 	return 1;
@@ -621,8 +623,7 @@ unsigned long		ftrace_update_tot_cnt;
 static int __ftrace_update_code(void *ignore)
 {
 	struct dyn_ftrace *p;
-	struct hlist_head head;
-	struct hlist_node *t;
+	struct hlist_node *t, *n;
 	int save_ftrace_enabled;
 	cycle_t start, stop;
 	int i;
@@ -637,18 +638,33 @@ static int __ftrace_update_code(void *ignore)
 
 	/* No locks needed, the machine is stopped! */
 	for (i = 0; i < FTRACE_HASHSIZE; i++) {
-		if (hlist_empty(&ftrace_hash[i]))
-			continue;
+		/* all CPUS are stopped, we are safe to modify code */
+		hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) {
+			/* Skip over failed records which have not been
+			 * freed. */
+			if (p->flags & FTRACE_FL_FAILED)
+				continue;
 
-		head = ftrace_hash[i];
-		INIT_HLIST_HEAD(&ftrace_hash[i]);
+			/* Unconverted records are always at the head of the
+			 * hash bucket. Once we encounter a converted record,
+			 * simply skip over to the next bucket. Saves ftraced
+			 * some processor cycles (ftrace does its bid for
+			 * global warming :-p ). */
+			if (p->flags & (FTRACE_FL_CONVERTED))
+				break;
 
-		/* all CPUS are stopped, we are safe to modify code */
-		hlist_for_each_entry(p, t, &head, node) {
-			if (ftrace_code_disable(p))
+			if (ftrace_code_disable(p)) {
+				p->flags |= FTRACE_FL_CONVERTED;
 				ftrace_update_cnt++;
-		}
+			} else {
+				if ((system_state == SYSTEM_BOOTING) ||
+				    !kernel_text_address(p->ip)) {
+					ftrace_del_hash(p);
+					ftrace_free_rec(p);
 
+				}
+			}
+		}
 	}
 
 	stop = ftrace_now(raw_smp_processor_id());
-- 
cgit v1.2.3


From 1d74f2a0f64b4091e5e91b55ac1b17dff93f4b59 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sun, 1 Jun 2008 21:47:42 +0530
Subject: ftrace: remove ftrace_ip_converted()

Remove the unneeded function ftrace_ip_converted().

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ec54cb7d69d6..a8929e4c77c1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -306,13 +306,6 @@ ftrace_record_ip(unsigned long ip)
 	if (ftrace_ip_in_hash(ip, key))
 		goto out_unlock;
 
-	/*
-	 * There's a slight race that the ftraced will update the
-	 * hash and reset here. If it is already converted, skip it.
-	 */
-	if (ftrace_ip_converted(ip))
-		goto out_unlock;
-
 	node = ftrace_alloc_dyn_node(ip);
 	if (!node)
 		goto out_unlock;
-- 
cgit v1.2.3


From eb9a7bf09172f409c10ec9560adeea95bb4045f5 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sun, 1 Jun 2008 21:47:54 +0530
Subject: ftrace: add debugfs entry 'failures'

Identify functions which had their mcount call-site updates failed. This can
help us track functions which ftrace shouldn't fiddle with, and are thus not
being traced. If there is no race with any external agent which is modifying
the mcount call-site, then this file displays no entries (normal case).

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a8929e4c77c1..ad568c742bfc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -768,6 +768,7 @@ enum {
 	FTRACE_ITER_FILTER	= (1 << 0),
 	FTRACE_ITER_CONT	= (1 << 1),
 	FTRACE_ITER_NOTRACE	= (1 << 2),
+	FTRACE_ITER_FAILURES	= (1 << 3),
 };
 
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -799,9 +800,16 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		}
 	} else {
 		rec = &iter->pg->records[iter->idx++];
-		if ((rec->flags & FTRACE_FL_FAILED) ||
+		if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
+		     (rec->flags & FTRACE_FL_FAILED)) ||
+
+		    ((iter->flags & FTRACE_ITER_FAILURES) &&
+		     (!(rec->flags & FTRACE_FL_FAILED) ||
+		      (rec->flags & FTRACE_FL_FREE))) ||
+
 		    ((iter->flags & FTRACE_ITER_FILTER) &&
 		     !(rec->flags & FTRACE_FL_FILTER)) ||
+
 		    ((iter->flags & FTRACE_ITER_NOTRACE) &&
 		     !(rec->flags & FTRACE_FL_NOTRACE))) {
 			rec = NULL;
@@ -896,6 +904,24 @@ int ftrace_avail_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int
+ftrace_failures_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct seq_file *m;
+	struct ftrace_iterator *iter;
+
+	ret = ftrace_avail_open(inode, file);
+	if (!ret) {
+		m = (struct seq_file *)file->private_data;
+		iter = (struct ftrace_iterator *)m->private;
+		iter->flags = FTRACE_ITER_FAILURES;
+	}
+
+	return ret;
+}
+
+
 static void ftrace_filter_reset(int enable)
 {
 	struct ftrace_page *pg;
@@ -1309,6 +1335,13 @@ static struct file_operations ftrace_avail_fops = {
 	.release = ftrace_avail_release,
 };
 
+static struct file_operations ftrace_failures_fops = {
+	.open = ftrace_failures_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = ftrace_avail_release,
+};
+
 static struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = ftrace_regex_read,
@@ -1386,6 +1419,11 @@ static __init int ftrace_init_debugfs(void)
 		pr_warning("Could not create debugfs "
 			   "'available_filter_functions' entry\n");
 
+	entry = debugfs_create_file("failures", 0444,
+				    d_tracer, NULL, &ftrace_failures_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'failures' entry\n");
+
 	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
 				    NULL, &ftrace_filter_fops);
 	if (!entry)
-- 
cgit v1.2.3


From 34078a5e44db3cbed2e0ed580c29a39d94e0cd97 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Tue, 3 Jun 2008 08:33:41 +0530
Subject: ftrace: prevent freeing of all failed updates

Steven Rostedt wrote:
> If we unload a module and reload it, will it ever get converted again?

The intent was always to filter core kernel functions to prevent their freeing.
Here's a fix which should allow re-recording of module call-sites.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ad568c742bfc..0118979e211f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -430,7 +430,7 @@ static void ftrace_replace_code(int enable)
 			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
 				rec->flags |= FTRACE_FL_FAILED;
 				if ((system_state == SYSTEM_BOOTING) ||
-				    !kernel_text_address(rec->ip)) {
+				    !core_kernel_text(rec->ip)) {
 					ftrace_del_hash(rec);
 					ftrace_free_rec(rec);
 				}
@@ -651,10 +651,9 @@ static int __ftrace_update_code(void *ignore)
 				ftrace_update_cnt++;
 			} else {
 				if ((system_state == SYSTEM_BOOTING) ||
-				    !kernel_text_address(p->ip)) {
+				    !core_kernel_text(p->ip)) {
 					ftrace_del_hash(p);
 					ftrace_free_rec(p);
-
 				}
 			}
 		}
-- 
cgit v1.2.3


From 7def2be1dc679984f4c4fb3ef19a8a081b2454ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Jun 2008 14:49:58 +0200
Subject: sched: fix hotplug cpus on ia64

Cliff Wickman wrote:

> I built an ia64 kernel from Andrew's tree (2.6.26-rc2-mm1)
> and get a very predictable hotplug cpu problem.
> billberry1:/tmp/cpw # ./dis
> disabled cpu 17
> enabled cpu 17
> billberry1:/tmp/cpw # ./dis
> disabled cpu 17
> enabled cpu 17
> billberry1:/tmp/cpw # ./dis
>
> The script that disables the cpu always hangs (unkillable)
> on the 3rd attempt.
>
> And a bit further:
> The kstopmachine thread always sits on the run queue (real time) for about
> 30 minutes before running.

this fix solves some (but not all) issues between CPU hotplug and
RT bandwidth throttling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  15 ++++++--
 kernel/sched_rt.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 115 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 727bdef76161..e9c24a128655 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7513,21 +7513,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
+	int cpu = (int)(long)hcpu;
+
 	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
+		disable_runtime(cpu_rq(cpu));
+		/* fall-through */
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
 		detach_destroy_domains(&cpu_online_map);
 		free_sched_domains();
 		return NOTIFY_OK;
 
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
+
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
+		enable_runtime(cpu_rq(cpu));
+		/* fall-through */
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index eaa606071d51..8ae3416e0bb4 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -286,6 +286,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
 			continue;
 
 		spin_lock(&iter->rt_runtime_lock);
+		if (iter->rt_runtime == RUNTIME_INF)
+			goto next;
+
 		diff = iter->rt_runtime - iter->rt_time;
 		if (diff > 0) {
 			do_div(diff, weight);
@@ -299,12 +302,105 @@ static int balance_runtime(struct rt_rq *rt_rq)
 				break;
 			}
 		}
+next:
 		spin_unlock(&iter->rt_runtime_lock);
 	}
 	spin_unlock(&rt_b->rt_runtime_lock);
 
 	return more;
 }
+
+static void __disable_runtime(struct rq *rq)
+{
+	struct root_domain *rd = rq->rd;
+	struct rt_rq *rt_rq;
+
+	if (unlikely(!scheduler_running))
+		return;
+
+	for_each_leaf_rt_rq(rt_rq, rq) {
+		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+		s64 want;
+		int i;
+
+		spin_lock(&rt_b->rt_runtime_lock);
+		spin_lock(&rt_rq->rt_runtime_lock);
+		if (rt_rq->rt_runtime == RUNTIME_INF ||
+				rt_rq->rt_runtime == rt_b->rt_runtime)
+			goto balanced;
+		spin_unlock(&rt_rq->rt_runtime_lock);
+
+		want = rt_b->rt_runtime - rt_rq->rt_runtime;
+
+		for_each_cpu_mask(i, rd->span) {
+			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+			s64 diff;
+
+			if (iter == rt_rq)
+				continue;
+
+			spin_lock(&iter->rt_runtime_lock);
+			if (want > 0) {
+				diff = min_t(s64, iter->rt_runtime, want);
+				iter->rt_runtime -= diff;
+				want -= diff;
+			} else {
+				iter->rt_runtime -= want;
+				want -= want;
+			}
+			spin_unlock(&iter->rt_runtime_lock);
+
+			if (!want)
+				break;
+		}
+
+		spin_lock(&rt_rq->rt_runtime_lock);
+		BUG_ON(want);
+balanced:
+		rt_rq->rt_runtime = RUNTIME_INF;
+		spin_unlock(&rt_rq->rt_runtime_lock);
+		spin_unlock(&rt_b->rt_runtime_lock);
+	}
+}
+
+static void disable_runtime(struct rq *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__disable_runtime(rq);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void __enable_runtime(struct rq *rq)
+{
+	struct root_domain *rd = rq->rd;
+	struct rt_rq *rt_rq;
+
+	if (unlikely(!scheduler_running))
+		return;
+
+	for_each_leaf_rt_rq(rt_rq, rq) {
+		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+		spin_lock(&rt_b->rt_runtime_lock);
+		spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_runtime = rt_b->rt_runtime;
+		rt_rq->rt_time = 0;
+		spin_unlock(&rt_rq->rt_runtime_lock);
+		spin_unlock(&rt_b->rt_runtime_lock);
+	}
+}
+
+static void enable_runtime(struct rq *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__enable_runtime(rq);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
 #endif
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -334,14 +430,13 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 
 #ifdef CONFIG_SMP
 	if (rt_rq->rt_time > runtime) {
-		int more;
-
 		spin_unlock(&rt_rq->rt_runtime_lock);
-		more = balance_runtime(rt_rq);
+		balance_runtime(rt_rq);
 		spin_lock(&rt_rq->rt_runtime_lock);
 
-		if (more)
-			runtime = sched_rt_runtime(rt_rq);
+		runtime = sched_rt_runtime(rt_rq);
+		if (runtime == RUNTIME_INF)
+			return 0;
 	}
 #endif
 
@@ -1174,6 +1269,8 @@ static void rq_online_rt(struct rq *rq)
 	if (rq->rt.overloaded)
 		rt_set_overload(rq);
 
+	__enable_runtime(rq);
+
 	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
 }
 
@@ -1183,6 +1280,8 @@ static void rq_offline_rt(struct rq *rq)
 	if (rq->rt.overloaded)
 		rt_clear_overload(rq);
 
+	__disable_runtime(rq);
+
 	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
 }
 
-- 
cgit v1.2.3


From 9985b0bab332289f14837eff3c6e0bcc658b58f7 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 5 Jun 2008 12:57:11 -0700
Subject: sched: prevent bound kthreads from changing cpus_allowed

Kthreads that have called kthread_bind() are bound to specific cpus, so
other tasks should not be able to change their cpus_allowed from under
them.  Otherwise, it is possible to move kthreads, such as the migration
or software watchdog threads, so they are not allowed access to the cpu
they work on.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpuset.c  | 14 +++++++++++++-
 kernel/kthread.c |  1 +
 kernel/sched.c   |  6 ++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6090d18b58a9..b84354f4de36 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1190,6 +1190,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 
 	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
+	if (tsk->flags & PF_THREAD_BOUND) {
+		cpumask_t mask;
+
+		mutex_lock(&callback_mutex);
+		mask = cs->cpus_allowed;
+		mutex_unlock(&callback_mutex);
+		if (!cpus_equal(tsk->cpus_allowed, mask))
+			return -EINVAL;
+	}
 
 	return security_task_setscheduler(tsk, 0, NULL);
 }
@@ -1203,11 +1212,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
+	int err;
 
 	mutex_lock(&callback_mutex);
 	guarantee_online_cpus(cs, &cpus);
-	set_cpus_allowed_ptr(tsk, &cpus);
+	err = set_cpus_allowed_ptr(tsk, &cpus);
 	mutex_unlock(&callback_mutex);
+	if (err)
+		return;
 
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..97747cdd37c9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
 	k->rt.nr_cpus_allowed = 1;
+	k->flags |= PF_THREAD_BOUND;
 }
 EXPORT_SYMBOL(kthread_bind);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e9c24a128655..164fe7fe0d89 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5563,6 +5563,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
 		goto out;
 	}
 
+	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
+		     !cpus_equal(p->cpus_allowed, *new_mask))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 	else {
-- 
cgit v1.2.3


From 6492c7f83e88a3a9521793b6934d882b97afe287 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sun, 8 Jun 2008 09:27:13 +0200
Subject: sched: trivial sched_features cleanup

Remove unused debug/tuning features.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_features.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..62b39ca92ebd 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -6,5 +6,3 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
-SCHED_FEAT(NORMALIZED_SLEEPER, 1)
-SCHED_FEAT(DEADLINE, 1)
-- 
cgit v1.2.3


From e9886ca3a93d7d041d3de8e5acebe213da777d59 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 9 Jun 2008 17:12:24 +0900
Subject: sched: kill off dead cfs_rq_set_shares()

Building with CONFIG_FAIR_GROUP_SCHED=y on UP results in an unused
cfs_rq_set_shares() reference. As nothing is using this dummy function
in the first place, just kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 164fe7fe0d89..07d5472dee99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1480,16 +1480,8 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-#else /* CONFIG_SMP */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-}
 #endif
 
-#endif /* CONFIG_SMP */
-
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
-- 
cgit v1.2.3


From 040ec23d07f95285e9777a85cda29cb339a3065b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Jun 2008 01:45:29 -0700
Subject: sched: sched_clock() lockdep fix

Sitsofe Wheeler bisected the following commit to cause a lockdep to
warn about itself and turn itself off:

> commit c6531cce6e6e4b99bcda46b6268d6f2d9e30aea4
> Author: Ingo Molnar <mingo@elte.hu>
> Date:   Mon May 12 21:21:14 2008 +0200
>
>     sched: do not trace sched_clock

do not use raw irq flags in cpu_clock() as it causes lockdep to lose
track of the true state of the IRQ flag.

Reported-and-bisected-by: Sitsofe Wheeler <sitsofe@yahoo.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6590a828138f..b8c9fe676221 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -889,7 +889,7 @@ unsigned long long notrace cpu_clock(int cpu)
 	unsigned long long prev_cpu_time, time, delta_time;
 	unsigned long flags;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
 	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
 	delta_time = time-prev_cpu_time;
@@ -898,7 +898,7 @@ unsigned long long notrace cpu_clock(int cpu)
 		time = __sync_cpu_clock(time, cpu);
 		per_cpu(prev_cpu_time, cpu) = time;
 	}
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 
 	return time;
 }
-- 
cgit v1.2.3


From 2b1bce1787700768cbc87c8509851c6f49d252dc Mon Sep 17 00:00:00 2001
From: Ankita Garg <ankita@in.ibm.com>
Date: Mon, 9 Jun 2008 14:10:25 +0530
Subject: ftrace: disable tracing when current_tracer is set to "none"

Found that inspite of setting the current_tracer to "none", trace from
the previous trace type continued to be collected. The patch below fixes
this and causes the trace to be disabled when the "none" type is
selected.

Compile and boot tested the patch for functionality.

Signed-off-by: Ankita Garg <ankita@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 12f5e817380e..dde6f0ace6dc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,11 +43,6 @@ static cpumask_t __read_mostly		tracing_buffer_mask;
 #define for_each_tracing_cpu(cpu)	\
 	for_each_cpu_mask(cpu, tracing_buffer_mask)
 
-/* dummy trace to disable tracing */
-static struct tracer no_tracer __read_mostly = {
-	.name		= "none",
-};
-
 static int trace_alloc_page(void);
 static int trace_free_page(void);
 
@@ -135,6 +130,23 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 /* trace_flags holds iter_ctrl options */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
 
+static notrace void no_trace_init(struct trace_array *tr)
+{
+	int cpu;
+
+	if(tr->ctrl)
+		for_each_online_cpu(cpu)
+			tracing_reset(tr->data[cpu]);
+	tracer_enabled = 0;
+}
+
+/* dummy trace to disable tracing */
+static struct tracer no_tracer __read_mostly = {
+	.name		= "none",
+	.init		= no_trace_init
+};
+
+
 /**
  * trace_wake_up - wake up tasks waiting for trace input
  *
-- 
cgit v1.2.3


From 1eede070a59e1cc73da51e1aaa00d9ab86572cfc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 20 May 2008 23:00:01 +0200
Subject: Introduce new top level suspend and hibernation callbacks

Introduce 'struct pm_ops' and 'struct pm_ext_ops' ('ext' meaning
'extended') representing suspend and hibernation operations for bus
types, device classes, device types and device drivers.

Modify the PM core to use 'struct pm_ops' and 'struct pm_ext_ops'
objects, if defined, instead of the ->suspend(), ->resume(),
->suspend_late(), and ->resume_early() callbacks (the old callbacks
will be considered as legacy and gradually phased out).

The main purpose of doing this is to separate suspend (aka S2RAM and
standby) callbacks from hibernation callbacks in such a way that the
new callbacks won't take arguments and the semantics of each of them
will be clearly specified.  This has been requested for multiple
times by many people, including Linus himself, and the reason is that
within the current scheme if ->resume() is called, for example, it's
difficult to say why it's been called (ie. is it a resume from RAM or
from hibernation or a suspend/hibernation failure etc.?).

The second purpose is to make the suspend/hibernation callbacks more
flexible so that device drivers can handle more than they can within
the current scheme.  For example, some drivers may need to prevent
new children of the device from being registered before their
->suspend() callbacks are executed or they may want to carry out some
operations requiring the availability of some other devices, not
directly bound via the parent-child relationship, in order to prepare
for the execution of ->suspend(), etc.

Ultimately, we'd like to stop using the freezing of tasks for suspend
and therefore the drivers' suspend/hibernation code will have to take
care of the handling of the user space during suspend/hibernation.
That, in turn, would be difficult within the current scheme, without
the new ->prepare() and ->complete() callbacks.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 kernel/power/disk.c | 22 +++++++++++++++-------
 kernel/power/main.c |  6 ++++--
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 14a656cdc652..d416be0efa8a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -193,6 +193,7 @@ static int create_image(int platform_mode)
 	if (error)
 		return error;
 
+	device_pm_lock();
 	local_irq_disable();
 	/* At this point, device_suspend() has been called, but *not*
 	 * device_power_down(). We *must* call device_power_down() now.
@@ -224,9 +225,11 @@ static int create_image(int platform_mode)
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
-	device_power_up();
+	device_power_up(in_suspend ?
+		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
  Enable_irqs:
 	local_irq_enable();
+	device_pm_unlock();
 	return error;
 }
 
@@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode)
  Finish:
 	platform_finish(platform_mode);
  Resume_devices:
-	device_resume();
+	device_resume(in_suspend ?
+		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
  Resume_console:
 	resume_console();
  Close:
@@ -300,8 +304,9 @@ static int resume_target_kernel(void)
 {
 	int error;
 
+	device_pm_lock();
 	local_irq_disable();
-	error = device_power_down(PMSG_PRETHAW);
+	error = device_power_down(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
@@ -329,9 +334,10 @@ static int resume_target_kernel(void)
 	swsusp_free();
 	restore_processor_state();
 	touch_softlockup_watchdog();
-	device_power_up();
+	device_power_up(PMSG_RECOVER);
  Enable_irqs:
 	local_irq_enable();
+	device_pm_unlock();
 	return error;
 }
 
@@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode)
 
 	pm_prepare_console();
 	suspend_console();
-	error = device_suspend(PMSG_PRETHAW);
+	error = device_suspend(PMSG_QUIESCE);
 	if (error)
 		goto Finish;
 
@@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode)
 		enable_nonboot_cpus();
 	}
 	platform_restore_cleanup(platform_mode);
-	device_resume();
+	device_resume(PMSG_RECOVER);
  Finish:
 	resume_console();
 	pm_restore_console();
@@ -403,6 +409,7 @@ int hibernation_platform_enter(void)
 	if (error)
 		goto Finish;
 
+	device_pm_lock();
 	local_irq_disable();
 	error = device_power_down(PMSG_HIBERNATE);
 	if (!error) {
@@ -411,6 +418,7 @@ int hibernation_platform_enter(void)
 		while (1);
 	}
 	local_irq_enable();
+	device_pm_unlock();
 
 	/*
 	 * We don't need to reenable the nonboot CPUs or resume consoles, since
@@ -419,7 +427,7 @@ int hibernation_platform_enter(void)
  Finish:
 	hibernation_ops->finish();
  Resume_devices:
-	device_resume();
+	device_resume(PMSG_RESTORE);
  Resume_console:
 	resume_console();
  Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6a6d5eb3524e..d023b6b584e5 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state)
 {
 	int error = 0;
 
+	device_pm_lock();
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
 
@@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state)
 	if (!suspend_test(TEST_CORE))
 		error = suspend_ops->enter(state);
 
-	device_power_up();
+	device_power_up(PMSG_RESUME);
  Done:
 	arch_suspend_enable_irqs();
 	BUG_ON(irqs_disabled());
+	device_pm_unlock();
 	return error;
 }
 
@@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 	if (suspend_ops->finish)
 		suspend_ops->finish();
  Resume_devices:
-	device_resume();
+	device_resume(PMSG_RESUME);
  Resume_console:
 	resume_console();
  Close:
-- 
cgit v1.2.3


From 20764ff1efb440640353053ec83263e69e1259e0 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 12 Jun 2008 11:27:03 +0200
Subject: ftrace: fix printout

Do not print loglevel before "entries of %ld bytes". Move it to the previous
pr_info.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dde6f0ace6dc..6e9dae7eb418 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3051,9 +3051,8 @@ __init static int tracer_alloc_buffers(void)
 	}
 	max_tr.entries = global_trace.entries;
 
-	pr_info("tracer: %d pages allocated for %ld",
-		pages, trace_nr_entries);
-	pr_info(" entries of %ld bytes\n", (long)TRACE_ENTRY_SIZE);
+	pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
+		pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
 	pr_info("   actual entries %ld\n", global_trace.entries);
 
 	tracer_init_debugfs();
-- 
cgit v1.2.3


From d8f3de0d2412bb91639cfefc5b3c79dbf3812212 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 12 Jun 2008 23:24:06 +0200
Subject: Suspend-related patches for 2.6.27

ACPI PM: Add possibility to change suspend sequence

There are some systems out there that don't work correctly with
our current suspend/hibernation code ordering.  Provide a workaround
for these systems allowing them to pass 'acpi_sleep=old_ordering' in
the kernel command line so that it will use the pre-ACPI 2.0 ("old")
suspend code ordering.

Unfortunately, this requires us to add a platform hook to the
resuming of devices for recovering the platform in case one of the
device drivers' .suspend() routines returns error code.  Namely,
ACPI 1.0 specifies that _PTS should be called before suspending
devices, but _WAK still should be called before resuming them in
order to undo the changes made by _PTS.  However, if there is an
error during suspending devices, they are automatically resumed
without returning control to the PM core, so the _WAK has to be
called from within device_resume() in that cases.

The patch also reorders and refactors the ACPI suspend/hibernation
code to avoid duplication as far as reasonably possible.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 kernel/power/disk.c | 28 ++++++++++++++++++++++------
 kernel/power/main.c | 10 +++++++---
 2 files changed, 29 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d416be0efa8a..f011e0870b52 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -179,6 +179,17 @@ static void platform_restore_cleanup(int platform_mode)
 		hibernation_ops->restore_cleanup();
 }
 
+/**
+ *	platform_recover - recover the platform from a failure to suspend
+ *	devices.
+ */
+
+static void platform_recover(int platform_mode)
+{
+	if (platform_mode && hibernation_ops && hibernation_ops->recover)
+		hibernation_ops->recover();
+}
+
 /**
  *	create_image - freeze devices that need to be frozen with interrupts
  *	off, create the hibernation image and thaw those devices.  Control
@@ -258,10 +269,10 @@ int hibernation_snapshot(int platform_mode)
 	suspend_console();
 	error = device_suspend(PMSG_FREEZE);
 	if (error)
-		goto Resume_console;
+		goto Recover_platform;
 
 	if (hibernation_test(TEST_DEVICES))
-		goto Resume_devices;
+		goto Recover_platform;
 
 	error = platform_pre_snapshot(platform_mode);
 	if (error || hibernation_test(TEST_PLATFORM))
@@ -285,11 +296,14 @@ int hibernation_snapshot(int platform_mode)
  Resume_devices:
 	device_resume(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
- Resume_console:
 	resume_console();
  Close:
 	platform_end(platform_mode);
 	return error;
+
+ Recover_platform:
+	platform_recover(platform_mode);
+	goto Resume_devices;
 }
 
 /**
@@ -398,8 +412,11 @@ int hibernation_platform_enter(void)
 
 	suspend_console();
 	error = device_suspend(PMSG_HIBERNATE);
-	if (error)
-		goto Resume_console;
+	if (error) {
+		if (hibernation_ops->recover)
+			hibernation_ops->recover();
+		goto Resume_devices;
+	}
 
 	error = hibernation_ops->prepare();
 	if (error)
@@ -428,7 +445,6 @@ int hibernation_platform_enter(void)
 	hibernation_ops->finish();
  Resume_devices:
 	device_resume(PMSG_RESTORE);
- Resume_console:
 	resume_console();
  Close:
 	hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d023b6b584e5..3398f4651aa1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -269,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state)
 	error = device_suspend(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to suspend\n");
-		goto Resume_console;
+		goto Recover_platform;
 	}
 
 	if (suspend_test(TEST_DEVICES))
-		goto Resume_devices;
+		goto Recover_platform;
 
 	if (suspend_ops->prepare) {
 		error = suspend_ops->prepare();
@@ -294,12 +294,16 @@ int suspend_devices_and_enter(suspend_state_t state)
 		suspend_ops->finish();
  Resume_devices:
 	device_resume(PMSG_RESUME);
- Resume_console:
 	resume_console();
  Close:
 	if (suspend_ops->end)
 		suspend_ops->end();
 	return error;
+
+ Recover_platform:
+	if (suspend_ops->recover)
+		suspend_ops->recover();
+	goto Resume_devices;
 }
 
 /**
-- 
cgit v1.2.3


From 2429e4ee78e2fa40f82a4572dd21d4f3b4de9325 Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Fri, 13 Jun 2008 14:40:17 +0800
Subject: lockdep: output lock_class key instead of address for forward
 dependency output

The key instead of address of lock_class should be output in
/proc/lockdep when forward dependency is output, because key is
output for lock_class itself as identifier too.

This patch is based on x86/auto-latest branch of git-x86 tree, and has
been tested on x86_64 platform.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index dc5d29648d85..688c5f1940bd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -139,7 +139,7 @@ static int l_show(struct seq_file *m, void *v)
 
 	list_for_each_entry(entry, &class->locks_after, entry) {
 		if (entry->distance == 1) {
-			seq_printf(m, " -> [%p] ", entry->class);
+			seq_printf(m, " -> [%p] ", entry->class->key);
 			print_name(m, entry->class);
 			seq_puts(m, "\n");
 		}
-- 
cgit v1.2.3


From a5a242dceed5d1c74fe46088762a9e4312c2d000 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 13 Jun 2008 11:00:14 +0200
Subject: stacktrace: print_stack_trace() cleanup

- shorter code and better atomicity with regards to printk().

(It's been tested with the backtrace self-test code on i386 and x86_64.)

Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/stacktrace.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 0914d0cbc83c..7eaea9d02a52 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -11,17 +11,14 @@
 
 void print_stack_trace(struct stack_trace *trace, int spaces)
 {
-	int i, j;
+	int i;
 
 	if (WARN_ON(!trace->entries))
 		return;
 
 	for (i = 0; i < trace->nr_entries; i++) {
-		unsigned long ip = trace->entries[i];
-
-		for (j = 0; j < spaces + 1; j++)
-			printk(" ");
-		print_ip_sym(ip);
+		printk("%*c", 1 + spaces, ' ');
+		print_ip_sym(trace->entries[i]);
 	}
 }
 
-- 
cgit v1.2.3


From a4500b84c51645bbc86be3ca84f2252b7ada060f Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sat, 14 Jun 2008 11:59:39 +0530
Subject: ftrace: fix "notrace" filtering priority

This is a fix to give notrace filter rules priority over "set_ftrace_filter"
rules.

This fix ensures that functions which are set to be filtered and are
concurrently marked as "notrace" don't get recorded. As of now, if
a record is marked as FTRACE_FL_FILTER and is enabled, then the notrace
flag is not checked. Tested on x86-32.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0118979e211f..b532e4a68c74 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -355,20 +355,26 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
 		 * If this record is set not to trace then
 		 * do nothing.
 		 *
+		 * If this record is set not to trace and
+		 * it is enabled then disable it.
+		 *
 		 * If this record is not set to be filtered and
 		 * it is enabled, disable it.
 		 */
-		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
+
+		fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
+				   FTRACE_FL_ENABLED);
 
 		if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
-		    (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE))
+		    (fl ==  (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
+		    !fl || (fl == FTRACE_FL_NOTRACE))
 			return 0;
 
 		/*
 		 * If it is enabled disable it,
 		 * otherwise enable it!
 		 */
-		if (fl == FTRACE_FL_ENABLED) {
+		if (fl & FTRACE_FL_ENABLED) {
 			/* swap new and old */
 			new = old;
 			old = ftrace_call_replace(ip, FTRACE_ADDR);
-- 
cgit v1.2.3


From 87c8a64475f0597b7fd9c36d2f867ae8ef4a9eca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 2 Jun 2008 13:19:08 +0200
Subject: printk: export console_drivers

this symbol is needed by drivers/video/xen-fbfront.ko.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 475fc22a2857..70cfa5ac75ce 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress);
 static DECLARE_MUTEX(console_sem);
 static DECLARE_MUTEX(secondary_console_sem);
 struct console *console_drivers;
+EXPORT_SYMBOL_GPL(console_drivers);
+
 /*
  * This is used for debugging the mess that is the VT code by
  * keeping track if we have the console semaphore held. It's
-- 
cgit v1.2.3


From 906d882cacecd37ad2fdd03ed2a9b232bcb9507e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 9 Jun 2008 16:35:25 -0700
Subject: rcu: remove unused field struct rcu_data::rcu_tasklet

Since softirq works for rcu reclaimer, rcu_tasklet is unused now.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index b8e4cdae4e8d..396b121edfe5 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -82,7 +82,6 @@ struct rcu_data {
 	spinlock_t	lock;		/* Protect rcu_data fields. */
 	long		completed;	/* Number of last completed batch. */
 	int		waitlistcount;
-	struct tasklet_struct rcu_tasklet;
 	struct rcu_head *nextlist;
 	struct rcu_head **nexttail;
 	struct rcu_head *waitlist[GP_STAGES];
-- 
cgit v1.2.3


From f22529351f7060d61eff3b76d7c9706f90aaedf3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 22 May 2008 10:37:48 +0200
Subject: namespacecheck: fixes

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c             | 2 +-
 kernel/trace/trace_sched_switch.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b532e4a68c74..0d5bcf69952d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -50,7 +50,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 
-void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_ops *op = ftrace_list;
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index c16935d3bc5c..93a662009151 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -196,7 +196,7 @@ static void tracing_sched_unregister(void)
 				&ctx_trace);
 }
 
-void tracing_start_sched_switch(void)
+static void tracing_start_sched_switch(void)
 {
 	long ref;
 
@@ -205,7 +205,7 @@ void tracing_start_sched_switch(void)
 		tracing_sched_register();
 }
 
-void tracing_stop_sched_switch(void)
+static void tracing_stop_sched_switch(void)
 {
 	long ref;
 
-- 
cgit v1.2.3


From 5af970a48f3ba0dd96a036b196c79dc923f28231 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Jun 2008 10:09:48 +0200
Subject: rcutorture: WARN_ON_ONCE(1) when detecting an error

this makes it easier for automated tests to pick up such failures.
---
 kernel/rcutorture.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 0334b6a8baca..0ca7e9b290b0 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -687,6 +687,7 @@ rcu_torture_printk(char *page)
 	if (i > 1) {
 		cnt += sprintf(&page[cnt], "!!! ");
 		atomic_inc(&n_rcu_torture_error);
+		WARN_ON_ONCE(1);
 	}
 	cnt += sprintf(&page[cnt], "Reader Pipe: ");
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
-- 
cgit v1.2.3


From 20b6331bfed1f07ba1e5006889a5d64adc53615e Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Wed, 11 Jun 2008 00:58:30 +0200
Subject: sched: rework of "prioritize non-migratable tasks over migratable
 ones"

regarding this commit: 45c01e824991b2dd0a332e19efc4901acb31209f

I think we can do it simpler. Please take a look at the patch below.

Instead of having 2 separate arrays (which is + ~800 bytes on x86_32 and
twice so on x86_64), let's add "exclusive" (the ones that are bound to
this CPU) tasks to the head of the queue and "shared" ones -- to the
end.

In case of a few newly woken up "exclusive" tasks, they are 'stacked'
(not queued as now), meaning that a task {i+1} is being placed in front
of the previously woken up task {i}. But I don't think that this
behavior may cause any realistic problems.

There are a couple of changes on top of this one.

(1) in check_preempt_curr_rt()

I don't think there is a need for the "pick_next_rt_entity(rq, &rq->rt)
!= &rq->curr->rt" check.

enqueue_task_rt(p) and check_preempt_curr_rt() are always called one
after another with rq->lock being held so the following check
"p->rt.nr_cpus_allowed == 1 && rq->curr->rt.nr_cpus_allowed != 1" should
be enough (well, just its left part) to guarantee that 'p' has been
queued in front of the 'curr'.

(2) in set_cpus_allowed_rt()

I don't thinks there is a need for requeue_task_rt() here.

Perhaps, the only case when 'requeue' (+ reschedule) might be useful is
as follows:

i) weight == 1 && cpu_isset(task_cpu(p), *new_mask)

i.e. a task is being bound to this CPU);

ii) 'p' != rq->curr

but here, 'p' has already been on this CPU for a while and was not
migrated. i.e. it's possible that 'rq->curr' would not have high chances
to be migrated right at this particular moment (although, has chance in
a bit longer term), should we allow it to be preempted.

Anyway, I think we should not perhaps make it more complex trying to
address some rare corner cases. For instance, that's why a single queue
approach would be preferable. Unless I'm missing something obvious, this
approach gives us similar functionality at lower cost.

Verified only compilation-wise.

(Almost)-Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    |  6 ++----
 kernel/sched_rt.c | 44 +++++++++-----------------------------------
 2 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 554de4009803..cc1d558406f8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -153,8 +153,7 @@ static inline int task_has_rt_policy(struct task_struct *p)
  */
 struct rt_prio_array {
 	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
-	struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */
-	struct list_head squeue[MAX_RT_PRIO];  /* shared queue */
+	struct list_head queue[MAX_RT_PRIO];
 };
 
 struct rt_bandwidth {
@@ -7620,8 +7619,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 	array = &rt_rq->active;
 	for (i = 0; i < MAX_RT_PRIO; i++) {
-		INIT_LIST_HEAD(array->xqueue + i);
-		INIT_LIST_HEAD(array->squeue + i);
+		INIT_LIST_HEAD(array->queue + i);
 		__clear_bit(i, array->bitmap);
 	}
 	/* delimiter for bitsearch: */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8ae3416e0bb4..f721b52acd8d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -576,16 +576,15 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	struct rt_prio_array *array = &rt_rq->active;
 	struct rt_rq *group_rq = group_rt_rq(rt_se);
+	struct list_head *queue = array->queue + rt_se_prio(rt_se);
 
 	if (group_rq && rt_rq_throttled(group_rq))
 		return;
 
 	if (rt_se->nr_cpus_allowed == 1)
-		list_add_tail(&rt_se->run_list,
-			      array->xqueue + rt_se_prio(rt_se));
+		list_add(&rt_se->run_list, queue);
 	else
-		list_add_tail(&rt_se->run_list,
-			      array->squeue + rt_se_prio(rt_se));
+		list_add_tail(&rt_se->run_list, queue);
 
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
@@ -598,8 +597,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 	struct rt_prio_array *array = &rt_rq->active;
 
 	list_del_init(&rt_se->run_list);
-	if (list_empty(array->squeue + rt_se_prio(rt_se))
-	    && list_empty(array->xqueue + rt_se_prio(rt_se)))
+	if (list_empty(array->queue + rt_se_prio(rt_se)))
 		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 
 	dec_rt_tasks(rt_se, rt_rq);
@@ -666,11 +664,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
- *
- * Note: We always enqueue the task to the shared-queue, regardless of its
- * previous position w.r.t. exclusive vs shared.  This is so that exclusive RR
- * tasks fairly round-robin with all tasks on the runqueue, not just other
- * exclusive tasks.
  */
 static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
@@ -678,7 +671,7 @@ void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 	struct rt_prio_array *array = &rt_rq->active;
 
 	list_del_init(&rt_se->run_list);
-	list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se));
+	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
 }
 
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -736,9 +729,6 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 }
 #endif /* CONFIG_SMP */
 
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
-						   struct rt_rq *rt_rq);
-
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -764,8 +754,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 	 */
 	if((p->prio == rq->curr->prio)
 	   && p->rt.nr_cpus_allowed == 1
-	   && rq->curr->rt.nr_cpus_allowed != 1
-	   && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) {
+	   && rq->curr->rt.nr_cpus_allowed != 1) {
 		cpumask_t mask;
 
 		if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
@@ -789,15 +778,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	idx = sched_find_first_bit(array->bitmap);
 	BUG_ON(idx >= MAX_RT_PRIO);
 
-	queue = array->xqueue + idx;
-	if (!list_empty(queue))
-		next = list_entry(queue->next, struct sched_rt_entity,
-				  run_list);
-	else {
-		queue = array->squeue + idx;
-		next = list_entry(queue->next, struct sched_rt_entity,
-				  run_list);
-	}
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct sched_rt_entity, run_list);
 
 	return next;
 }
@@ -867,7 +849,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 			continue;
 		if (next && next->prio < idx)
 			continue;
-		list_for_each_entry(rt_se, array->squeue + idx, run_list) {
+		list_for_each_entry(rt_se, array->queue + idx, run_list) {
 			struct task_struct *p = rt_task_of(rt_se);
 			if (pick_rt_task(rq, p, cpu)) {
 				next = p;
@@ -1249,14 +1231,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 		}
 
 		update_rt_migration(rq);
-
-		if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1))
-			/*
-			 * If either the new or old weight is a "1", we need
-			 * to requeue to properly move between shared and
-			 * exclusive queues.
-			 */
-			requeue_task_rt(rq, p);
 	}
 
 	p->cpus_allowed    = *new_mask;
-- 
cgit v1.2.3


From e17ba73b0ee6c0f24393c48b455e0d8db761782c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Mon, 12 May 2008 15:44:40 +0200
Subject: x86, generic: mark early_printk as asmlinkage

It's not explicitly marked as asmlinkage, but invoked from x86_32
startup code with parameters on stack.

No other architectures define early_printk and none of them are affected
by this change, since defines asmlinkage as empty token.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 70cfa5ac75ce..de1a4f4470c3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -38,7 +38,7 @@
 /*
  * Architectures can override it:
  */
-void __attribute__((weak)) early_printk(const char *fmt, ...)
+void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
 {
 }
 
-- 
cgit v1.2.3


From 4620b49f76096fa5183eecad7d689faa898a4c82 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Thu, 12 Jun 2008 23:21:53 +0200
Subject: softirq: remove initialization of static per-cpu variable

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softirq.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 059256874e9b..86775340ef1d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -359,10 +359,8 @@ struct tasklet_head
 	struct tasklet_struct **tail;
 };
 
-/* Some compilers disobey section attribute on statics when not
-   initialized -- RR */
-static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
-static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
+static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
 
 void __tasklet_schedule(struct tasklet_struct *t)
 {
-- 
cgit v1.2.3


From d120f65f3aaf306c957bc4c82e510f5b0f1e9b27 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 18 Jun 2008 05:21:44 -0700
Subject: rcu: make rcutorture more vicious: add stutter feature

This patch takes a step towards making rcutorture more brutal by allowing
the test to be automatically periodically paused, with the default being
to run the test for five seconds then pause for five seconds and repeat.
This behavior can be controlled using a new "stutter" module parameter, so
that "stutter=0" gives the old default behavior of running continuously.

Starting and stopping rcutorture more heavily stresses RCU's interaction
with the scheduler, as well as exercising more paths through the
grace-period detection code.

Note that the default to "shuffle_interval" has also been adjusted from
5 seconds to 3 seconds to provide varying overlap with the "stutter"
interval.

I am still unable to provoke the failures that Alexey has been seeing,
even with this patch, but will be doing a few additional things to beef
up rcutorture.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 0ca7e9b290b0..98ae7d168225 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -57,7 +57,8 @@ static int stat_interval;	/* Interval between stats, in seconds. */
 				/*  Defaults to "only at end of test". */
 static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
-static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
+static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
+static int stutter = 5;		/* Start/stop testing interval (in sec) */
 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
 
 module_param(nreaders, int, 0444);
@@ -72,6 +73,8 @@ module_param(test_no_idle_hz, bool, 0444);
 MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
 module_param(shuffle_interval, int, 0444);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
+module_param(stutter, int, 0444);
+MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
 module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
 
@@ -91,6 +94,7 @@ static struct task_struct **fakewriter_tasks;
 static struct task_struct **reader_tasks;
 static struct task_struct *stats_task;
 static struct task_struct *shuffler_task;
+static struct task_struct *stutter_task;
 
 #define RCU_TORTURE_PIPE_LEN 10
 
@@ -119,6 +123,8 @@ static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
 static struct list_head rcu_torture_removed;
 
+static int stutter_pause_test = 0;
+
 /*
  * Allocate an element from the rcu_tortures pool.
  */
@@ -179,6 +185,13 @@ rcu_random(struct rcu_random_state *rrsp)
 	return swahw32(rrsp->rrs_state);
 }
 
+static void
+rcu_stutter_wait(void)
+{
+	while (stutter_pause_test)
+		schedule_timeout_interruptible(1);
+}
+
 /*
  * Operations vector for selecting different types of tests.
  */
@@ -563,6 +576,7 @@ rcu_torture_writer(void *arg)
 		}
 		rcu_torture_current_version++;
 		oldbatch = cur_ops->completed();
+		rcu_stutter_wait();
 	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
 	while (!kthread_should_stop())
@@ -586,6 +600,7 @@ rcu_torture_fakewriter(void *arg)
 		schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
 		udelay(rcu_random(&rand) & 0x3ff);
 		cur_ops->sync();
+		rcu_stutter_wait();
 	} while (!kthread_should_stop() && !fullstop);
 
 	VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
@@ -641,6 +656,7 @@ rcu_torture_reader(void *arg)
 		preempt_enable();
 		cur_ops->readunlock(idx);
 		schedule();
+		rcu_stutter_wait();
 	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
 	while (!kthread_should_stop())
@@ -812,15 +828,34 @@ rcu_torture_shuffle(void *arg)
 	return 0;
 }
 
+/* Cause the rcutorture test to "stutter", starting and stopping all
+ * threads periodically.
+ */
+static int
+rcu_torture_stutter(void *arg)
+{
+	VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
+	do {
+		schedule_timeout_interruptible(stutter * HZ);
+		stutter_pause_test = 1;
+		if (!kthread_should_stop())
+			schedule_timeout_interruptible(stutter * HZ);
+		stutter_pause_test = 0;
+	} while (!kthread_should_stop());
+	VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
+	return 0;
+}
+
 static inline void
 rcu_torture_print_module_parms(char *tag)
 {
 	printk(KERN_ALERT "%s" TORTURE_FLAG
 		"--- %s: nreaders=%d nfakewriters=%d "
 		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
-		"shuffle_interval = %d\n",
+		"shuffle_interval=%d stutter=%d\n",
 		torture_type, tag, nrealreaders, nfakewriters,
-		stat_interval, verbose, test_no_idle_hz, shuffle_interval);
+		stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+		stutter);
 }
 
 static void
@@ -829,6 +864,11 @@ rcu_torture_cleanup(void)
 	int i;
 
 	fullstop = 1;
+	if (stutter_task) {
+		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
+		kthread_stop(stutter_task);
+	}
+	stutter_task = NULL;
 	if (shuffler_task) {
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
 		kthread_stop(shuffler_task);
@@ -1017,6 +1057,19 @@ rcu_torture_init(void)
 			goto unwind;
 		}
 	}
+	if (stutter < 0)
+		stutter = 0;
+	if (stutter) {
+		/* Create the stutter thread */
+		stutter_task = kthread_run(rcu_torture_stutter, NULL,
+					  "rcu_torture_stutter");
+		if (IS_ERR(stutter_task)) {
+			firsterr = PTR_ERR(stutter_task);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
+			stutter_task = NULL;
+			goto unwind;
+		}
+	}
 	return 0;
 
 unwind:
-- 
cgit v1.2.3


From 688c91755dc3d3c03d8c67c1df13c02be258768e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 16 Jun 2008 15:51:08 -0700
Subject: softlockup: print a module list on being stuck

Most places in the kernel that go BUG: print a module list
(which is very useful for doing statistics and finding patterns),
however the softlockup detector does not do this yet.

This patch adds the one line change to fix this gap.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 6b682d86bddf..f2bf5decb108 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -143,6 +143,7 @@ void softlockup_tick(void)
 	printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
 			this_cpu, now - touch_timestamp,
 			current->comm, task_pid_nr(current));
+	print_modules();
 	if (regs)
 		show_regs(regs);
 	else
-- 
cgit v1.2.3


From 31a72bce0bd6f3e0114009288bccbc96376eeeca Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 18 Jun 2008 09:26:49 -0700
Subject: rcu: make rcutorture more vicious: reinstate boot-time testing

This patch re-institutes the ability to build rcutorture directly into
the Linux kernel.  The reason that this capability was removed was that
this could result in your kernel being pretty much useless, as rcutorture
would be running starting from early boot.  This problem has been avoided
by (1) making rcutorture run only three seconds of every six by default,
(2) adding a CONFIG_RCU_TORTURE_TEST_RUNNABLE that permits rcutorture
to be quiesced at boot time, and (3) adding a sysctl in /proc named
/proc/sys/kernel/rcutorture_runnable that permits rcutorture to be
quiesced and unquiesced when built into the kernel.

Please note that this /proc file is -not- available when rcutorture
is built as a module.  Please also note that to get the earlier
take-no-prisoners behavior, you must use the boot command line to set
rcutorture's "stutter" parameter to zero.

The rcutorture quiescing mechanism is currently quite crude: loops
in each rcutorture process that poll a global variable once per tick.
Suggestions for improvement are welcome.  The default action will
be to reduce the polling rate to a few times per second.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c |  9 ++++++++-
 kernel/sysctl.c     | 13 +++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98ae7d168225..27003e2421c7 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -125,6 +125,13 @@ static struct list_head rcu_torture_removed;
 
 static int stutter_pause_test = 0;
 
+#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
+#define RCUTORTURE_RUNNABLE_INIT 1
+#else
+#define RCUTORTURE_RUNNABLE_INIT 0
+#endif
+int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+
 /*
  * Allocate an element from the rcu_tortures pool.
  */
@@ -188,7 +195,7 @@ rcu_random(struct rcu_random_state *rrsp)
 static void
 rcu_stutter_wait(void)
 {
-	while (stutter_pause_test)
+	while (stutter_pause_test || !rcutorture_runnable)
 		schedule_timeout_interruptible(1);
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..c6887cf135c8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,6 +82,9 @@ extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifdef CONFIG_RCU_TORTURE_TEST
+extern int rcutorture_runnable;
+#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
 /* Constants used for minimum and  maximum */
 #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
@@ -813,6 +816,16 @@ static struct ctl_table kern_table[] = {
 		.child		= key_sysctls,
 	},
 #endif
+#ifdef CONFIG_RCU_TORTURE_TEST
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "rcutorture_runnable",
+		.data           = &rcutorture_runnable,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.2.3


From ada18de2eb76961a4d4847f63291744c9e7beec4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jun 2008 14:22:24 +0200
Subject: sched: debug: add some rt debug output

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Daniel K." <dk@uw.no>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 40 +++++++++++++++++++++++++++++++++++++---
 kernel/sched_rt.c    | 14 ++++++++++++++
 2 files changed, 51 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8bb713040ac9..8e077b9c91cb 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	struct sched_entity *last;
 	unsigned long flags;
 
-#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED)
-	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#else
+#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
 	char path[128] = "";
 	struct cgroup *cgroup = NULL;
 	struct task_group *tg = cfs_rq->tg;
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 		cgroup_path(cgroup, path, sizeof(path));
 
 	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#else
+	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
 
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
@@ -169,6 +169,39 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->nr_spread_over);
 }
 
+void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
+{
+#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
+	char path[128] = "";
+	struct cgroup *cgroup = NULL;
+	struct task_group *tg = rt_rq->tg;
+
+	if (tg)
+		cgroup = tg->css.cgroup;
+
+	if (cgroup)
+		cgroup_path(cgroup, path, sizeof(path));
+
+	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
+#else
+	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
+
+
+#define P(x) \
+	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PN(x) \
+	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
+
+	P(rt_nr_running);
+	P(rt_throttled);
+	PN(rt_time);
+	PN(rt_runtime);
+
+#undef PN
+#undef P
+}
+
 static void print_cpu(struct seq_file *m, int cpu)
 {
 	struct rq *rq = &per_cpu(runqueues, cpu);
@@ -208,6 +241,7 @@ static void print_cpu(struct seq_file *m, int cpu)
 #undef PN
 
 	print_cfs_stats(m, cpu);
+	print_rt_stats(m, cpu);
 
 	print_rq(m, rq, cpu);
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fee5fa7c72db..2e0ccdcf046a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1444,3 +1444,17 @@ static const struct sched_class rt_sched_class = {
 	.prio_changed		= prio_changed_rt,
 	.switched_to		= switched_to_rt,
 };
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+
+static void print_rt_stats(struct seq_file *m, int cpu)
+{
+	struct rt_rq *rt_rq;
+
+	rcu_read_lock();
+	for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
+		print_rt_rq(m, cpu, rt_rq);
+	rcu_read_unlock();
+}
+#endif
-- 
cgit v1.2.3


From b79f3833d81d54fc71d98c8064dc45f33a755a8a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jun 2008 14:22:25 +0200
Subject: sched: rt: fix SMP bandwidth balancing for throttled groups

Now we exceed the runtime and get throttled - the period rollover tick
will subtract the cpu quota from the runtime and check if we're below
quota. However with this cpu having a very small portion of the runtime
it will not refresh as fast as it should.

Therefore, also rebalance the runtime when we're throttled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Daniel K." <dk@uw.no>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 41 +++++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2e0ccdcf046a..87b2e3bf9472 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -228,6 +228,28 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 
 #endif
 
+#ifdef CONFIG_SMP
+static int do_balance_runtime(struct rt_rq *rt_rq);
+
+static int balance_runtime(struct rt_rq *rt_rq)
+{
+	int more = 0;
+
+	if (rt_rq->rt_time > rt_rq->rt_runtime) {
+		spin_unlock(&rt_rq->rt_runtime_lock);
+		more = do_balance_runtime(rt_rq);
+		spin_lock(&rt_rq->rt_runtime_lock);
+	}
+
+	return more;
+}
+#else
+static inline int balance_runtime(struct rt_rq *rt_rq)
+{
+	return 0;
+}
+#endif
+
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
 	int i, idle = 1;
@@ -247,6 +269,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 			u64 runtime;
 
 			spin_lock(&rt_rq->rt_runtime_lock);
+			if (rt_rq->rt_throttled)
+				balance_runtime(rt_rq);
 			runtime = rt_rq->rt_runtime;
 			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
@@ -267,7 +291,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 }
 
 #ifdef CONFIG_SMP
-static int balance_runtime(struct rt_rq *rt_rq)
+static int do_balance_runtime(struct rt_rq *rt_rq)
 {
 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
@@ -428,17 +452,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 	if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
 		return 0;
 
-#ifdef CONFIG_SMP
-	if (rt_rq->rt_time > runtime) {
-		spin_unlock(&rt_rq->rt_runtime_lock);
-		balance_runtime(rt_rq);
-		spin_lock(&rt_rq->rt_runtime_lock);
-
-		runtime = sched_rt_runtime(rt_rq);
-		if (runtime == RUNTIME_INF)
-			return 0;
-	}
-#endif
+	balance_runtime(rt_rq);
+	runtime = sched_rt_runtime(rt_rq);
+	if (runtime == RUNTIME_INF)
+		return 0;
 
 	if (rt_rq->rt_time > runtime) {
 		rt_rq->rt_throttled = 1;
-- 
cgit v1.2.3


From eff6549b957d15d1ad168d90b8c1eb643b9c163f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jun 2008 14:22:26 +0200
Subject: sched: rt: move some code around

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Daniel K." <dk@uw.no>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 119 ++++++++++++++++++++++++++----------------------------
 1 file changed, 57 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 87b2e3bf9472..61d52112289c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -228,68 +228,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 
 #endif
 
-#ifdef CONFIG_SMP
-static int do_balance_runtime(struct rt_rq *rt_rq);
-
-static int balance_runtime(struct rt_rq *rt_rq)
-{
-	int more = 0;
-
-	if (rt_rq->rt_time > rt_rq->rt_runtime) {
-		spin_unlock(&rt_rq->rt_runtime_lock);
-		more = do_balance_runtime(rt_rq);
-		spin_lock(&rt_rq->rt_runtime_lock);
-	}
-
-	return more;
-}
-#else
-static inline int balance_runtime(struct rt_rq *rt_rq)
-{
-	return 0;
-}
-#endif
-
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
-{
-	int i, idle = 1;
-	cpumask_t span;
-
-	if (rt_b->rt_runtime == RUNTIME_INF)
-		return 1;
-
-	span = sched_rt_period_mask();
-	for_each_cpu_mask(i, span) {
-		int enqueue = 0;
-		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		spin_lock(&rq->lock);
-		if (rt_rq->rt_time) {
-			u64 runtime;
-
-			spin_lock(&rt_rq->rt_runtime_lock);
-			if (rt_rq->rt_throttled)
-				balance_runtime(rt_rq);
-			runtime = rt_rq->rt_runtime;
-			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
-			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
-				rt_rq->rt_throttled = 0;
-				enqueue = 1;
-			}
-			if (rt_rq->rt_time || rt_rq->rt_nr_running)
-				idle = 0;
-			spin_unlock(&rt_rq->rt_runtime_lock);
-		}
-
-		if (enqueue)
-			sched_rt_rq_enqueue(rt_rq);
-		spin_unlock(&rq->lock);
-	}
-
-	return idle;
-}
-
 #ifdef CONFIG_SMP
 static int do_balance_runtime(struct rt_rq *rt_rq)
 {
@@ -425,8 +363,65 @@ static void enable_runtime(struct rq *rq)
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
+static int balance_runtime(struct rt_rq *rt_rq)
+{
+	int more = 0;
+
+	if (rt_rq->rt_time > rt_rq->rt_runtime) {
+		spin_unlock(&rt_rq->rt_runtime_lock);
+		more = do_balance_runtime(rt_rq);
+		spin_lock(&rt_rq->rt_runtime_lock);
+	}
+
+	return more;
+}
+#else
+static inline int balance_runtime(struct rt_rq *rt_rq)
+{
+	return 0;
+}
 #endif
 
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
+{
+	int i, idle = 1;
+	cpumask_t span;
+
+	if (rt_b->rt_runtime == RUNTIME_INF)
+		return 1;
+
+	span = sched_rt_period_mask();
+	for_each_cpu_mask(i, span) {
+		int enqueue = 0;
+		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+
+		spin_lock(&rq->lock);
+		if (rt_rq->rt_time) {
+			u64 runtime;
+
+			spin_lock(&rt_rq->rt_runtime_lock);
+			if (rt_rq->rt_throttled)
+				balance_runtime(rt_rq);
+			runtime = rt_rq->rt_runtime;
+			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+				rt_rq->rt_throttled = 0;
+				enqueue = 1;
+			}
+			if (rt_rq->rt_time || rt_rq->rt_nr_running)
+				idle = 0;
+			spin_unlock(&rt_rq->rt_runtime_lock);
+		}
+
+		if (enqueue)
+			sched_rt_rq_enqueue(rt_rq);
+		spin_unlock(&rq->lock);
+	}
+
+	return idle;
+}
+
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-- 
cgit v1.2.3


From 10b612f440a22a294e87ec7e8f03f9eea3338628 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jun 2008 14:22:27 +0200
Subject: sched: rt: fix the bandwidth contraint computations

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Daniel K." <dk@uw.no>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5b307da827ef..1f711a58a2b4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8419,7 +8419,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_CGROUP_SCHED
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_group *tgi, *parent = tg ? tg->parent : NULL;
+	struct task_group *tgi, *parent = tg->parent;
 	unsigned long total = 0;
 
 	if (!parent) {
@@ -8443,7 +8443,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	}
 	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <
+	return total + to_ratio(period, runtime) <=
 		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
 				parent->rt_bandwidth.rt_runtime);
 }
@@ -8560,10 +8560,15 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
+	struct task_group *tg = &root_task_group;
+	u64 rt_runtime, rt_period;
 	int ret = 0;
 
+	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_runtime = tg->rt_bandwidth.rt_runtime;
+
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(NULL, 1, 0))
+	if (!__rt_schedulable(tg, rt_period, rt_runtime))
 		ret = -EINVAL;
 	mutex_unlock(&rt_constraints_mutex);
 
-- 
cgit v1.2.3


From 6c3df25511c2c51f2dd36cc52a8d22363d731793 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Jun 2008 14:22:28 +0200
Subject: sched: rt: dont stop the period timer when there are tasks wanting to
 run

So if the group ever gets throttled, it will never wake up again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Daniel K." <dk@uw.no>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reported-by: "Daniel K." <dk@uw.no>
---
 kernel/sched_rt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 61d52112289c..bd90c8bb0739 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -412,7 +412,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 			if (rt_rq->rt_time || rt_rq->rt_nr_running)
 				idle = 0;
 			spin_unlock(&rt_rq->rt_runtime_lock);
-		}
+		} else if (rt_rq->rt_nr_running)
+			idle = 0;
 
 		if (enqueue)
 			sched_rt_rq_enqueue(rt_rq);
-- 
cgit v1.2.3


From 443cd507ce7f78c6f8742b72736585c031d5a921 Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Fri, 20 Jun 2008 16:39:21 +0800
Subject: lockdep: add lock_class information to lock_chain and output it

This patch records array of lock_class into lock_chain, and export
lock_chain information via /proc/lockdep_chains.

It is based on x86/master branch of git-x86 tree, and has been tested
on x86_64 platform.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 38 +++++++++++++++++--
 kernel/lockdep_internals.h |  6 +++
 kernel/lockdep_proc.c      | 91 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 132 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81a4e4a3f087..a796f1f38ac5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1458,7 +1458,14 @@ out_bug:
 }
 
 unsigned long nr_lock_chains;
-static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+atomic_t nr_chain_hlocks;
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
+{
+	return lock_classes + chain_hlocks[chain->base + i];
+}
 
 /*
  * Look up a dependency chain. If the key is not present yet then
@@ -1466,10 +1473,15 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
  * validated. If the key is already hashed, return 0.
  * (On return with 1 graph_lock is held.)
  */
-static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
+static inline int lookup_chain_cache(struct task_struct *curr,
+				     struct held_lock *hlock,
+				     u64 chain_key)
 {
+	struct lock_class *class = hlock->class;
 	struct list_head *hash_head = chainhashentry(chain_key);
 	struct lock_chain *chain;
+	struct held_lock *hlock_curr, *hlock_next;
+	int i, j, n;
 
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return 0;
@@ -1517,6 +1529,26 @@ cache_hit:
 	}
 	chain = lock_chains + nr_lock_chains++;
 	chain->chain_key = chain_key;
+	chain->irq_context = hlock->irq_context;
+	/* Find the first held_lock of current chain */
+	hlock_next = hlock;
+	for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+		hlock_curr = curr->held_locks + i;
+		if (hlock_curr->irq_context != hlock_next->irq_context)
+			break;
+		hlock_next = hlock;
+	}
+	i++;
+	chain->depth = curr->lockdep_depth + 1 - i;
+	n = atomic_add_return(chain->depth, &nr_chain_hlocks);
+	if (unlikely(n < MAX_LOCKDEP_CHAIN_HLOCKS)) {
+		chain->base = n - chain->depth;
+		for (j = 0; j < chain->depth - 1; j++, i++) {
+			int lock_id = curr->held_locks[i].class - lock_classes;
+			chain_hlocks[chain->base + j] = lock_id;
+		}
+		chain_hlocks[chain->base + j] = class - lock_classes;
+	}
 	list_add_tail_rcu(&chain->entry, hash_head);
 	debug_atomic_inc(&chain_lookup_misses);
 	inc_chains();
@@ -1538,7 +1570,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
 	 * graph_lock for us)
 	 */
 	if (!hlock->trylock && (hlock->check == 2) &&
-			lookup_chain_cache(chain_key, hlock->class)) {
+	    lookup_chain_cache(curr, hlock, chain_key)) {
 		/*
 		 * Check whether last held lock:
 		 *
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 8ce09bc4613d..db09b176dd34 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -23,6 +23,8 @@
 #define MAX_LOCKDEP_CHAINS_BITS	14
 #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
 
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
+
 /*
  * Stack-trace: tightly packed array of stack backtrace
  * addresses. Protected by the hash_lock.
@@ -30,15 +32,19 @@
 #define MAX_STACK_TRACE_ENTRIES	262144UL
 
 extern struct list_head all_lock_classes;
+extern struct lock_chain lock_chains[];
 
 extern void
 get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
 
 extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
 
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
+
 extern unsigned long nr_lock_classes;
 extern unsigned long nr_list_entries;
 extern unsigned long nr_lock_chains;
+extern atomic_t nr_chain_hlocks;
 extern unsigned long nr_stack_trace_entries;
 
 extern unsigned int nr_hardirq_chains;
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 688c5f1940bd..14d052c8a835 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -178,6 +178,93 @@ static const struct file_operations proc_lockdep_operations = {
 	.release	= seq_release,
 };
 
+static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct lock_chain *chain;
+
+	(*pos)++;
+
+	if (v == SEQ_START_TOKEN)
+		chain = m->private;
+	else {
+		chain = v;
+
+		if (*pos < nr_lock_chains)
+			chain = lock_chains + *pos;
+		else
+			chain = NULL;
+	}
+
+	return chain;
+}
+
+static void *lc_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	if (*pos < nr_lock_chains)
+		return lock_chains + *pos;
+
+	return NULL;
+}
+
+static void lc_stop(struct seq_file *m, void *v)
+{
+}
+
+static int lc_show(struct seq_file *m, void *v)
+{
+	struct lock_chain *chain = v;
+	struct lock_class *class;
+	int i;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(m, "all lock chains:\n");
+		return 0;
+	}
+
+	seq_printf(m, "irq_context: %d\n", chain->irq_context);
+
+	for (i = 0; i < chain->depth; i++) {
+		class = lock_chain_get_class(chain, i);
+		seq_printf(m, "[%p] ", class->key);
+		print_name(m, class);
+		seq_puts(m, "\n");
+	}
+	seq_puts(m, "\n");
+
+	return 0;
+}
+
+static const struct seq_operations lockdep_chains_ops = {
+	.start	= lc_start,
+	.next	= lc_next,
+	.stop	= lc_stop,
+	.show	= lc_show,
+};
+
+static int lockdep_chains_open(struct inode *inode, struct file *file)
+{
+	int res = seq_open(file, &lockdep_chains_ops);
+	if (!res) {
+		struct seq_file *m = file->private_data;
+
+		if (nr_lock_chains)
+			m->private = lock_chains;
+		else
+			m->private = NULL;
+	}
+	return res;
+}
+
+static const struct file_operations proc_lockdep_chains_operations = {
+	.open		= lockdep_chains_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static void lockdep_stats_debug_show(struct seq_file *m)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
@@ -294,6 +381,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 #ifdef CONFIG_PROVE_LOCKING
 	seq_printf(m, " dependency chains:             %11lu [max: %lu]\n",
 			nr_lock_chains, MAX_LOCKDEP_CHAINS);
+	seq_printf(m, " dependency chain hlocks:       %11d [max: %lu]\n",
+			atomic_read(&nr_chain_hlocks), MAX_LOCKDEP_CHAIN_HLOCKS);
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -661,6 +750,8 @@ static const struct file_operations proc_lock_stat_operations = {
 static int __init lockdep_proc_init(void)
 {
 	proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+	proc_create("lockdep_chains", S_IRUSR, NULL,
+		    &proc_lockdep_chains_operations);
 	proc_create("lockdep_stats", S_IRUSR, NULL,
 		    &proc_lockdep_stats_operations);
 
-- 
cgit v1.2.3


From 0f476b6d91a1395bda6464e653ce66ea9bea7167 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Wed, 18 Jun 2008 09:29:37 +0200
Subject: softirq: remove irqs_disabled warning from local_bh_enable

There's no need to use local_irq_save() over local_irq_disable() in the
local_bh_enable code since it is a bug to call it with irqs disabled and
do_softirq will enable irqs if there is any pending work.

Consolidate the code from local_bh_enable and ..._ip to avoid having a
disconnect between them in the warnings they trigger that is currently
there.

Also always trigger the warning on in_irq(), not just in the
trace-irqflags case.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Michael Buesch <mb@bu3sch.de>
Cc: David Ellingsworth <david@identd.dyndns.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softirq.c | 48 +++++++++++-------------------------------------
 1 file changed, 11 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 86775340ef1d..2cf2502b7c73 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -131,23 +131,17 @@ void _local_bh_enable(void)
 
 EXPORT_SYMBOL(_local_bh_enable);
 
-void local_bh_enable(void)
+static inline void _local_bh_enable_ip(unsigned long ip)
 {
+	WARN_ON_ONCE(in_irq() || irqs_disabled());
 #ifdef CONFIG_TRACE_IRQFLAGS
-	unsigned long flags;
-
-	WARN_ON_ONCE(in_irq());
-#endif
-	WARN_ON_ONCE(irqs_disabled());
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-	local_irq_save(flags);
+	local_irq_disable();
 #endif
 	/*
 	 * Are softirqs going to be turned on now:
 	 */
 	if (softirq_count() == SOFTIRQ_OFFSET)
-		trace_softirqs_on((unsigned long)__builtin_return_address(0));
+		trace_softirqs_on(ip);
 	/*
 	 * Keep preemption disabled until we are done with
 	 * softirq processing:
@@ -159,40 +153,20 @@ void local_bh_enable(void)
 
 	dec_preempt_count();
 #ifdef CONFIG_TRACE_IRQFLAGS
-	local_irq_restore(flags);
+	local_irq_enable();
 #endif
 	preempt_check_resched();
 }
+
+void local_bh_enable(void)
+{
+	_local_bh_enable_ip((unsigned long)__builtin_return_address(0));
+}
 EXPORT_SYMBOL(local_bh_enable);
 
 void local_bh_enable_ip(unsigned long ip)
 {
-#ifdef CONFIG_TRACE_IRQFLAGS
-	unsigned long flags;
-
-	WARN_ON_ONCE(in_irq());
-
-	local_irq_save(flags);
-#endif
-	/*
-	 * Are softirqs going to be turned on now:
-	 */
-	if (softirq_count() == SOFTIRQ_OFFSET)
-		trace_softirqs_on(ip);
-	/*
-	 * Keep preemption disabled until we are done with
-	 * softirq processing:
- 	 */
- 	sub_preempt_count(SOFTIRQ_OFFSET - 1);
-
-	if (unlikely(!in_interrupt() && local_softirq_pending()))
-		do_softirq();
-
-	dec_preempt_count();
-#ifdef CONFIG_TRACE_IRQFLAGS
-	local_irq_restore(flags);
-#endif
-	preempt_check_resched();
+	_local_bh_enable_ip(ip);
 }
 EXPORT_SYMBOL(local_bh_enable_ip);
 
-- 
cgit v1.2.3


From 395a59d0f8e86bb39cd700c3d185d30c670bb958 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sat, 21 Jun 2008 23:47:27 +0530
Subject: ftrace: store mcount address in rec->ip

Record the address of the mcount call-site. Currently all archs except sparc64
record the address of the instruction following the mcount call-site. Some
general cleanups are entailed. Storing mcount addresses in rec->ip enables
looking them up in the kprobe hash table later on to check if they're kprobe'd.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Cc: davem@davemloft.net
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0d5bcf69952d..f1e9e5c74e64 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,8 @@
 #include <linux/hash.h>
 #include <linux/list.h>
 
+#include <asm/ftrace.h>
+
 #include "trace.h"
 
 /* ftrace_enabled is a method to turn ftrace on or off */
@@ -329,7 +331,6 @@ ftrace_record_ip(unsigned long ip)
 }
 
 #define FTRACE_ADDR ((long)(ftrace_caller))
-#define MCOUNT_ADDR ((long)(mcount))
 
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec,
-- 
cgit v1.2.3


From ecea656d1d5e912d2f3d332657ea4a6d8380f891 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sat, 21 Jun 2008 23:47:53 +0530
Subject: ftrace: freeze kprobe'd records

Let records identified as being kprobe'd be marked as "frozen". The trouble
with records which have a kprobe installed on their mcount call-site is
that they don't get updated. So if such a function which is currently being
traced gets its tracing disabled due to a new filter rule (or because it
was added to the notrace list) then it won't be updated and continue being
traced. This patch allows scanning of all frozen records during tracing to
check if they should be traced.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace.c  |  3 +++
 2 files changed, 74 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1e9e5c74e64..d1238163155f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -163,6 +163,8 @@ enum {
 };
 
 static int ftrace_filtered;
+static int tracing_on;
+static int frozen_record_count;
 
 static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
 
@@ -195,6 +197,71 @@ static int ftrace_record_suspend;
 
 static struct dyn_ftrace *ftrace_free_records;
 
+
+#ifdef CONFIG_KPROBES
+static inline void freeze_record(struct dyn_ftrace *rec)
+{
+	if (!(rec->flags & FTRACE_FL_FROZEN)) {
+		rec->flags |= FTRACE_FL_FROZEN;
+		frozen_record_count++;
+	}
+}
+
+static inline void unfreeze_record(struct dyn_ftrace *rec)
+{
+	if (rec->flags & FTRACE_FL_FROZEN) {
+		rec->flags &= ~FTRACE_FL_FROZEN;
+		frozen_record_count--;
+	}
+}
+
+static inline int record_frozen(struct dyn_ftrace *rec)
+{
+	return rec->flags & FTRACE_FL_FROZEN;
+}
+#else
+# define freeze_record(rec)			({ 0; })
+# define unfreeze_record(rec)			({ 0; })
+# define record_frozen(rec)			({ 0; })
+#endif /* CONFIG_KPROBES */
+
+int skip_trace(unsigned long ip)
+{
+	unsigned long fl;
+	struct dyn_ftrace *rec;
+	struct hlist_node *t;
+	struct hlist_head *head;
+
+	if (frozen_record_count == 0)
+		return 0;
+
+	head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
+	hlist_for_each_entry_rcu(rec, t, head, node) {
+		if (rec->ip == ip) {
+			if (record_frozen(rec)) {
+				if (rec->flags & FTRACE_FL_FAILED)
+					return 1;
+
+				if (!(rec->flags & FTRACE_FL_CONVERTED))
+					return 1;
+
+				if (!tracing_on || !ftrace_enabled)
+					return 1;
+
+				if (ftrace_filtered) {
+					fl = rec->flags & (FTRACE_FL_FILTER |
+							   FTRACE_FL_NOTRACE);
+					if (!fl || (fl & FTRACE_FL_NOTRACE))
+						return 1;
+				}
+			}
+			break;
+		}
+	}
+
+	return 0;
+}
+
 static inline int
 ftrace_ip_in_hash(unsigned long ip, unsigned long key)
 {
@@ -489,8 +556,11 @@ static int __ftrace_modify_code(void *data)
 		 */
 		__ftrace_update_code(NULL);
 		ftrace_replace_code(1);
-	} else if (*command & FTRACE_DISABLE_CALLS)
+		tracing_on = 1;
+	} else if (*command & FTRACE_DISABLE_CALLS) {
 		ftrace_replace_code(0);
+		tracing_on = 0;
+	}
 
 	if (*command & FTRACE_UPDATE_TRACE_FUNC)
 		ftrace_update_ftrace_func(ftrace_trace_function);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6e9dae7eb418..9ade79369bfb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -988,6 +988,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	if (unlikely(!tracer_enabled))
 		return;
 
+	if (skip_trace(ip))
+		return;
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
-- 
cgit v1.2.3


From f22f9a89ce6857d377bf22dba4c1a8cd256c5136 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Sat, 21 Jun 2008 23:50:29 +0530
Subject: ftrace: avoid modifying kprobe'd records

Avoid modifying the mcount call-site if there is a kprobe installed on it.
These records are not marked as failed however. This allowed the filter
rules on them to remain up-to-date. Whenever the kprobe on the corresponding
record is removed, the record gets updated as normal.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d1238163155f..85e841335417 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -21,6 +21,7 @@
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
 #include <linux/uaccess.h>
+#include <linux/kprobes.h>
 #include <linux/ftrace.h>
 #include <linux/sysctl.h>
 #include <linux/ctype.h>
@@ -500,6 +501,10 @@ static void ftrace_replace_code(int enable)
 			if (rec->flags & FTRACE_FL_FAILED)
 				continue;
 
+			/* ignore updates to this record's mcount site */
+			if (get_kprobe((void *)rec->ip))
+				continue;
+
 			failed = __ftrace_replace_code(rec, old, new, enable);
 			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
 				rec->flags |= FTRACE_FL_FAILED;
@@ -692,11 +697,11 @@ unsigned long		ftrace_update_tot_cnt;
 
 static int __ftrace_update_code(void *ignore)
 {
+	int i, save_ftrace_enabled;
+	cycle_t start, stop;
 	struct dyn_ftrace *p;
 	struct hlist_node *t, *n;
-	int save_ftrace_enabled;
-	cycle_t start, stop;
-	int i;
+	struct hlist_head *head, temp_list;
 
 	/* Don't be recording funcs now */
 	ftrace_record_suspend++;
@@ -708,8 +713,11 @@ static int __ftrace_update_code(void *ignore)
 
 	/* No locks needed, the machine is stopped! */
 	for (i = 0; i < FTRACE_HASHSIZE; i++) {
+		INIT_HLIST_HEAD(&temp_list);
+		head = &ftrace_hash[i];
+
 		/* all CPUS are stopped, we are safe to modify code */
-		hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) {
+		hlist_for_each_entry_safe(p, t, n, head, node) {
 			/* Skip over failed records which have not been
 			 * freed. */
 			if (p->flags & FTRACE_FL_FAILED)
@@ -723,6 +731,19 @@ static int __ftrace_update_code(void *ignore)
 			if (p->flags & (FTRACE_FL_CONVERTED))
 				break;
 
+			/* Ignore updates to this record's mcount site.
+			 * Reintroduce this record at the head of this
+			 * bucket to attempt to "convert" it again if
+			 * the kprobe on it is unregistered before the
+			 * next run. */
+			if (get_kprobe((void *)p->ip)) {
+				ftrace_del_hash(p);
+				INIT_HLIST_NODE(&p->node);
+				hlist_add_head(&p->node, &temp_list);
+				continue;
+			}
+
+			/* convert record (i.e, patch mcount-call with NOP) */
 			if (ftrace_code_disable(p)) {
 				p->flags |= FTRACE_FL_CONVERTED;
 				ftrace_update_cnt++;
@@ -734,6 +755,12 @@ static int __ftrace_update_code(void *ignore)
 				}
 			}
 		}
+
+		hlist_for_each_entry_safe(p, t, n, &temp_list, node) {
+			hlist_del(&p->node);
+			INIT_HLIST_NODE(&p->node);
+			hlist_add_head(&p->node, head);
+		}
 	}
 
 	stop = ftrace_now(raw_smp_processor_id());
-- 
cgit v1.2.3


From 961ccddd59d627b89bd3dc284b6517833bbdf25d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 23 Jun 2008 13:55:38 +1000
Subject: sched: add new API sched_setscheduler_nocheck: add a flag to control
 access checks

Hidehiro Kawai noticed that sched_setscheduler() can fail in
stop_machine: it calls sched_setscheduler() from insmod, which can
have CAP_SYS_MODULE without CAP_SYS_NICE.

Two cases could have failed, so are changed to sched_setscheduler_nocheck:
  kernel/softirq.c:cpu_callback()
	- CPU hotplug callback
  kernel/stop_machine.c:__stop_machine_run()
	- Called from various places, including modprobe()

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Cc: sugita <yumiko.sugita.yf@hitachi.com>
Cc: Satoshi OSHIMA <satoshi.oshima.fk@hitachi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c        | 48 ++++++++++++++++++++++++++++++++++++------------
 kernel/softirq.c      |  2 +-
 kernel/stop_machine.c |  2 +-
 3 files changed, 38 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b048ad8a11af..8d7c246ab864 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4746,16 +4746,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	set_load_weight(p);
 }
 
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
-		       struct sched_param *param)
+static int __sched_setscheduler(struct task_struct *p, int policy,
+				struct sched_param *param, bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	unsigned long flags;
@@ -4787,7 +4779,7 @@ recheck:
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
-	if (!capable(CAP_SYS_NICE)) {
+	if (user && !capable(CAP_SYS_NICE)) {
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio;
 
@@ -4823,7 +4815,8 @@ recheck:
 	 * Do not allow realtime tasks into groups that have no runtime
 	 * assigned.
 	 */
-	if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+	if (user
+	    && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
 		return -EPERM;
 #endif
 
@@ -4872,8 +4865,39 @@ recheck:
 
 	return 0;
 }
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true);
+}
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+			       struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false);
+}
+
 static int
 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..afd9120c2fc4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -645,7 +645,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 
 		p = per_cpu(ksoftirqd, hotcpu);
 		per_cpu(ksoftirqd, hotcpu) = NULL;
-		sched_setscheduler(p, SCHED_FIFO, &param);
+		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 		kthread_stop(p);
 		takeover_tasklets(hotcpu);
 		break;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b7350bbfb076..ba9b2054ecbd 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -187,7 +187,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
 		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
 		/* One high-prio thread per cpu.  We'll do this one. */
-		sched_setscheduler(p, SCHED_FIFO, &param);
+		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 		kthread_bind(p, cpu);
 		wake_up_process(p);
 		wait_for_completion(&smdata.done);
-- 
cgit v1.2.3


From cd1a28e8457e6ebf72c48d84d4e736307e86436e Mon Sep 17 00:00:00 2001
From: "Huang, Ying" <ying.huang@intel.com>
Date: Mon, 23 Jun 2008 11:20:54 +0800
Subject: lockdep: add lock_class information to lock_chain and output it

It is based on x86/master branch of git-x86 tree, and has been tested
on x86_64 platform.

ChangeLog:

v2:

- Enclosing proc file system related code into CONFIG_PROVE_LOCKING.

- Fix nr_chain_hlocks update code.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 16 +++++++++++-----
 kernel/lockdep_internals.h |  2 +-
 kernel/lockdep_proc.c      |  6 +++++-
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index a796f1f38ac5..7553a28b99cd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1459,7 +1459,7 @@ out_bug:
 
 unsigned long nr_lock_chains;
 struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
-atomic_t nr_chain_hlocks;
+int nr_chain_hlocks;
 static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
 
 struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
@@ -1481,7 +1481,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	struct list_head *hash_head = chainhashentry(chain_key);
 	struct lock_chain *chain;
 	struct held_lock *hlock_curr, *hlock_next;
-	int i, j, n;
+	int i, j, n, cn;
 
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return 0;
@@ -1540,9 +1540,15 @@ cache_hit:
 	}
 	i++;
 	chain->depth = curr->lockdep_depth + 1 - i;
-	n = atomic_add_return(chain->depth, &nr_chain_hlocks);
-	if (unlikely(n < MAX_LOCKDEP_CHAIN_HLOCKS)) {
-		chain->base = n - chain->depth;
+	cn = nr_chain_hlocks;
+	while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) {
+		n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth);
+		if (n == cn)
+			break;
+		cn = n;
+	}
+	if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+		chain->base = cn;
 		for (j = 0; j < chain->depth - 1; j++, i++) {
 			int lock_id = curr->held_locks[i].class - lock_classes;
 			chain_hlocks[chain->base + j] = lock_id;
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index db09b176dd34..c3600a091a28 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -44,7 +44,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
 extern unsigned long nr_lock_classes;
 extern unsigned long nr_list_entries;
 extern unsigned long nr_lock_chains;
-extern atomic_t nr_chain_hlocks;
+extern int nr_chain_hlocks;
 extern unsigned long nr_stack_trace_entries;
 
 extern unsigned int nr_hardirq_chains;
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 14d052c8a835..9b0e940e2545 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -178,6 +178,7 @@ static const struct file_operations proc_lockdep_operations = {
 	.release	= seq_release,
 };
 
+#ifdef CONFIG_PROVE_LOCKING
 static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct lock_chain *chain;
@@ -264,6 +265,7 @@ static const struct file_operations proc_lockdep_chains_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+#endif /* CONFIG_PROVE_LOCKING */
 
 static void lockdep_stats_debug_show(struct seq_file *m)
 {
@@ -382,7 +384,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, " dependency chains:             %11lu [max: %lu]\n",
 			nr_lock_chains, MAX_LOCKDEP_CHAINS);
 	seq_printf(m, " dependency chain hlocks:       %11d [max: %lu]\n",
-			atomic_read(&nr_chain_hlocks), MAX_LOCKDEP_CHAIN_HLOCKS);
+			nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
@@ -750,8 +752,10 @@ static const struct file_operations proc_lock_stat_operations = {
 static int __init lockdep_proc_init(void)
 {
 	proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
+#ifdef CONFIG_PROVE_LOCKING
 	proc_create("lockdep_chains", S_IRUSR, NULL,
 		    &proc_lockdep_chains_operations);
+#endif
 	proc_create("lockdep_stats", S_IRUSR, NULL,
 		    &proc_lockdep_stats_operations);
 
-- 
cgit v1.2.3


From e3d7be270c5b1be07ffadcc8b56599ad8e975c1d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 22 Jun 2008 13:06:38 -0700
Subject: rcu, rcutorture: make quiescent rcutorture less power-hungry

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: josh@freedesktop.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: dino@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Cc: vegard.nossum@gmail.com
Cc: adobriyan@gmail.com
Cc: oleg@tv-sign.ru
Cc: bunk@kernel.org
Cc: rjw@sisk.pl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 27003e2421c7..0c40c13c5402 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -196,7 +196,10 @@ static void
 rcu_stutter_wait(void)
 {
 	while (stutter_pause_test || !rcutorture_runnable)
-		schedule_timeout_interruptible(1);
+		if (rcutorture_runnable)
+			schedule_timeout_interruptible(1);
+		else
+			schedule_timeout_interruptible(HZ);
 }
 
 /*
-- 
cgit v1.2.3


From 3ccf79f4570acacfefc51772e8f9207895b35ad7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 22 Jun 2008 14:02:55 -0700
Subject: rcu: make quiescent rcutorture less power-hungry

This patch aligns the rcutorture wakeup times to align with all other
multiple-of-a-second wakeups to further decrease power consumption.

Suggested-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: josh@freedesktop.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: dino@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Cc: vegard.nossum@gmail.com
Cc: adobriyan@gmail.com
Cc: oleg@tv-sign.ru
Cc: bunk@kernel.org
Cc: rjw@sisk.pl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 0c40c13c5402..5e954edf0ed5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -199,7 +199,7 @@ rcu_stutter_wait(void)
 		if (rcutorture_runnable)
 			schedule_timeout_interruptible(1);
 		else
-			schedule_timeout_interruptible(HZ);
+			schedule_timeout_interruptible(round_jiffies_relative(HZ));
 }
 
 /*
-- 
cgit v1.2.3


From 13d5ef97f0675d789f559cfebc1df9d5e2b1879c Mon Sep 17 00:00:00 2001
From: Peng Haitao <penght@cn.fujitsu.com>
Date: Fri, 16 May 2008 10:15:04 +0800
Subject: [PATCH] kernel/audit.c: nlh->nlmsg_type is gotten more than once

The first argument "nlh->nlmsg_type" of audit_receive_filter() should be modified to "msg_type" in audit_receive_msg().

Signed-off-by: Peng Haitao <penght@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index e8692a5748c2..56f30287e24c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -779,7 +779,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		/* fallthrough */
 	case AUDIT_LIST:
-		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
 					   loginuid, sessionid, sid);
 		break;
@@ -798,7 +798,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		/* fallthrough */
 	case AUDIT_LIST_RULES:
-		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
 					   loginuid, sessionid, sid);
 		break;
-- 
cgit v1.2.3


From 9f0aecdd1cd6aacee9aa8f08031f4f2e09e454dc Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 19 May 2008 15:09:21 -0700
Subject: [PATCH] audit: fix kernel-doc parameter notation

Fix auditfilter kernel-doc misssing parameter description:

Warning(lin2626-rc3//kernel/auditfilter.c:1551): No description found for parameter 'sessionid'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditfilter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0e0bd27e6512..75cdf262851d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1544,6 +1544,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
  * @data: payload data
  * @datasz: size of payload data
  * @loginuid: loginuid of sender
+ * @sessionid: sessionid for netlink audit message
  * @sid: SE Linux Security ID of sender
  */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-- 
cgit v1.2.3


From d8de72473effd674a3c1fe9621821f406f5587c9 Mon Sep 17 00:00:00 2001
From: Peng Haitao <penght@cn.fujitsu.com>
Date: Tue, 20 May 2008 09:13:02 +0800
Subject: [PATCH] remove useless argument type in audit_filter_user()

The second argument "type" is not used in audit_filter_user(), so I think that type can be removed. If I'm wrong, please tell me.

Signed-off-by: Peng Haitao <penght@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c       | 2 +-
 kernel/auditfilter.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 56f30287e24c..e092f1c0ce30 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -738,7 +738,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
 			return 0;
 
-		err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+		err = audit_filter_user(&NETLINK_CB(skb));
 		if (err == 1) {
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 75cdf262851d..98c50cc671bb 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1721,7 +1721,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
 	return 1;
 }
 
-int audit_filter_user(struct netlink_skb_parms *cb, int type)
+int audit_filter_user(struct netlink_skb_parms *cb)
 {
 	enum audit_state state = AUDIT_DISABLED;
 	struct audit_entry *e;
-- 
cgit v1.2.3


From 8d5be7f4e8515af461cbc8f07687ccc81507d508 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Wed, 25 Jun 2008 08:50:10 +0200
Subject: softlockup: show irqtrace

This patch adds some information about when interrupts were last
enabled and disabled to the output of the softlockup detector.

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Johannes Weiner <hannes@saeurebad.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index f2bf5decb108..97977ecc3171 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -13,6 +13,7 @@
 #include <linux/delay.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/lockdep.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
 
@@ -144,6 +145,7 @@ void softlockup_tick(void)
 			this_cpu, now - touch_timestamp,
 			current->comm, task_pid_nr(current));
 	print_modules();
+	print_irqtrace_events(current);
 	if (regs)
 		show_regs(regs);
 	else
-- 
cgit v1.2.3


From 0729fbf3bc70870370b4f43d652f05a468dc68b8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 25 Jun 2008 12:24:52 -0700
Subject: rcu: make rcutorture even more vicious: invoke RCU readers from irq
 handlers (timers)

This patch allows torturing RCU from irq handlers (timers, in this case).
A new module parameter irqreader enables such additional torturing,
and is enabled by default.  Variants of RCU that do not tolerate readers
being called from irq handlers (e.g., SRCU) ignore irqreader.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: josh@freedesktop.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: dino@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Cc: vegard.nossum@gmail.com
Cc: adobriyan@gmail.com
Cc: oleg@tv-sign.ru
Cc: bunk@kernel.org
Cc: rjw@sisk.pl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutorture.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 5e954edf0ed5..90b5b123f7a1 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -59,6 +59,7 @@ static int verbose;		/* Print more debug info. */
 static int test_no_idle_hz;	/* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
 static int stutter = 5;		/* Start/stop testing interval (in sec) */
+static int irqreader = 1;	/* RCU readers from irq (timers). */
 static char *torture_type = "rcu"; /* What RCU implementation to torture. */
 
 module_param(nreaders, int, 0444);
@@ -75,6 +76,8 @@ module_param(shuffle_interval, int, 0444);
 MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
 module_param(stutter, int, 0444);
 MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
+module_param(irqreader, int, 0444);
+MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
 module_param(torture_type, charp, 0444);
 MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
 
@@ -121,6 +124,7 @@ static atomic_t n_rcu_torture_alloc_fail;
 static atomic_t n_rcu_torture_free;
 static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_timers = 0;
 static struct list_head rcu_torture_removed;
 
 static int stutter_pause_test = 0;
@@ -217,6 +221,7 @@ struct rcu_torture_ops {
 	void (*sync)(void);
 	void (*cb_barrier)(void);
 	int (*stats)(char *page);
+	int irqcapable;
 	char *name;
 };
 static struct rcu_torture_ops *cur_ops = NULL;
@@ -291,6 +296,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.sync = synchronize_rcu,
 	.cb_barrier = rcu_barrier,
 	.stats = NULL,
+	.irqcapable = 1,
 	.name = "rcu"
 };
 
@@ -331,6 +337,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
 	.sync = synchronize_rcu,
 	.cb_barrier = NULL,
 	.stats = NULL,
+	.irqcapable = 1,
 	.name = "rcu_sync"
 };
 
@@ -392,6 +399,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
 	.sync = rcu_bh_torture_synchronize,
 	.cb_barrier = rcu_barrier_bh,
 	.stats = NULL,
+	.irqcapable = 1,
 	.name = "rcu_bh"
 };
 
@@ -406,6 +414,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
 	.sync = rcu_bh_torture_synchronize,
 	.cb_barrier = NULL,
 	.stats = NULL,
+	.irqcapable = 1,
 	.name = "rcu_bh_sync"
 };
 
@@ -532,6 +541,7 @@ static struct rcu_torture_ops sched_ops = {
 	.sync = sched_torture_synchronize,
 	.cb_barrier = rcu_barrier_sched,
 	.stats = NULL,
+	.irqcapable = 1,
 	.name = "sched"
 };
 
@@ -619,6 +629,52 @@ rcu_torture_fakewriter(void *arg)
 	return 0;
 }
 
+/*
+ * RCU torture reader from timer handler.  Dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array.  The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static void rcu_torture_timer(unsigned long unused)
+{
+	int idx;
+	int completed;
+	static DEFINE_RCU_RANDOM(rand);
+	static DEFINE_SPINLOCK(rand_lock);
+	struct rcu_torture *p;
+	int pipe_count;
+
+	idx = cur_ops->readlock();
+	completed = cur_ops->completed();
+	p = rcu_dereference(rcu_torture_current);
+	if (p == NULL) {
+		/* Leave because rcu_torture_writer is not yet underway */
+		cur_ops->readunlock(idx);
+		return;
+	}
+	if (p->rtort_mbtest == 0)
+		atomic_inc(&n_rcu_torture_mberror);
+	spin_lock(&rand_lock);
+	cur_ops->readdelay(&rand);
+	n_rcu_torture_timers++;
+	spin_unlock(&rand_lock);
+	preempt_disable();
+	pipe_count = p->rtort_pipe_count;
+	if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+		/* Should not happen, but... */
+		pipe_count = RCU_TORTURE_PIPE_LEN;
+	}
+	++__get_cpu_var(rcu_torture_count)[pipe_count];
+	completed = cur_ops->completed() - completed;
+	if (completed > RCU_TORTURE_PIPE_LEN) {
+		/* Should not happen, but... */
+		completed = RCU_TORTURE_PIPE_LEN;
+	}
+	++__get_cpu_var(rcu_torture_batch)[completed];
+	preempt_enable();
+	cur_ops->readunlock(idx);
+}
+
 /*
  * RCU torture reader kthread.  Repeatedly dereferences rcu_torture_current,
  * incrementing the corresponding element of the pipeline array.  The
@@ -633,11 +689,18 @@ rcu_torture_reader(void *arg)
 	DEFINE_RCU_RANDOM(rand);
 	struct rcu_torture *p;
 	int pipe_count;
+	struct timer_list t;
 
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
 	set_user_nice(current, 19);
+	if (irqreader && cur_ops->irqcapable)
+		setup_timer_on_stack(&t, rcu_torture_timer, 0);
 
 	do {
+		if (irqreader && cur_ops->irqcapable) {
+			if (!timer_pending(&t))
+				mod_timer(&t, 1);
+		}
 		idx = cur_ops->readlock();
 		completed = cur_ops->completed();
 		p = rcu_dereference(rcu_torture_current);
@@ -669,6 +732,8 @@ rcu_torture_reader(void *arg)
 		rcu_stutter_wait();
 	} while (!kthread_should_stop() && !fullstop);
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
+	if (irqreader && cur_ops->irqcapable)
+		del_timer_sync(&t);
 	while (!kthread_should_stop())
 		schedule_timeout_uninterruptible(1);
 	return 0;
@@ -699,14 +764,15 @@ rcu_torture_printk(char *page)
 	cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
 	cnt += sprintf(&page[cnt],
 		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
-		       "rtmbe: %d",
+		       "rtmbe: %d nt: %ld",
 		       rcu_torture_current,
 		       rcu_torture_current_version,
 		       list_empty(&rcu_torture_freelist),
 		       atomic_read(&n_rcu_torture_alloc),
 		       atomic_read(&n_rcu_torture_alloc_fail),
 		       atomic_read(&n_rcu_torture_free),
-		       atomic_read(&n_rcu_torture_mberror));
+		       atomic_read(&n_rcu_torture_mberror),
+		       n_rcu_torture_timers);
 	if (atomic_read(&n_rcu_torture_mberror) != 0)
 		cnt += sprintf(&page[cnt], " !!!");
 	cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -862,10 +928,10 @@ rcu_torture_print_module_parms(char *tag)
 	printk(KERN_ALERT "%s" TORTURE_FLAG
 		"--- %s: nreaders=%d nfakewriters=%d "
 		"stat_interval=%d verbose=%d test_no_idle_hz=%d "
-		"shuffle_interval=%d stutter=%d\n",
+		"shuffle_interval=%d stutter=%d irqreader=%d\n",
 		torture_type, tag, nrealreaders, nfakewriters,
 		stat_interval, verbose, test_no_idle_hz, shuffle_interval,
-		stutter);
+		stutter, irqreader);
 }
 
 static void
-- 
cgit v1.2.3


From 3d4422332711ef48ef0f132f1fcbfcbd56c7f3d1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 26 Jun 2008 11:21:34 +0200
Subject: Add generic helpers for arch IPI function calls

This adds kernel/smp.c which contains helpers for IPI function calls. In
addition to supporting the existing smp_call_function() in a more efficient
manner, it also adds a more scalable variant called smp_call_function_single()
for calling a given function on a single CPU only.

The core of this is based on the x86-64 patch from Nick Piggin, lots of
changes since then. "Alan D. Brunelle" <Alan.Brunelle@hp.com> has
contributed lots of fixes and suggestions as well. Also thanks to
Paul E. McKenney <paulmck@linux.vnet.ibm.com> for reviewing RCU usage
and getting rid of the data allocation fallback deadlock.

Acked-by: Ingo Molnar <mingo@elte.hu>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/Makefile |   1 +
 kernel/smp.c    | 383 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 384 insertions(+)
 create mode 100644 kernel/smp.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..9fa57976f252 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
diff --git a/kernel/smp.c b/kernel/smp.c
new file mode 100644
index 000000000000..f77b75c027ad
--- /dev/null
+++ b/kernel/smp.c
@@ -0,0 +1,383 @@
+/*
+ * Generic helpers for smp ipi calls
+ *
+ * (C) Jens Axboe <jens.axboe@oracle.com> 2008
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+
+static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
+static LIST_HEAD(call_function_queue);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
+
+enum {
+	CSD_FLAG_WAIT		= 0x01,
+	CSD_FLAG_ALLOC		= 0x02,
+};
+
+struct call_function_data {
+	struct call_single_data csd;
+	spinlock_t lock;
+	unsigned int refs;
+	cpumask_t cpumask;
+	struct rcu_head rcu_head;
+};
+
+struct call_single_queue {
+	struct list_head list;
+	spinlock_t lock;
+};
+
+void __cpuinit init_call_single_data(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct call_single_queue *q = &per_cpu(call_single_queue, i);
+
+		spin_lock_init(&q->lock);
+		INIT_LIST_HEAD(&q->list);
+	}
+}
+
+static void csd_flag_wait(struct call_single_data *data)
+{
+	/* Wait for response */
+	do {
+		/*
+		 * We need to see the flags store in the IPI handler
+		 */
+		smp_mb();
+		if (!(data->flags & CSD_FLAG_WAIT))
+			break;
+		cpu_relax();
+	} while (1);
+}
+
+/*
+ * Insert a previously allocated call_single_data element for execution
+ * on the given CPU. data must already have ->func, ->info, and ->flags set.
+ */
+static void generic_exec_single(int cpu, struct call_single_data *data)
+{
+	struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
+	int wait = data->flags & CSD_FLAG_WAIT, ipi;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dst->lock, flags);
+	ipi = list_empty(&dst->list);
+	list_add_tail(&data->list, &dst->list);
+	spin_unlock_irqrestore(&dst->lock, flags);
+
+	if (ipi)
+		arch_send_call_function_single_ipi(cpu);
+
+	if (wait)
+		csd_flag_wait(data);
+}
+
+static void rcu_free_call_data(struct rcu_head *head)
+{
+	struct call_function_data *data;
+
+	data = container_of(head, struct call_function_data, rcu_head);
+
+	kfree(data);
+}
+
+/*
+ * Invoked by arch to handle an IPI for call function. Must be called with
+ * interrupts disabled.
+ */
+void generic_smp_call_function_interrupt(void)
+{
+	struct call_function_data *data;
+	int cpu = get_cpu();
+
+	/*
+	 * It's ok to use list_for_each_rcu() here even though we may delete
+	 * 'pos', since list_del_rcu() doesn't clear ->next
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
+		int refs;
+
+		if (!cpu_isset(cpu, data->cpumask))
+			continue;
+
+		data->csd.func(data->csd.info);
+
+		spin_lock(&data->lock);
+		cpu_clear(cpu, data->cpumask);
+		WARN_ON(data->refs == 0);
+		data->refs--;
+		refs = data->refs;
+		spin_unlock(&data->lock);
+
+		if (refs)
+			continue;
+
+		spin_lock(&call_function_lock);
+		list_del_rcu(&data->csd.list);
+		spin_unlock(&call_function_lock);
+
+		if (data->csd.flags & CSD_FLAG_WAIT) {
+			/*
+			 * serialize stores to data with the flag clear
+			 * and wakeup
+			 */
+			smp_wmb();
+			data->csd.flags &= ~CSD_FLAG_WAIT;
+		} else
+			call_rcu(&data->rcu_head, rcu_free_call_data);
+	}
+	rcu_read_unlock();
+
+	put_cpu();
+}
+
+/*
+ * Invoked by arch to handle an IPI for call function single. Must be called
+ * from the arch with interrupts disabled.
+ */
+void generic_smp_call_function_single_interrupt(void)
+{
+	struct call_single_queue *q = &__get_cpu_var(call_single_queue);
+	LIST_HEAD(list);
+
+	/*
+	 * Need to see other stores to list head for checking whether
+	 * list is empty without holding q->lock
+	 */
+	smp_mb();
+	while (!list_empty(&q->list)) {
+		unsigned int data_flags;
+
+		spin_lock(&q->lock);
+		list_replace_init(&q->list, &list);
+		spin_unlock(&q->lock);
+
+		while (!list_empty(&list)) {
+			struct call_single_data *data;
+
+			data = list_entry(list.next, struct call_single_data,
+						list);
+			list_del(&data->list);
+
+			/*
+			 * 'data' can be invalid after this call if
+			 * flags == 0 (when called through
+			 * generic_exec_single(), so save them away before
+			 * making the call.
+			 */
+			data_flags = data->flags;
+
+			data->func(data->info);
+
+			if (data_flags & CSD_FLAG_WAIT) {
+				smp_wmb();
+				data->flags &= ~CSD_FLAG_WAIT;
+			} else if (data_flags & CSD_FLAG_ALLOC)
+				kfree(data);
+		}
+		/*
+		 * See comment on outer loop
+		 */
+		smp_mb();
+	}
+}
+
+/*
+ * smp_call_function_single - Run a function on a specific CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @retry: Unused
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code. Note that @wait
+ * will be implicitly turned on in case of allocation failures, since
+ * we fall back to on-stack allocation.
+ */
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+			     int retry, int wait)
+{
+	struct call_single_data d;
+	unsigned long flags;
+	/* prevent preemption and reschedule on another processor */
+	int me = get_cpu();
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
+	if (cpu == me) {
+		local_irq_save(flags);
+		func(info);
+		local_irq_restore(flags);
+	} else {
+		struct call_single_data *data = NULL;
+
+		if (!wait) {
+			data = kmalloc(sizeof(*data), GFP_ATOMIC);
+			if (data)
+				data->flags = CSD_FLAG_ALLOC;
+		}
+		if (!data) {
+			data = &d;
+			data->flags = CSD_FLAG_WAIT;
+		}
+
+		data->func = func;
+		data->info = info;
+		generic_exec_single(cpu, data);
+	}
+
+	put_cpu();
+	return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
+
+/**
+ * __smp_call_function_single(): Run a function on another CPU
+ * @cpu: The CPU to run on.
+ * @data: Pre-allocated and setup data structure
+ *
+ * Like smp_call_function_single(), but allow caller to pass in a pre-allocated
+ * data structure. Useful for embedding @data inside other structures, for
+ * instance.
+ *
+ */
+void __smp_call_function_single(int cpu, struct call_single_data *data)
+{
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
+
+	generic_exec_single(cpu, data);
+}
+
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned. Note that @wait
+ * will be implicitly turned on in case of allocation failures, since
+ * we fall back to on-stack allocation.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. Preemption
+ * must be disabled when calling this function.
+ */
+int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
+			   int wait)
+{
+	struct call_function_data d;
+	struct call_function_data *data = NULL;
+	cpumask_t allbutself;
+	unsigned long flags;
+	int cpu, num_cpus;
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
+	cpu = smp_processor_id();
+	allbutself = cpu_online_map;
+	cpu_clear(cpu, allbutself);
+	cpus_and(mask, mask, allbutself);
+	num_cpus = cpus_weight(mask);
+
+	/*
+	 * If zero CPUs, return. If just a single CPU, turn this request
+	 * into a targetted single call instead since it's faster.
+	 */
+	if (!num_cpus)
+		return 0;
+	else if (num_cpus == 1) {
+		cpu = first_cpu(mask);
+		return smp_call_function_single(cpu, func, info, 0, wait);
+	}
+
+	if (!wait) {
+		data = kmalloc(sizeof(*data), GFP_ATOMIC);
+		if (data)
+			data->csd.flags = CSD_FLAG_ALLOC;
+	}
+	if (!data) {
+		data = &d;
+		data->csd.flags = CSD_FLAG_WAIT;
+	}
+
+	spin_lock_init(&data->lock);
+	data->csd.func = func;
+	data->csd.info = info;
+	data->refs = num_cpus;
+	data->cpumask = mask;
+
+	spin_lock_irqsave(&call_function_lock, flags);
+	list_add_tail_rcu(&data->csd.list, &call_function_queue);
+	spin_unlock_irqrestore(&call_function_lock, flags);
+
+	/* Send a message to all CPUs in the map */
+	arch_send_call_function_ipi(mask);
+
+	/* optionally wait for the CPUs to complete */
+	if (wait)
+		csd_flag_wait(&data->csd);
+
+	return 0;
+}
+EXPORT_SYMBOL(smp_call_function_mask);
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @natomic: Unused
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func. In case of allocation
+ * failure, @wait will be implicitly turned on.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(void (*func)(void *), void *info, int natomic, int wait)
+{
+	int ret;
+
+	preempt_disable();
+	ret = smp_call_function_mask(cpu_online_map, func, info, wait);
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL(smp_call_function);
+
+void ipi_call_lock(void)
+{
+	spin_lock(&call_function_lock);
+}
+
+void ipi_call_unlock(void)
+{
+	spin_unlock(&call_function_lock);
+}
+
+void ipi_call_lock_irq(void)
+{
+	spin_lock_irq(&call_function_lock);
+}
+
+void ipi_call_unlock_irq(void)
+{
+	spin_unlock_irq(&call_function_lock);
+}
-- 
cgit v1.2.3


From 8691e5a8f691cc2a4fda0651e8d307aaba0e7d68 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 6 Jun 2008 11:18:06 +0200
Subject: smp_call_function: get rid of the unused nonatomic/retry argument

It's never used and the comments refer to nonatomic and retry
interchangably. So get rid of it.

Acked-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/smp.c                 | 6 ++----
 kernel/softirq.c             | 2 +-
 kernel/time/tick-broadcast.c | 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index f77b75c027ad..7e0432a4a0e2 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -195,7 +195,6 @@ void generic_smp_call_function_single_interrupt(void)
  * smp_call_function_single - Run a function on a specific CPU
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
- * @retry: Unused
  * @wait: If true, wait until function has completed on other CPUs.
  *
  * Returns 0 on success, else a negative status code. Note that @wait
@@ -203,7 +202,7 @@ void generic_smp_call_function_single_interrupt(void)
  * we fall back to on-stack allocation.
  */
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-			     int retry, int wait)
+			     int wait)
 {
 	struct call_single_data d;
 	unsigned long flags;
@@ -339,7 +338,6 @@ EXPORT_SYMBOL(smp_call_function_mask);
  * smp_call_function(): Run a function on all other CPUs.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
- * @natomic: Unused
  * @wait: If true, wait (atomically) until function has completed on other CPUs.
  *
  * Returns 0 on success, else a negative status code.
@@ -351,7 +349,7 @@ EXPORT_SYMBOL(smp_call_function_mask);
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
-int smp_call_function(void (*func)(void *), void *info, int natomic, int wait)
+int smp_call_function(void (*func)(void *), void *info, int wait)
 {
 	int ret;
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..d73afb4764ef 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -679,7 +679,7 @@ int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
 	int ret = 0;
 
 	preempt_disable();
-	ret = smp_call_function(func, info, retry, wait);
+	ret = smp_call_function(func, info, wait);
 	local_irq_disable();
 	func(info);
 	local_irq_enable();
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 57a1f02e5ec0..75e718539dcb 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -266,7 +266,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
 		       "offline CPU #%d\n", *oncpu);
 	else
 		smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
-					 &reason, 1, 1);
+					 &reason, 1);
 }
 
 /*
-- 
cgit v1.2.3


From 15c8b6c1aaaf1c4edd67e2f02e4d8e1bd1a51c0d Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 9 May 2008 09:39:44 +0200
Subject: on_each_cpu(): kill unused 'retry' parameter

It's not even passed on to smp_call_function() anymore, since that
was removed. So kill it.

Acked-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/hrtimer.c  | 2 +-
 kernel/profile.c  | 6 +++---
 kernel/rcupdate.c | 2 +-
 kernel/softirq.c  | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..50e8616d7955 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -623,7 +623,7 @@ static void retrigger_next_event(void *arg)
 void clock_was_set(void)
 {
 	/* Retrigger the CPU local events everywhere */
-	on_each_cpu(retrigger_next_event, NULL, 0, 1);
+	on_each_cpu(retrigger_next_event, NULL, 1);
 }
 
 /*
diff --git a/kernel/profile.c b/kernel/profile.c
index ae7ead82cbc9..58926411eb2a 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -252,7 +252,7 @@ static void profile_flip_buffers(void)
 	mutex_lock(&profile_flip_mutex);
 	j = per_cpu(cpu_profile_flip, get_cpu());
 	put_cpu();
-	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
+	on_each_cpu(__profile_flip_buffers, NULL, 1);
 	for_each_online_cpu(cpu) {
 		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
 		for (i = 0; i < NR_PROFILE_HIT; ++i) {
@@ -275,7 +275,7 @@ static void profile_discard_flip_buffers(void)
 	mutex_lock(&profile_flip_mutex);
 	i = per_cpu(cpu_profile_flip, get_cpu());
 	put_cpu();
-	on_each_cpu(__profile_flip_buffers, NULL, 0, 1);
+	on_each_cpu(__profile_flip_buffers, NULL, 1);
 	for_each_online_cpu(cpu) {
 		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
 		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
@@ -558,7 +558,7 @@ static int __init create_hash_tables(void)
 out_cleanup:
 	prof_on = 0;
 	smp_mb();
-	on_each_cpu(profile_nop, NULL, 0, 1);
+	on_each_cpu(profile_nop, NULL, 1);
 	for_each_online_cpu(cpu) {
 		struct page *page;
 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c09605f8d16c..6addab5e6d88 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -127,7 +127,7 @@ void rcu_barrier(void)
 	 * until all the callbacks are queued.
 	 */
 	rcu_read_lock();
-	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+	on_each_cpu(rcu_barrier_func, NULL, 1);
 	rcu_read_unlock();
 	wait_for_completion(&rcu_barrier_completion);
 	mutex_unlock(&rcu_barrier_mutex);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d73afb4764ef..c159fd094772 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -674,7 +674,7 @@ __init int spawn_ksoftirqd(void)
 /*
  * Call a function on all processors
  */
-int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait)
+int on_each_cpu(void (*func) (void *info), void *info, int wait)
 {
 	int ret = 0;
 
-- 
cgit v1.2.3


From ce0d1b6f73870878aae622b72e85fe8f7a16b51c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 27 Jun 2008 11:50:32 +0200
Subject: fix: "smp_call_function: get rid of the unused nonatomic/retry
 argument"

fix:

kernel/smp.c: In function 'smp_call_function_mask':
kernel/smp.c:303: error: too many arguments to function 'smp_call_function_single'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 7e0432a4a0e2..4f582b257eba 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -300,7 +300,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
 		return 0;
 	else if (num_cpus == 1) {
 		cpu = first_cpu(mask);
-		return smp_call_function_single(cpu, func, info, 0, wait);
+		return smp_call_function_single(cpu, func, info, wait);
 	}
 
 	if (!wait) {
-- 
cgit v1.2.3


From bf647b62fdb948e757a7b4d18d4f16e3c763b1d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:10 +0200
Subject: sched: clean up some unused variables

In file included from /mnt/build/linux-2.6/kernel/sched.c:1496:
/mnt/build/linux-2.6/kernel/sched_rt.c: In function '__enable_runtime':
/mnt/build/linux-2.6/kernel/sched_rt.c:339: warning: unused variable 'rd'
/mnt/build/linux-2.6/kernel/sched_rt.c: In function 'requeue_rt_entity':
/mnt/build/linux-2.6/kernel/sched_rt.c:692: warning: unused variable 'queue'

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bd90c8bb0739..6b4a6b5a4167 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -336,7 +336,6 @@ static void disable_runtime(struct rq *rq)
 
 static void __enable_runtime(struct rq *rq)
 {
-	struct root_domain *rd = rq->rd;
 	struct rt_rq *rt_rq;
 
 	if (unlikely(!scheduler_running))
@@ -689,7 +688,6 @@ static
 void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
 {
 	struct rt_prio_array *array = &rt_rq->active;
-	struct list_head *queue = array->queue + rt_se_prio(rt_se);
 
 	if (on_rt_rq(rt_se)) {
 		list_del_init(&rt_se->run_list);
-- 
cgit v1.2.3


From a7be37ac8e1565e00880531f4e2aff421a21c803 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:11 +0200
Subject: sched: revert the revert of: weight calculations

Try again..

initial commit: 8f1bc385cfbab474db6c27b5af1e439614f3025c
revert: f9305d4a0968201b2818dbed0dc8cb0d4ee7aeb3

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          |   9 ++---
 kernel/sched_fair.c     | 105 +++++++++++++++++++++++++++++++++---------------
 kernel/sched_features.h |   1 +
 3 files changed, 76 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c51d9fae8cd8..f653af684fb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1342,6 +1342,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
+/*
+ * delta *= weight / lw
+ */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -1369,12 +1372,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
-static inline unsigned long
-calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
-{
-	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
-}
-
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1fe4c65a8170..496500988ce5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 }
 #endif
 
+/*
+ * delta *= w / rw
+ */
+static inline unsigned long
+calc_delta_weight(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				se->load.weight, &cfs_rq_of(se)->load);
+	}
+
+	return delta;
+}
+
+/*
+ * delta *= rw / w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, &se->load);
+	}
+
+	return delta;
+}
+
 /*
  * The idea is to set a period in which each task runs once.
  *
@@ -362,47 +390,54 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running);
-
-	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
-
-		slice *= se->load.weight;
-		do_div(slice, cfs_rq->load.weight);
-	}
-
-
-	return slice;
+	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
 }
 
 /*
  * We calculate the vruntime slice of a to be inserted task
  *
- * vs = s/w = p/rw
+ * vs = s*rw/w = p
  */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned long nr_running = cfs_rq->nr_running;
-	unsigned long weight;
-	u64 vslice;
 
 	if (!se->on_rq)
 		nr_running++;
 
-	vslice = __sched_period(nr_running);
+	return __sched_period(nr_running);
+}
+
+/*
+ * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
+ * that it favours >=0 over <0.
+ *
+ *   -20         |
+ *               |
+ *     0 --------+-------
+ *             .'
+ *    19     .'
+ *
+ */
+static unsigned long
+calc_delta_asym(unsigned long delta, struct sched_entity *se)
+{
+	struct load_weight lw = {
+		.weight = NICE_0_LOAD,
+		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
+	};
 
 	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
+		struct load_weight *se_lw = &se->load;
 
-		weight = cfs_rq->load.weight;
-		if (!se->on_rq)
-			weight += se->load.weight;
+		if (se->load.weight < NICE_0_LOAD)
+			se_lw = &lw;
 
-		vslice *= NICE_0_LOAD;
-		do_div(vslice, weight);
+		delta = calc_delta_mine(delta,
+				cfs_rq_of(se)->load.weight, se_lw);
 	}
 
-	return vslice;
+	return delta;
 }
 
 /*
@@ -419,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
-	delta_exec_weighted = delta_exec;
-	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
-		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
-							&curr->load);
-	}
+	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 	curr->vruntime += delta_exec_weighted;
 }
 
@@ -609,8 +640,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			unsigned long thresh = sysctl_sched_latency;
+
+			/*
+			 * convert the sleeper threshold into virtual time
+			 */
+			if (sched_feat(NORMALIZED_SLEEPER))
+				thresh = calc_delta_fair(thresh, se);
+
+			vruntime -= thresh;
+		}
 
 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1111,11 +1151,10 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
 	/*
-	 * More easily preempt - nice tasks, while not making
-	 * it harder for + nice tasks.
+	 * More easily preempt - nice tasks, while not making it harder for
+	 * + nice tasks.
 	 */
-	if (unlikely(se->load.weight > NICE_0_LOAD))
-		gran = calc_delta_fair(gran, &se->load);
+	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 62b39ca92ebd..afa549166d8d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,5 @@
 SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
+SCHED_FEAT(NORMALIZED_SLEEPER, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 SCHED_FEAT(START_DEBIT, 1)
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
-- 
cgit v1.2.3


From c9c294a630e28eec5f2865f028ecfc58d45c0a5a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:12 +0200
Subject: sched: fix calc_delta_asym()

calc_delta_asym() is supposed to do the same as calc_delta_fair() except
linearly shrink the result for negative nice processes - this causes them
to have a smaller preemption threshold so that they are more easily preempted.

The problem is that for task groups se->load.weight is the per cpu share of
the actual task group weight; take that into account.

Also provide a debug switch to disable the asymmetry (which I still don't
like - but it does greatly benefit some workloads)

This would explain the interactivity issues reported against group scheduling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 28 +++++++++++++++++++++++++++-
 kernel/sched_features.h |  1 +
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 496500988ce5..2268e634812b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -430,6 +430,29 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se)
 	for_each_sched_entity(se) {
 		struct load_weight *se_lw = &se->load;
 
+#ifdef CONFIG_FAIR_SCHED_GROUP
+		struct cfs_rq *cfs_rq = se->my_q;
+		struct task_group *tg = NULL
+
+		if (cfs_rq)
+			tg = cfs_rq->tg;
+
+		if (tg && tg->shares < NICE_0_LOAD) {
+			/*
+			 * scale shares to what it would have been had
+			 * tg->weight been NICE_0_LOAD:
+			 *
+			 *   weight = 1024 * shares / tg->weight
+			 */
+			lw.weight *= se->load.weight;
+			lw.weight /= tg->shares;
+
+			lw.inv_weight = 0;
+
+			se_lw = &lw;
+		} else
+#endif
+
 		if (se->load.weight < NICE_0_LOAD)
 			se_lw = &lw;
 
@@ -1154,7 +1177,10 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * More easily preempt - nice tasks, while not making it harder for
 	 * + nice tasks.
 	 */
-	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+	if (sched_feat(ASYM_GRAN))
+		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+	else
+		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
 
 	return gran;
 }
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index afa549166d8d..04123af2e678 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -7,3 +7,4 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
+SCHED_FEAT(ASYM_GRAN, 1)
-- 
cgit v1.2.3


From ced8aa16e1db55c33c507174c1b1f9e107445865 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:13 +0200
Subject: sched: fix calc_delta_asym, #2

Ok, so why are we in this mess, it was:

  1/w

but now we mixed that rw in the mix like:

 rw/w

rw being \Sum w suggests: fiddling w, we should also fiddle rw, humm?

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2268e634812b..2e197b8e43f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -429,6 +429,7 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se)
 
 	for_each_sched_entity(se) {
 		struct load_weight *se_lw = &se->load;
+		unsigned long rw = cfs_rq_of(se)->load.weight;
 
 #ifdef CONFIG_FAIR_SCHED_GROUP
 		struct cfs_rq *cfs_rq = se->my_q;
@@ -450,14 +451,16 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se)
 			lw.inv_weight = 0;
 
 			se_lw = &lw;
+			rw += lw.weight - se->load.weight;
 		} else
 #endif
 
-		if (se->load.weight < NICE_0_LOAD)
+		if (se->load.weight < NICE_0_LOAD) {
 			se_lw = &lw;
+			rw += NICE_0_LOAD - se->load.weight;
+		}
 
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, se_lw);
+		delta = calc_delta_mine(delta, rw, se_lw);
 	}
 
 	return delta;
-- 
cgit v1.2.3


From c09595f63bb1909c5dc4dca288f4fe818561b5f3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:14 +0200
Subject: sched: revert revert of: fair-group: SMP-nice for group scheduling

Try again..

Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e
Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 430 +++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched_debug.c |   5 +
 kernel/sched_fair.c  | 124 +++++++++------
 kernel/sched_rt.c    |   4 +
 4 files changed, 488 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f653af684fb3..874b6da15430 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -403,6 +403,43 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	unsigned long task_weight;
+	unsigned long shares;
+	/*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree, in order to avoid depending on dynamic memory allocation
+	 * during the load balancing we place this in the per cpu task group
+	 * hierarchy. This limits the load balancing to one instance per cpu,
+	 * but more should not be needed anyway.
+	 */
+	struct aggregate_struct {
+		/*
+		 *   load = weight(cpus) * f(tg)
+		 *
+		 * Where f(tg) is the recursive weight fraction assigned to
+		 * this group.
+		 */
+		unsigned long load;
+
+		/*
+		 * part of the group weight distributed to this span.
+		 */
+		unsigned long shares;
+
+		/*
+		 * The sum of all runqueue weights within this span.
+		 */
+		unsigned long rq_weight;
+
+		/*
+		 * Weight contributed by tasks; this is the part we can
+		 * influence by moving tasks around.
+		 */
+		unsigned long task_weight;
+	} aggregate;
+#endif
 #endif
 };
 
@@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ *         root          1 - thread
+ *         / | \         A - group
+ *        A  1  B
+ *       /|\   / \
+ *      C 2 D 3   4
+ *      |   |
+ *      5   6
+ *
+ * load:
+ *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ *    which equals 1/9-th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+			 struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long task_weight = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		task_weight += tg->cfs_rq[i]->task_weight;
+	}
+
+	aggregate(tg, sd)->rq_weight = rq_weight;
+	aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span)
+		shares += tg->cfs_rq[i]->shares;
+
+	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		int i;
+
+		load = 0;
+		for_each_cpu_mask(i, sd->span)
+			load += cpu_rq(i)->load.weight;
+
+	} else {
+		load = aggregate(tg->parent, sd)->load;
+
+		/*
+		 * shares is our weight in the parent's rq so
+		 * shares/parent->rq_weight gives our fraction of the load
+		 */
+		load *= aggregate(tg, sd)->shares;
+		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+	}
+
+	aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+			  int tcpu)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[tcpu])
+		return;
+
+	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = aggregate(tg, sd)->shares * rq_weight;
+	shares /= aggregate(tg, sd)->rq_weight + 1;
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
+
+	__set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		    int scpu, int dcpu)
+{
+	unsigned long shares;
+
+	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+	__update_group_shares_cpu(tg, sd, scpu);
+	__update_group_shares_cpu(tg, sd, dcpu);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+	if (shares)
+		tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		  int scpu, int dcpu)
+{
+	while (tg) {
+		__move_group_shares(tg, sd, scpu, dcpu);
+		tg = tg->parent;
+	}
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = aggregate(tg, sd)->shares;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, sd, i);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	aggregate_group_shares(tg, sd);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= aggregate(tg, sd)->shares;
+	if (shares) {
+		tg->cfs_rq[sd->first_cpu]->shares += shares;
+		aggregate(tg, sd)->shares += shares;
+	}
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_weight(tg, sd);
+	aggregate_group_shares(tg, sd);
+	aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+		return 0;
+
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+	cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
 #endif
 
 #include "sched_stats.h"
@@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
+	unlock_aggregate = get_aggregate(sd);
+
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3383,8 +3731,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3399,8 +3748,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (unlock_aggregate)
+		put_aggregate(sd);
+	return ld_moved;
 }
 
 /*
@@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
+			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7757,6 +8113,7 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
+	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif /* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8e077b9c91cb..04394ccac88d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+#endif
+#endif
 }
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2e197b8e43f1..183388c4dead 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -580,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
-
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
+
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		imbalance = (busiest_weight - this_weight) / 2;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-					       maxload, sd, idle, all_pinned,
-					       this_best_prio,
-					       &cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 6b4a6b5a4167..765932d0399d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 		rt_se->timeout = 0;
 
 	enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
-- 
cgit v1.2.3


From 76a2a6ee8a0660a29127f05989ac59ae1ce865fa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:15 +0200
Subject: sched: sched_clock_cpu() based cpu_clock()

with sched_clock_cpu() being reasonably in sync between cpus (max 1 jiffy
difference) use this to provide cpu_clock().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 76 ----------------------------------------------------
 kernel/sched_clock.c | 12 +++++++++
 2 files changed, 12 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 874b6da15430..eb3454c410fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -818,82 +818,6 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-unsigned long long time_sync_thresh = 100000;
-
-static DEFINE_PER_CPU(unsigned long long, time_offset);
-static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
-
-/*
- * Global lock which we take every now and then to synchronize
- * the CPUs time. This method is not warp-safe, but it's good
- * enough to synchronize slowly diverging time sources and thus
- * it's good enough for tracing:
- */
-static DEFINE_SPINLOCK(time_sync_lock);
-static unsigned long long prev_global_time;
-
-static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
-{
-	/*
-	 * We want this inlined, to not get tracer function calls
-	 * in this critical section:
-	 */
-	spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
-	__raw_spin_lock(&time_sync_lock.raw_lock);
-
-	if (time < prev_global_time) {
-		per_cpu(time_offset, cpu) += prev_global_time - time;
-		time = prev_global_time;
-	} else {
-		prev_global_time = time;
-	}
-
-	__raw_spin_unlock(&time_sync_lock.raw_lock);
-	spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
-
-	return time;
-}
-
-static unsigned long long __cpu_clock(int cpu)
-{
-	unsigned long long now;
-
-	/*
-	 * Only call sched_clock() if the scheduler has already been
-	 * initialized (some code might call cpu_clock() very early):
-	 */
-	if (unlikely(!scheduler_running))
-		return 0;
-
-	now = sched_clock_cpu(cpu);
-
-	return now;
-}
-
-/*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
- */
-unsigned long long cpu_clock(int cpu)
-{
-	unsigned long long prev_cpu_time, time, delta_time;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	prev_cpu_time = per_cpu(prev_cpu_time, cpu);
-	time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
-	delta_time = time-prev_cpu_time;
-
-	if (unlikely(delta_time > time_sync_thresh)) {
-		time = __sync_cpu_clock(time, cpu);
-		per_cpu(prev_cpu_time, cpu) = time;
-	}
-	local_irq_restore(flags);
-
-	return time;
-}
-EXPORT_SYMBOL_GPL(cpu_clock);
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ce05271219ab..3c696db59452 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -244,3 +244,15 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 {
 	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 }
+
+unsigned long long cpu_clock(int cpu)
+{
+	unsigned long long clock;
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
+	clock = sched_clock_cpu(cpu);
+	raw_local_irq_restore(flags);
+
+	return clock;
+}
-- 
cgit v1.2.3


From 103638d95ba5b0c53c8d9c0cb581156ccc8513ee Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:16 +0200
Subject: sched: fix wakeup granularity and buddy granularity

Uncouple buddy selection from wakeup granularity.

The initial idea was that buddies could run ahead as far as a normal task
can - do this by measuring a pair 'slice' just as we do for a normal task.

This means we can drop the wakeup_granularity back to 5ms.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  1 +
 kernel/sched_fair.c | 15 +++++++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index eb3454c410fa..7d282c52bd42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -375,6 +375,7 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
+	u64 pair_start;
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 183388c4dead..509092af0330 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
 
 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
@@ -813,17 +813,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
 static struct sched_entity *
 pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (!cfs_rq->next)
-		return se;
+	struct rq *rq = rq_of(cfs_rq);
+	u64 pair_slice = rq->clock - cfs_rq->pair_start;
 
-	if (wakeup_preempt_entity(cfs_rq->next, se) != 0)
+	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+		cfs_rq->pair_start = rq->clock;
 		return se;
+	}
 
 	return cfs_rq->next;
 }
-- 
cgit v1.2.3


From 32df2ee86a580f70f2dbb90cf81f413aa655f838 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:17 +0200
Subject: sched: add full schedstats to /proc/sched_debug

show all the schedstats in /debug/sched_debug as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 04394ccac88d..bbe6b31c3c56 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,8 +162,23 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SCHEDSTATS
-	SEQ_printf(m, "  .%-30s: %d\n", "bkl_count",
-			rq->bkl_count);
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+
+	P(yld_exp_empty);
+	P(yld_act_empty);
+	P(yld_both_empty);
+	P(yld_count);
+
+	P(sched_switch);
+	P(sched_count);
+	P(sched_goidle);
+
+	P(ttwu_count);
+	P(ttwu_local);
+
+	P(bkl_count);
+
+#undef P
 #endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
-- 
cgit v1.2.3


From b6a86c746f5b708012809958462234d19e9c8177 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:18 +0200
Subject: sched: fix sched_domain aggregation

Keeping the aggregate on the first cpu of the sched domain has two problems:
 - it could collide between different sched domains on different cpus
 - it could slow things down because of the remote accesses

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 113 +++++++++++++++++++++++++---------------------------
 kernel/sched_fair.c |  12 +++---
 2 files changed, 60 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7d282c52bd42..160d3c209b8f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  */
 
 static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
+aggregate(struct task_group *tg, int cpu)
 {
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+	return &tg->cfs_rq[cpu]->aggregate;
 }
 
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
@@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
  */
 static
 void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
+			 int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, sd);
+	(*down)(parent, cpu, sd);
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1508,7 +1508,7 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, sd);
+	(*up)(parent, cpu, sd);
 
 	child = parent;
 	parent = parent->parent;
@@ -1520,8 +1520,8 @@ up:
 /*
  * Calculate the aggregate runqueue weight.
  */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long rq_weight = 0;
 	unsigned long task_weight = 0;
@@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
 		task_weight += tg->cfs_rq[i]->task_weight;
 	}
 
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
+	aggregate(tg, cpu)->rq_weight = rq_weight;
+	aggregate(tg, cpu)->task_weight = task_weight;
 }
 
 /*
  * Compute the weight of this group on the given cpus.
  */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long shares = 0;
 	int i;
@@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
 	for_each_cpu_mask(i, sd->span)
 		shares += tg->cfs_rq[i]->shares;
 
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
 		shares = tg->shares;
 
-	aggregate(tg, sd)->shares = shares;
+	aggregate(tg, cpu)->shares = shares;
 }
 
 /*
  * Compute the load fraction assigned to this group, relies on the aggregate
  * weight and this group's parent's load, i.e. top-down.
  */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long load;
 
@@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
 			load += cpu_rq(i)->load.weight;
 
 	} else {
-		load = aggregate(tg->parent, sd)->load;
+		load = aggregate(tg->parent, cpu)->load;
 
 		/*
 		 * shares is our weight in the parent's rq so
 		 * shares/parent->rq_weight gives our fraction of the load
 		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+		load *= aggregate(tg, cpu)->shares;
+		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
 	}
 
-	aggregate(tg, sd)->load = load;
+	aggregate(tg, cpu)->load = load;
 }
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
+__update_group_shares_cpu(struct task_group *tg, int cpu,
+			  struct sched_domain *sd, int tcpu)
 {
 	int boost = 0;
 	unsigned long shares;
@@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
+	shares = aggregate(tg, cpu)->shares * rq_weight;
+	shares /= aggregate(tg, cpu)->rq_weight + 1;
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
@@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
  * task went to.
  */
 static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		    int scpu, int dcpu)
 {
 	unsigned long shares;
 
 	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
 
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
+	__update_group_shares_cpu(tg, cpu, sd, scpu);
+	__update_group_shares_cpu(tg, cpu, sd, dcpu);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
@@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd,
  * we need to walk up the tree and change all shares until we hit the root.
  */
 static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
+move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		  int scpu, int dcpu)
 {
 	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
+		__move_group_shares(tg, cpu, sd, scpu, dcpu);
 		tg = tg->parent;
 	}
 }
 
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	unsigned long shares = aggregate(tg, sd)->shares;
+	unsigned long shares = aggregate(tg, cpu)->shares;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
+		__update_group_shares_cpu(tg, cpu, sd, i);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-	aggregate_group_shares(tg, sd);
+	aggregate_group_shares(tg, cpu, sd);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
 	 * above redistribution.
 	 */
-	shares -= aggregate(tg, sd)->shares;
+	shares -= aggregate(tg, cpu)->shares;
 	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
+		tg->cfs_rq[cpu]->shares += shares;
+		aggregate(tg, cpu)->shares += shares;
 	}
 }
 
@@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
  * Calculate the accumulative weight and recursive load of each task group
  * while walking down the tree.
  */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
+	aggregate_group_weight(tg, cpu, sd);
+	aggregate_group_shares(tg, cpu, sd);
+	aggregate_group_load(tg, cpu, sd);
 }
 
 /*
  * Rebalance the cpu shares while walking back up the tree.
  */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_set_shares(tg, sd);
+	aggregate_group_set_shares(tg, cpu, sd);
 }
 
 static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
@@ -1731,18 +1731,18 @@ static void __init init_aggregate(void)
 		spin_lock_init(&per_cpu(aggregate_lock, i));
 }
 
-static int get_aggregate(struct sched_domain *sd)
+static int get_aggregate(int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
 		return 0;
 
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
 	return 1;
 }
 
-static void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(int cpu, struct sched_domain *sd)
 {
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+	spin_unlock(&per_cpu(aggregate_lock, cpu));
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1756,12 +1756,12 @@ static inline void init_aggregate(void)
 {
 }
 
-static inline int get_aggregate(struct sched_domain *sd)
+static inline int get_aggregate(int cpu, struct sched_domain *sd)
 {
 	return 0;
 }
 
-static inline void put_aggregate(struct sched_domain *sd)
+static inline void put_aggregate(int cpu, struct sched_domain *sd)
 {
 }
 #endif
@@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
+	unlock_aggregate = get_aggregate(this_cpu, sd);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3678,7 +3678,7 @@ out_one_pinned:
 		ld_moved = 0;
 out:
 	if (unlock_aggregate)
-		put_aggregate(sd);
+		put_aggregate(this_cpu, sd);
 	return ld_moved;
 }
 
@@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
-			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 509092af0330..40cf24ab4de8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		/*
 		 * empty group
 		 */
-		if (!aggregate(tg, sd)->task_weight)
+		if (!aggregate(tg, this_cpu)->task_weight)
 			continue;
 
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
+		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
+		rem_load /= aggregate(tg, this_cpu)->load + 1;
 
 		this_weight = tg->cfs_rq[this_cpu]->task_weight;
 		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
@@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		move_group_shares(tg, sd, busiest_cpu, this_cpu);
+		move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		moved_load *= aggregate(tg, this_cpu)->load;
+		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3


From 4d8d595dfa69e1c807bf928f364668a7f30da5dc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:19 +0200
Subject: sched: update aggregate when holding the RQs

It was observed that in __update_group_shares_cpu()

  rq_weight > aggregate()->rq_weight

This is caused by forks/wakeups in between the initial aggregate pass and
locking of the RQs for load balance. To avoid this situation partially re-do
the aggregation once we have the RQs locked (which avoids new tasks from
appearing).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 160d3c209b8f..dae20199dc9c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1721,6 +1721,11 @@ aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 	aggregate_group_set_shares(tg, cpu, sd);
 }
 
+static void
+aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
+{
+}
+
 static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
 
 static void __init init_aggregate(void)
@@ -1740,6 +1745,11 @@ static int get_aggregate(int cpu, struct sched_domain *sd)
 	return 1;
 }
 
+static void update_aggregate(int cpu, struct sched_domain *sd)
+{
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+}
+
 static void put_aggregate(int cpu, struct sched_domain *sd)
 {
 	spin_unlock(&per_cpu(aggregate_lock, cpu));
@@ -1761,6 +1771,10 @@ static inline int get_aggregate(int cpu, struct sched_domain *sd)
 	return 0;
 }
 
+static inline void update_aggregate(int cpu, struct sched_domain *sd)
+{
+}
+
 static inline void put_aggregate(int cpu, struct sched_domain *sd)
 {
 }
@@ -2192,6 +2206,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
+	/*
+	 * now that we have both rqs locked the rq weight won't change
+	 * anymore - so update the stats.
+	 */
+	update_aggregate(this_cpu, sd);
+
 	do {
 		unsigned long load, avg_load;
 		int local_group;
-- 
cgit v1.2.3


From 53fecd8ae1900fb571086f54f664051004665b55 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 27 Jun 2008 13:41:20 +0200
Subject: sched: kill task_group balancing

The idea was to balance groups until we've reached the global goal, however
Vatsa rightly pointed out that we might never reach that goal this way -
hence take out this logic.

[ the initial rationale for this 'feature' was to promote max concurrency
  within a group - it does not however affect fairness ]

Reported-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 40cf24ab4de8..b10c0d61a2a9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1422,9 +1422,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	rcu_read_lock();
 	list_for_each_entry(tg, &task_groups, list) {
-		long imbalance;
-		unsigned long this_weight, busiest_weight;
-		long rem_load, max_load, moved_load;
+		long rem_load, moved_load;
 
 		/*
 		 * empty group
@@ -1435,17 +1433,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
 		rem_load /= aggregate(tg, this_cpu)->load + 1;
 
-		this_weight = tg->cfs_rq[this_cpu]->task_weight;
-		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
-
-		imbalance = (busiest_weight - this_weight) / 2;
-
-		if (imbalance < 0)
-			imbalance = busiest_weight;
-
-		max_load = max(rem_load, imbalance);
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-				max_load, sd, idle, all_pinned, this_best_prio,
+				rem_load, sd, idle, all_pinned, this_best_prio,
 				tg->cfs_rq[busiest_cpu]);
 
 		if (!moved_load)
-- 
cgit v1.2.3


From d3f40dbab954d83383b6a516582d5c09cc216dcc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:21 +0200
Subject: sched: dont micro manage share losses

We used to try and contain the loss of 'shares' by playing arithmetic
games. Replace that by noticing that at the top sched_domain we'll
always have the full weight in shares to distribute.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dae20199dc9c..28229c5d4983 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1551,6 +1551,9 @@ aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
 		shares = tg->shares;
 
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
+
 	aggregate(tg, cpu)->shares = shares;
 }
 
@@ -1642,20 +1645,8 @@ static void
 __move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		    int scpu, int dcpu)
 {
-	unsigned long shares;
-
-	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
 	__update_group_shares_cpu(tg, cpu, sd, scpu);
 	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-	if (shares)
-		tg->cfs_rq[dcpu]->shares += shares;
 }
 
 /*
@@ -1675,7 +1666,6 @@ move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 static void
 aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	unsigned long shares = aggregate(tg, cpu)->shares;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1688,16 +1678,6 @@ aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *
 	}
 
 	aggregate_group_shares(tg, cpu, sd);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= aggregate(tg, cpu)->shares;
-	if (shares) {
-		tg->cfs_rq[cpu]->shares += shares;
-		aggregate(tg, cpu)->shares += shares;
-	}
 }
 
 /*
-- 
cgit v1.2.3


From a25b5aca8740ea99d5e18dfc71235a52b685dcf7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:22 +0200
Subject: sched: no need to aggregate task_weight

We only need to know the task_weight of the busiest rq - nothing to do
if there are no tasks there.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 16 +---------------
 kernel/sched_fair.c |  2 +-
 2 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 28229c5d4983..716cfc8e099e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -433,12 +433,6 @@ struct cfs_rq {
 		 * The sum of all runqueue weights within this span.
 		 */
 		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
 	} aggregate;
 #endif
 #endif
@@ -1473,10 +1467,6 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  * rq_weight:
  *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
  *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
  */
 
 static inline struct aggregate_struct *
@@ -1524,16 +1514,12 @@ static void
 aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
 	int i;
 
-	for_each_cpu_mask(i, sd->span) {
+	for_each_cpu_mask(i, sd->span)
 		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
 
 	aggregate(tg, cpu)->rq_weight = rq_weight;
-	aggregate(tg, cpu)->task_weight = task_weight;
 }
 
 /*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b10c0d61a2a9..03b9fbd9d648 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1427,7 +1427,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		/*
 		 * empty group
 		 */
-		if (!aggregate(tg, this_cpu)->task_weight)
+		if (!tg->cfs_rq[busiest_cpu]->task_weight)
 			continue;
 
 		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
-- 
cgit v1.2.3


From c8cba857b4997d5b00451d01474638f6a153f713 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:23 +0200
Subject: sched: simplify the group load balancer

While thinking about the previous patch - I realized that using per domain
aggregate load values in load_balance_fair() is wrong. We should use the
load value for that CPU.

By not needing per domain hierarchical load values we don't need to store
per domain aggregate shares, which greatly simplifies all the math.

It basically falls apart in two separate computations:
 - per domain update of the shares
 - per CPU update of the hierarchical load

Also get rid of the move_group_shares() stuff - just re-compute the shares
again after a successful load balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 286 ++++++++++++----------------------------------------
 kernel/sched_fair.c |  15 +--
 2 files changed, 72 insertions(+), 229 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 716cfc8e099e..f864b751fd19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
 	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
+	 * the part of load.weight contributed by tasks
 	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
+	unsigned long task_weight;
 
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
 
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
 #endif
 #endif
 };
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 int cpu, struct sched_domain *sd)
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
@@ -1507,72 +1463,6 @@ up:
 	rcu_read_unlock();
 }
 
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	aggregate(tg, cpu)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-
-	aggregate(tg, cpu)->load = load;
-}
-
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
 
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
 
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
  */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		    int scpu, int dcpu)
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-}
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
 
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
 	}
-}
 
-static void
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	int i;
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-
-	aggregate_group_shares(tg, cpu, sd);
 }
 
 /*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
  */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+	unsigned long load;
 
-static void __init init_aggregate(void)
-{
-	int i;
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
 
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
+	tg->cfs_rq[cpu]->h_load = load;
 }
 
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
 
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #else
 
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
+static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
-
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(this_cpu, sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3663,8 +3506,8 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(this_cpu, sd);
+	if (ld_moved)
+		update_shares(sd);
 	return ld_moved;
 }
 
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 03b9fbd9d648..7b8d664d6f22 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1421,17 +1421,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct task_group *tg;
 
 	rcu_read_lock();
+	update_h_load(busiest_cpu);
+
 	list_for_each_entry(tg, &task_groups, list) {
+		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		long rem_load, moved_load;
 
 		/*
 		 * empty group
 		 */
-		if (!tg->cfs_rq[busiest_cpu]->task_weight)
+		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
-		rem_load /= aggregate(tg, this_cpu)->load + 1;
+		rem_load = rem_load_move * busiest_cfs_rq->load.weight;
+		rem_load /= busiest_cfs_rq->h_load + 1;
 
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1440,10 +1443,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);
-
-		moved_load *= aggregate(tg, this_cpu)->load;
-		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
+		moved_load *= busiest_cfs_rq->h_load;
+		moved_load /= busiest_cfs_rq->load.weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3


From 3e5459b4bea3ca2618cc02d56d12639f2cba531d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:24 +0200
Subject: sched: fix newidle smp group balancing

Re-compute the shares on newidle - so we can make a decision based on
recent data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f864b751fd19..cdd09462fc98 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1579,6 +1579,13 @@ static void update_shares(struct sched_domain *sd)
 	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
+static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+	spin_unlock(&rq->lock);
+	update_shares(sd);
+	spin_lock(&rq->lock);
+}
+
 static void update_h_load(int cpu)
 {
 	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
@@ -1595,6 +1602,10 @@ static inline void update_shares(struct sched_domain *sd)
 {
 }
 
+static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
+{
+}
+
 #endif
 
 #endif
@@ -3543,6 +3554,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
 
 	schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
 redo:
+	update_shares_locked(this_rq, sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
 				   &sd_idle, cpus, NULL);
 	if (!group) {
@@ -3586,6 +3598,7 @@ redo:
 	} else
 		sd->nr_balance_failed = 0;
 
+	update_shares_locked(this_rq, sd);
 	return ld_moved;
 
 out_balanced:
-- 
cgit v1.2.3


From 039a1c41b3a489e34593ea1e1687f6fdad6b13ab Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:25 +0200
Subject: sched: fix sched_balance_self() smp group balancing

Finding the least idle cpu is more accurate when done with updated shares.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cdd09462fc98..39d5495540d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2128,6 +2128,9 @@ static int sched_balance_self(int cpu, int flag)
 			sd = tmp;
 	}
 
+	if (sd)
+		update_shares(sd);
+
 	while (sd) {
 		cpumask_t span, tmpmask;
 		struct sched_group *group;
-- 
cgit v1.2.3


From a8a51d5e59561aa5b4d66e19eca819b537783e8f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:26 +0200
Subject: sched: persistent average load per task

Remove the fall-back to SCHED_LOAD_SCALE by remembering the previous value of
cpu_avg_load_per_task() - this is useful because of the hierarchical group
model in which task weight can be much smaller.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 39d5495540d2..6a6b0139eb32 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -554,6 +554,8 @@ struct rq {
 	int cpu;
 	int online;
 
+	unsigned long avg_load_per_task;
+
 	struct task_struct *migration_thread;
 	struct list_head migration_queue;
 #endif
@@ -1427,9 +1429,18 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
@@ -2010,18 +2021,6 @@ static unsigned long target_load(int cpu, int type)
 	return max(rq->cpu_load[type-1], total);
 }
 
-/*
- * Return the average load per task on the cpu's run queue
- */
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
-	unsigned long n = rq->nr_running;
-
-	return n ? total / n : SCHED_LOAD_SCALE;
-}
-
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
-- 
cgit v1.2.3


From bb3469ac9b50f14ad6eba129ca0ad4fd033097a0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:27 +0200
Subject: sched: hierarchical load vs affine wakeups

With hierarchical grouping we can't just compare task weight to rq weight - we
need to scale the weight appropriately.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7b8d664d6f22..865cb53a7ccf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1073,6 +1073,25 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 
 static const struct sched_class fair_sched_class;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static unsigned long task_h_load(struct task_struct *p)
+{
+	unsigned long h_load = p->se.load.weight;
+	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+
+	update_h_load(task_cpu(p));
+
+	h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load);
+
+	return h_load;
+}
+#else
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return p->se.load.weight;
+}
+#endif
+
 static int
 wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
@@ -1093,9 +1112,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * of the current CPU:
 	 */
 	if (sync)
-		tl -= current->se.load.weight;
+		tl -= task_h_load(current);
 
-	balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+	balanced = 100*(tl + task_h_load(p)) <= imbalance*load;
 
 	/*
 	 * If the currently running task will sleep within
-- 
cgit v1.2.3


From 408ed066b11cf9ee4536573b4269ee3613bd735e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:28 +0200
Subject: sched: hierarchical load vs find_busiest_group

find_busiest_group() has some assumptions about task weight being in the
NICE_0_LOAD range. Hierarchical task groups break this assumption - fix this
by replacing it with the average task weight, which will adapt the situation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6a6b0139eb32..5e2aa394a812 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3050,6 +3050,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	max_load = this_load = total_load = total_pwr = 0;
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
+
 	if (idle == CPU_NOT_IDLE)
 		load_idx = sd->busy_idx;
 	else if (idle == CPU_NEWLY_IDLE)
@@ -3064,6 +3065,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
 		unsigned long sum_nr_running, sum_weighted_load;
+		unsigned long sum_avg_load_per_task;
+		unsigned long avg_load_per_task;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
 
@@ -3072,6 +3075,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
+		sum_avg_load_per_task = avg_load_per_task = 0;
+
 		max_cpu_load = 0;
 		min_cpu_load = ~0UL;
 
@@ -3105,6 +3110,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			avg_load += load;
 			sum_nr_running += rq->nr_running;
 			sum_weighted_load += weighted_cpuload(i);
+
+			sum_avg_load_per_task += cpu_avg_load_per_task(i);
 		}
 
 		/*
@@ -3126,7 +3133,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = sg_div_cpu_power(group,
 				avg_load * SCHED_LOAD_SCALE);
 
-		if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+
+		/*
+		 * Consider the group unbalanced when the imbalance is larger
+		 * than the average weight of two tasks.
+		 *
+		 * APZ: with cgroup the avg task weight can vary wildly and
+		 *      might not be a suitable number - should we keep a
+		 *      normalized nr_running number somewhere that negates
+		 *      the hierarchy?
+		 */
+		avg_load_per_task = sg_div_cpu_power(group,
+				sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
 			__group_imb = 1;
 
 		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
@@ -3267,9 +3287,9 @@ small_imbalance:
 			if (busiest_load_per_task > this_load_per_task)
 				imbn = 1;
 		} else
-			this_load_per_task = SCHED_LOAD_SCALE;
+			this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-		if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
+		if (max_load - this_load + 2*busiest_load_per_task >=
 					busiest_load_per_task * imbn) {
 			*imbalance = busiest_load_per_task;
 			return busiest;
-- 
cgit v1.2.3


From 42a3ac7d5cee89849448b41b86faeb86f98e92f6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:29 +0200
Subject: sched: fix load scaling in group balancing

doing the load balance will change cfs_rq->load.weight (that's the whole point)
but since that's part of the scale factor, we'll scale back with a different
amount.

Weight getting smaller would result in an inflated moved_load which causes
it to stop balancing too soon.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 865cb53a7ccf..734e4c556fcb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1444,6 +1444,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	list_for_each_entry(tg, &task_groups, list) {
 		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
+		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
 		long rem_load, moved_load;
 
 		/*
@@ -1452,8 +1454,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		rem_load = rem_load_move * busiest_cfs_rq->load.weight;
-		rem_load /= busiest_cfs_rq->h_load + 1;
+		rem_load = rem_load_move * busiest_weight;
+		rem_load /= busiest_h_load + 1;
 
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1462,8 +1464,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		moved_load *= busiest_cfs_rq->h_load;
-		moved_load /= busiest_cfs_rq->load.weight + 1;
+		moved_load *= busiest_h_load;
+		moved_load /= busiest_weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3


From 4be9daaa1b33701f011f4117f22dc1e45a3e6e34 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:30 +0200
Subject: sched: fix task_h_load()

Currently task_h_load() computes the load of a task and uses that to either
subtract it from the total, or add to it.

However, removing or adding a task need not have any effect on the total load
at all. Imagine adding a task to a group that is local to one cpu - in that
case the total load of that cpu is unaffected.

So properly compute addition/removal:

 s_i = S * rw_i / \Sum_j rw_j
 s'_i = S * (rw_i + wl) / (\Sum_j rw_j + wg)

then s'_i - s_i gives the change in load.

Where s_i is the shares for cpu i, S the group weight, rw_i the runqueue weight
for that cpu, wl the weight we add (subtract) and wg the weight contribution to
the runqueue.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 49 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 734e4c556fcb..a1694441f8b7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,22 +1074,53 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 {
-	unsigned long h_load = p->se.load.weight;
-	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+	struct sched_entity *se = tg->se[cpu];
+	long wg = wl;
 
-	update_h_load(task_cpu(p));
+	for_each_sched_entity(se) {
+#define D(n) (likely(n) ? (n) : 1)
+
+		long S, Srw, rw, s, sn;
+
+		S = se->my_q->tg->shares;
+		s = se->my_q->shares;
+		rw = se->my_q->load.weight;
 
-	h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load);
+		Srw = S * rw / D(s);
+		sn = S * (rw + wl) / D(Srw + wg);
+
+		wl = sn - s;
+		wg = 0;
+#undef D
+	}
 
-	return h_load;
+	return wl;
 }
+
+static unsigned long task_load_sub(struct task_struct *p)
+{
+	return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p));
+}
+
+static unsigned long task_load_add(struct task_struct *p, int cpu)
+{
+	return effective_load(task_group(p), p->se.load.weight, cpu);
+}
+
 #else
-static unsigned long task_h_load(struct task_struct *p)
+
+static unsigned long task_load_sub(struct task_struct *p)
+{
+	return -p->se.load.weight;
+}
+
+static unsigned long task_load_add(struct task_struct *p, int cpu)
 {
 	return p->se.load.weight;
 }
+
 #endif
 
 static int
@@ -1112,9 +1143,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * of the current CPU:
 	 */
 	if (sync)
-		tl -= task_h_load(current);
+		tl += task_load_sub(current);
 
-	balanced = 100*(tl + task_h_load(p)) <= imbalance*load;
+	balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load;
 
 	/*
 	 * If the currently running task will sleep within
-- 
cgit v1.2.3


From 051c67640e771fd6ad1b676fc0c16c379b3c6f80 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:31 +0200
Subject: sched: remove prio preference from balance decisions

Priority looses much of its meaning in a hierarchical context. So don't
use it in balance decisions.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5e2aa394a812..10d43f5bf0fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2896,7 +2896,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	      enum cpu_idle_type idle, int *all_pinned,
 	      int *this_best_prio, struct rq_iterator *iterator)
 {
-	int loops = 0, pulled = 0, pinned = 0, skip_for_load;
+	int loops = 0, pulled = 0, pinned = 0;
 	struct task_struct *p;
 	long rem_load_move = max_load_move;
 
@@ -2912,14 +2912,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 next:
 	if (!p || loops++ > sysctl_sched_nr_migrate)
 		goto out;
-	/*
-	 * To help distribute high priority tasks across CPUs we don't
-	 * skip a task if it will be the highest priority task (i.e. smallest
-	 * prio value) on its new queue regardless of its load weight
-	 */
-	skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
-							 SCHED_LOAD_SCALE_FUZZ;
-	if ((skip_for_load && p->prio >= *this_best_prio) ||
+
+	if ((p->se.load.weight >> 1) > rem_load_move ||
 	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
 		p = iterator->next(iterator->arg);
 		goto next;
-- 
cgit v1.2.3


From cb5ef42a03a13f95a9ea94e6cda4f7a47497871f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:32 +0200
Subject: sched: optimize effective_load()

s_i = S * rw_i / \Sum_j rw_j

 -> \Sum_j rw_j = S * rw_i / s_i

 -> s'_i = S * (rw_i + w) / (\Sum_j rw_j + w)

delta s = s' - s = S * (rw + w) / ((S * rw / s) + w)
        = s * (S * (rw + w) / (S * rw + s * w) - 1)

 a = S*(rw+w), b = S*rw + s*w

delta s = s * (a-b) / b

IOW, trade one divide for two multiplies

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a1694441f8b7..0d197be3e3e9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1082,16 +1082,16 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 	for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
 
-		long S, Srw, rw, s, sn;
+		long S, rw, s, a, b;
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
 		rw = se->my_q->load.weight;
 
-		Srw = S * rw / D(s);
-		sn = S * (rw + wl) / D(Srw + wg);
+		a = S*(rw + wl);
+		b = S*rw + s*wg;
 
-		wl = sn - s;
+		wl = s*(a-b)/D(b);
 		wg = 0;
 #undef D
 	}
-- 
cgit v1.2.3


From 93b75217df39e6d75889cc6f8050343286aff4a5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:33 +0200
Subject: sched: disable source/target_load bias

The bias given by source/target_load functions can be very large, disable
it by default to get faster convergence.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 4 ++--
 kernel/sched_features.h | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 10d43f5bf0fc..6c5eb3bc37e0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2000,7 +2000,7 @@ static unsigned long source_load(int cpu, int type)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 
-	if (type == 0)
+	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
 
 	return min(rq->cpu_load[type-1], total);
@@ -2015,7 +2015,7 @@ static unsigned long target_load(int cpu, int type)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 
-	if (type == 0)
+	if (type == 0 || !sched_feat(LB_BIAS))
 		return total;
 
 	return max(rq->cpu_load[type-1], total);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 04123af2e678..d56e3053e746 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,3 +8,4 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
+SCHED_FEAT(LB_BIAS, 0)
\ No newline at end of file
-- 
cgit v1.2.3


From cd80917e4ff465ea77106f8e4fb631eedc4cf426 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:34 +0200
Subject: sched: fix shares boost logic

In case the domain is empty, pretend there is a single task on each cpu, so
that together with the boost logic we end up giving 1/n shares to each
cpu.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6c5eb3bc37e0..1cff969f6646 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1549,6 +1549,9 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
 		shares = tg->shares;
 
+	if (!rq_weight)
+		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
+
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
-- 
cgit v1.2.3


From 2398f2c6d34b43025f274fc42eaca34d23ec2320 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:35 +0200
Subject: sched: update shares on wakeup

We found that the affine wakeup code needs rather accurate load figures
to be effective. The trouble is that updating the load figures is fairly
expensive with group scheduling. Therefore ratelimit the updating.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 30 +++++++++++++++++++++++++++++-
 kernel/sched_features.h |  3 ++-
 kernel/sysctl.c         |  8 ++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1cff969f6646..62db0891025a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -777,6 +777,12 @@ late_initcall(sched_init_debug);
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
+/*
+ * ratelimit for updating the group shares.
+ * default: 0.5ms
+ */
+const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
@@ -1590,7 +1596,13 @@ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 
 static void update_shares(struct sched_domain *sd)
 {
-	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+	u64 now = cpu_clock(raw_smp_processor_id());
+	s64 elapsed = now - sd->last_update;
+
+	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+		sd->last_update = now;
+		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+	}
 }
 
 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
@@ -2199,6 +2211,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
+#ifdef CONFIG_SMP
+	if (sched_feat(LB_WAKEUP_UPDATE)) {
+		struct sched_domain *sd;
+
+		this_cpu = raw_smp_processor_id();
+		cpu = task_cpu(p);
+
+		for_each_domain(this_cpu, sd) {
+			if (cpu_isset(cpu, sd->span)) {
+				update_shares(sd);
+				break;
+			}
+		}
+	}
+#endif
+
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d56e3053e746..7d616d2a2a3f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,4 +8,5 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
-SCHED_FEAT(LB_BIAS, 0)
\ No newline at end of file
+SCHED_FEAT(LB_BIAS, 0)
+SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..fe8cdc80ff02 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_shares_ratelimit",
+		.data		= &sysctl_sched_shares_ratelimit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_child_runs_first",
-- 
cgit v1.2.3


From 243e0e7b7d3b54749ece2e879ecd7e2a11874443 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Date: Fri, 27 Jun 2008 13:41:36 +0200
Subject: sched: fix mult overflow

It was observed these mults can overflow.

Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0d197be3e3e9..26ebe180cdea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1477,7 +1477,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
-		long rem_load, moved_load;
+		u64 rem_load, moved_load;
 
 		/*
 		 * empty group
@@ -1485,8 +1485,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		rem_load = rem_load_move * busiest_weight;
-		rem_load /= busiest_h_load + 1;
+		rem_load = (u64)rem_load_move * busiest_weight;
+		rem_load = div_u64(rem_load, busiest_h_load + 1);
 
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1496,7 +1496,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 			continue;
 
 		moved_load *= busiest_h_load;
-		moved_load /= busiest_weight + 1;
+		moved_load = div_u64(moved_load, busiest_weight + 1);
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)
-- 
cgit v1.2.3


From 83378269a5fad98f562ebc0f09c349575e6cbfe1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:37 +0200
Subject: sched: correct wakeup weight calculations

rw_i = {2, 4, 1, 0}
s_i = {2/7, 4/7, 1/7, 0}

wakeup on cpu0, weight=1

rw'_i = {3, 4, 1, 0}
s'_i = {3/8, 4/8, 1/8, 0}

s_0 = S * rw_0 / \Sum rw_j ->
  \Sum rw_j = S*rw_0/s_0 = 1*2*7/2 = 7 (correct)

s'_0 = S * (rw_0 + 1) / (\Sum rw_j + 1) =
       1 * (2+1) / (7+1) = 3/8 (correct

so we find that adding 1 to cpu0 gains 5/56 in weight
if say the other cpu were, cpu1, we'd also have to calculate its 4/56 loss

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  4 ++++
 kernel/sched_fair.c | 48 ++++++++++++++++++++++++++----------------------
 2 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 62db0891025a..01d3e51b7116 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -365,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 #else
 
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	return NULL;
+}
 
 #endif	/* CONFIG_GROUP_SCHED */
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 26ebe180cdea..bed2f71e63d9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,10 +1074,10 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
+static unsigned long effective_load(struct task_group *tg, int cpu,
+		unsigned long wl, unsigned long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	long wg = wl;
 
 	for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
@@ -1092,6 +1092,13 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 		b = S*rw + s*wg;
 
 		wl = s*(a-b)/D(b);
+		/*
+		 * Assume the group is already running and will
+		 * thus already be accounted for in the weight.
+		 *
+		 * That is, moving shares between CPUs, does not
+		 * alter the group weight.
+		 */
 		wg = 0;
 #undef D
 	}
@@ -1099,26 +1106,12 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
 	return wl;
 }
 
-static unsigned long task_load_sub(struct task_struct *p)
-{
-	return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p));
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
-	return effective_load(task_group(p), p->se.load.weight, cpu);
-}
-
 #else
 
-static unsigned long task_load_sub(struct task_struct *p)
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+		unsigned long wl, unsigned long wg)
 {
-	return -p->se.load.weight;
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
-	return p->se.load.weight;
+	return wl;
 }
 
 #endif
@@ -1130,8 +1123,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	    unsigned int imbalance)
 {
 	struct task_struct *curr = this_rq->curr;
+	struct task_group *tg;
 	unsigned long tl = this_load;
 	unsigned long tl_per_task;
+	unsigned long weight;
 	int balanced;
 
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
@@ -1142,10 +1137,19 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * effect of the currently running task from the load
 	 * of the current CPU:
 	 */
-	if (sync)
-		tl += task_load_sub(current);
+	if (sync) {
+		tg = task_group(current);
+		weight = current->se.load.weight;
+
+		tl += effective_load(tg, this_cpu, -weight, -weight);
+		load += effective_load(tg, prev_cpu, 0, -weight);
+	}
+
+	tg = task_group(p);
+	weight = p->se.load.weight;
 
-	balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load;
+	balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
 	 * If the currently running task will sleep within
-- 
cgit v1.2.3


From f1d239f73200a5803a89e5929fb3abc1596b7589 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:38 +0200
Subject: sched: incremental effective_load()

Increase the accuracy of the effective_load values.

Not only consider the current increment (as per the attempted wakeup), but
also consider the delta between when we last adjusted the shares and the
current situation.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  6 ++++++
 kernel/sched_fair.c | 18 +++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 01d3e51b7116..7613f69f0978 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -427,6 +427,11 @@ struct cfs_rq {
 	 * this cpu's part of tg->shares
 	 */
 	unsigned long shares;
+
+	/*
+	 * load.weight at the time we set shares
+	 */
+	unsigned long rq_weight;
 #endif
 #endif
 };
@@ -1527,6 +1532,7 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 	 * record the actual number of shares, not the boosted amount.
 	 */
 	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bed2f71e63d9..e87f1a52f625 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,10 +1074,22 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, int cpu,
-		unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu,
+		long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
+	long more_w;
+
+	if (!tg->parent)
+		return wl;
+
+	/*
+	 * Instead of using this increment, also add the difference
+	 * between when the shares were last updated and now.
+	 */
+	more_w = se->my_q->load.weight - se->my_q->rq_weight;
+	wl += more_w;
+	wg += more_w;
 
 	for_each_sched_entity(se) {
 #define D(n) (likely(n) ? (n) : 1)
@@ -1086,7 +1098,7 @@ static unsigned long effective_load(struct task_group *tg, int cpu,
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
-		rw = se->my_q->load.weight;
+		rw = se->my_q->rq_weight;
 
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
-- 
cgit v1.2.3


From f5bfb7d9ff73d72ee4f2f4830a6f0c9088d00f92 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 27 Jun 2008 13:41:39 +0200
Subject: sched: bias effective_load() error towards failing wake_affine().

Measurement shows that the difference between cgroup:/ and cgroup:/foo
wake_affine() results is that the latter succeeds significantly more.

Therefore bias the calculations towards failing the test.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 28 ++++++++++++++++++++++++++++
 kernel/sched_features.h |  1 +
 2 files changed, 29 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e87f1a52f625..9bcc0030a58b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1074,6 +1074,27 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 static const struct sched_class fair_sched_class;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate this by not only taking the current delta into account, but
+ * also considering the delta between when the shares were last adjusted and
+ * now.
+ *
+ * We still saw a performance dip, some tracing learned us that between
+ * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
+ * significantly. Therefore try to bias the error in direction of failing
+ * the affine wakeup.
+ *
+ */
 static long effective_load(struct task_group *tg, int cpu,
 		long wl, long wg)
 {
@@ -1083,6 +1104,13 @@ static long effective_load(struct task_group *tg, int cpu,
 	if (!tg->parent)
 		return wl;
 
+	/*
+	 * By not taking the decrease of shares on the other cpu into
+	 * account our error leans towards reducing the affine wakeups.
+	 */
+	if (!wl && sched_feat(ASYM_EFF_LOAD))
+		return wl;
+
 	/*
 	 * Instead of using this increment, also add the difference
 	 * between when the shares were last updated and now.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 7d616d2a2a3f..862b06bd560a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -10,3 +10,4 @@ SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 0)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
+SCHED_FEAT(ASYM_EFF_LOAD, 1)
-- 
cgit v1.2.3


From 55e12e5e7b1d7e7c05a4be10cb5fd092c039aa78 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Tue, 24 Jun 2008 23:39:43 +0530
Subject: sched: make sched_{rt,fair}.c ifdefs more readable

Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c |  6 +++---
 kernel/sched_rt.c   | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9bcc0030a58b..2e43d4a748c3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -921,7 +921,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		hrtick_start(rq, delta, requeue);
 	}
 }
-#else
+#else /* !CONFIG_SCHED_HRTICK */
 static inline void
 hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
@@ -1062,7 +1062,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 	}
 	return cpu;
 }
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
 static inline int wake_idle(int cpu, struct task_struct *p)
 {
 	return cpu;
@@ -1586,7 +1586,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * scheduler tick hitting a task of our scheduling class:
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 765932d0399d..47ceac9e8552 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -161,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 	return &rt_rq->tg->rt_bandwidth;
 }
 
-#else
+#else /* !CONFIG_RT_GROUP_SCHED */
 
 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
@@ -226,7 +226,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 	return &def_rt_bandwidth;
 }
 
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
 static int do_balance_runtime(struct rt_rq *rt_rq)
@@ -374,12 +374,12 @@ static int balance_runtime(struct rt_rq *rt_rq)
 
 	return more;
 }
-#else
+#else /* !CONFIG_SMP */
 static inline int balance_runtime(struct rt_rq *rt_rq)
 {
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
@@ -1472,4 +1472,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
 		print_rt_rq(m, cpu, rt_rq);
 	rcu_read_unlock();
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
-- 
cgit v1.2.3


From 4c9fe8ad813b257a2b9ddf0f752105a75a7dae63 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 27 Jun 2008 14:49:35 +0200
Subject: sched: export cpu_clock

the rcutorture module relies on cpu_clock.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 3c696db59452..ed5a8c415046 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -256,3 +256,4 @@ unsigned long long cpu_clock(int cpu)
 
 	return clock;
 }
+EXPORT_SYMBOL_GPL(cpu_clock);
-- 
cgit v1.2.3


From ad118c54a3587b2c69a769d0ba37d4d8dce4559d Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 27 Jun 2008 18:04:48 +0200
Subject: stacktrace: add saved stack traces to backtrace self-test

This patch adds saved stack-traces to the backtrace suite of self-tests.

Note that we don't depend on or unconditionally enable CONFIG_STACKTRACE
because not all architectures may have it (and we still want to enable the
other tests for those architectures).

Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/backtracetest.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index d1a7605c5b8f..50f7abd0813d 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -10,9 +10,10 @@
  * of the License.
  */
 
+#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/stacktrace.h>
 
 static struct timer_list backtrace_timer;
 
@@ -22,6 +23,31 @@ static void backtrace_test_timer(unsigned long data)
 	printk("The following trace is a kernel self test and not a bug!\n");
 	dump_stack();
 }
+
+#ifdef CONFIG_STACKTRACE
+static void backtrace_test_saved(void)
+{
+	struct stack_trace trace;
+	unsigned long entries[8];
+
+	printk("Testing a saved backtrace.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+
+	trace.nr_entries = 0;
+	trace.max_entries = ARRAY_SIZE(entries);
+	trace.entries = entries;
+	trace.skip = 0;
+
+	save_stack_trace(&trace);
+	print_stack_trace(&trace, 0);
+}
+#else
+static void backtrace_test_saved(void)
+{
+	printk("Saved backtrace test skipped.\n");
+}
+#endif
+
 static int backtrace_regression_test(void)
 {
 	printk("====[ backtrace testing ]===========\n");
@@ -29,6 +55,8 @@ static int backtrace_regression_test(void)
 	printk("The following trace is a kernel self test and not a bug!\n");
 	dump_stack();
 
+	backtrace_test_saved();
+
 	init_timer(&backtrace_timer);
 	backtrace_timer.function = backtrace_test_timer;
 	mod_timer(&backtrace_timer, jiffies + 10);
-- 
cgit v1.2.3


From 4e6a0535dd036377961027262aecb138099f925d Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 27 Jun 2008 18:06:54 +0200
Subject: backtrace: replace timer with tasklet + completions

On qemu, the backtrace would show up _after_ the "end of backtrace
testing" message.

This patch changes it to use completions instead, which will guarantee
that no such race exists.

Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/backtracetest.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 50f7abd0813d..a5e026bc45c4 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -10,18 +10,39 @@
  * of the License.
  */
 
+#include <linux/completion.h>
 #include <linux/delay.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/stacktrace.h>
 
-static struct timer_list backtrace_timer;
+static void backtrace_test_normal(void)
+{
+	printk("Testing a backtrace from process context.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+
+	dump_stack();
+}
+
+static DECLARE_COMPLETION(backtrace_work);
+
+static void backtrace_test_irq_callback(unsigned long data)
+{
+	dump_stack();
+	complete(&backtrace_work);
+}
+
+static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
 
-static void backtrace_test_timer(unsigned long data)
+static void backtrace_test_irq(void)
 {
 	printk("Testing a backtrace from irq context.\n");
 	printk("The following trace is a kernel self test and not a bug!\n");
-	dump_stack();
+
+	init_completion(&backtrace_work);
+	tasklet_schedule(&backtrace_tasklet);
+	wait_for_completion(&backtrace_work);
 }
 
 #ifdef CONFIG_STACKTRACE
@@ -51,17 +72,11 @@ static void backtrace_test_saved(void)
 static int backtrace_regression_test(void)
 {
 	printk("====[ backtrace testing ]===========\n");
-	printk("Testing a backtrace from process context.\n");
-	printk("The following trace is a kernel self test and not a bug!\n");
-	dump_stack();
 
+	backtrace_test_normal();
+	backtrace_test_irq();
 	backtrace_test_saved();
 
-	init_timer(&backtrace_timer);
-	backtrace_timer.function = backtrace_test_timer;
-	mod_timer(&backtrace_timer, jiffies + 10);
-
-	msleep(10);
 	printk("====[ end of backtrace testing ]====\n");
 	return 0;
 }
-- 
cgit v1.2.3


From 79c537998d143b127c8c662a403c3356cb885f1c Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Sun, 29 Jun 2008 00:16:56 +0200
Subject: sched: fix cpu hotplug

the CPU hotplug problems (crashes under high-volume unplug+replug
tests) seem to be related to migrate_dead_tasks().

Firstly I added traces to see all tasks being migrated with
migrate_live_tasks() and migrate_dead_tasks(). On my setup the problem
pops up (the one with "se == NULL" in the loop of
pick_next_task_fair()) shortly after the traces indicate that some has
been migrated with migrate_dead_tasks()). btw., I can reproduce it
much faster now with just a plain cpu down/up loop.

[disclaimer] Well, unless I'm really missing something important in
this late hour [/desclaimer] pick_next_task() is not something
appropriate for migrate_dead_tasks() :-)

the following change seems to eliminate the problem on my setup
(although, I kept it running only for a few minutes to get a few
messages indicating migrate_dead_tasks() does move tasks and the
system is still ok)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3aaa5c8cb421..a66e85639de2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5887,6 +5887,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 		next = pick_next_task(rq, rq->curr);
 		if (!next)
 			break;
+		next->sched_class->put_prev_task(rq, next);
 		migrate_dead(dead_cpu, next);
 
 	}
-- 
cgit v1.2.3


From 2d452c9b10caeec455eb5e56a0ef4ed485178213 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 29 Jun 2008 15:01:59 +0200
Subject: sched: sched_clock_cpu() based cpu_clock(), lockdep fix

Vegard Nossum reported:

> WARNING: at kernel/lockdep.c:2738 check_flags+0x142/0x160()

which happens due to:

 unsigned long long cpu_clock(int cpu)
 {
         unsigned long long clock;
         unsigned long flags;

         raw_local_irq_save(flags);

as lower level functions can take locks, we must not do that, use
proper lockdep-annotated irq save/restore.

Reported-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ed5a8c415046..60094e257a9a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -250,9 +250,9 @@ unsigned long long cpu_clock(int cpu)
 	unsigned long long clock;
 	unsigned long flags;
 
-	raw_local_irq_save(flags);
+	local_irq_save(flags);
 	clock = sched_clock_cpu(cpu);
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 
 	return clock;
 }
-- 
cgit v1.2.3


From 34e83e850f5e5ee2a18cd77a5d70d31972a632e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 27 Jun 2008 15:42:36 +0200
Subject: sched: build fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:

kernel/sched.c: In function ‘sched_group_set_shares':
kernel/sched.c:8635: error: implicit declaration of function ‘cfs_rq_set_shares'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7613f69f0978..058250a63b64 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1627,11 +1627,6 @@ static void update_h_load(int cpu)
 	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-	cfs_rq->shares = shares;
-}
-
 #else
 
 static inline void update_shares(struct sched_domain *sd)
@@ -1646,6 +1641,13 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+	cfs_rq->shares = shares;
+#endif
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
-- 
cgit v1.2.3


From 30432094a7f506ad24997a3ba6aed913ab61c01d Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 27 Jun 2008 21:35:50 +0200
Subject: sched: fix warning

This patch fixes the following warning:

kernel/sched.c:1667: warning: 'cfs_rq_set_shares' defined but not used

This seems the correct way to fix this; cfs_rq_set_shares() is only used
in a single place, which is also inside #ifdef CONFIG_FAIR_GROUP_SCHED.

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 058250a63b64..677c80b9a6b5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1641,12 +1641,14 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 {
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 	cfs_rq->shares = shares;
 #endif
 }
+#endif
 
 #include "sched_stats.h"
 #include "sched_idletask.c"
-- 
cgit v1.2.3


From 8594698ebddeef5443b7da8258ae33b3eaca61d5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 27 Jun 2008 21:20:17 +0200
Subject: stacktrace: fix modular build, export print_stack_trace and
 save_stack_trace

fix:

ERROR: "print_stack_trace" [kernel/backtracetest.ko] undefined!
ERROR: "save_stack_trace" [kernel/backtracetest.ko] undefined!

Signed-off-by: Ingo Molnar <mingo@elte.hu>

and fix:

  Building modules, stage 2.
  MODPOST 376 modules
ERROR: "print_stack_trace" [kernel/backtracetest.ko] undefined!
make[1]: *** [__modpost] Error 1

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/stacktrace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 7eaea9d02a52..94b527ef1d1e 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  */
 #include <linux/sched.h>
+#include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
 
@@ -21,4 +22,5 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
 		print_ip_sym(trace->entries[i]);
 	}
 }
+EXPORT_SYMBOL_GPL(print_stack_trace);
 
-- 
cgit v1.2.3


From ee7e5516be4f2107535ad5a3d47d9c79f93661a2 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Sun, 29 Jun 2008 14:18:46 +0400
Subject: generic: per-device coherent dma allocator

Currently x86_32, sh and cris-v32 provide per-device coherent dma
memory allocator.

However their implementation is nearly identical. Refactor out
common code to be reused by them.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile       |   1 +
 kernel/dma-coherent.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 kernel/dma-coherent.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..9e287d8ab766 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
new file mode 100644
index 000000000000..89a554cfd936
--- /dev/null
+++ b/kernel/dma-coherent.c
@@ -0,0 +1,127 @@
+/*
+ * Coherent per-device memory handling.
+ * Borrowed from i386
+ */
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+
+struct dma_coherent_mem {
+	void		*virt_base;
+	u32		device_base;
+	int		size;
+	int		flags;
+	unsigned long	*bitmap;
+};
+
+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+				dma_addr_t device_addr, size_t size, int flags)
+{
+	void __iomem *mem_base = NULL;
+	int pages = size >> PAGE_SHIFT;
+	int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+
+	if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
+		goto out;
+	if (!size)
+		goto out;
+	if (dev->dma_mem)
+		goto out;
+
+	/* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
+
+	mem_base = ioremap(bus_addr, size);
+	if (!mem_base)
+		goto out;
+
+	dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+	if (!dev->dma_mem)
+		goto out;
+	dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+	if (!dev->dma_mem->bitmap)
+		goto free1_out;
+
+	dev->dma_mem->virt_base = mem_base;
+	dev->dma_mem->device_base = device_addr;
+	dev->dma_mem->size = pages;
+	dev->dma_mem->flags = flags;
+
+	if (flags & DMA_MEMORY_MAP)
+		return DMA_MEMORY_MAP;
+
+	return DMA_MEMORY_IO;
+
+ free1_out:
+	kfree(dev->dma_mem);
+ out:
+	if (mem_base)
+		iounmap(mem_base);
+	return 0;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
+
+void dma_release_declared_memory(struct device *dev)
+{
+	struct dma_coherent_mem *mem = dev->dma_mem;
+
+	if (!mem)
+		return;
+	dev->dma_mem = NULL;
+	iounmap(mem->virt_base);
+	kfree(mem->bitmap);
+	kfree(mem);
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+
+void *dma_mark_declared_memory_occupied(struct device *dev,
+					dma_addr_t device_addr, size_t size)
+{
+	struct dma_coherent_mem *mem = dev->dma_mem;
+	int pos, err;
+	int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
+
+	pages >>= PAGE_SHIFT;
+
+	if (!mem)
+		return ERR_PTR(-EINVAL);
+
+	pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
+	err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
+	if (err != 0)
+		return ERR_PTR(err);
+	return mem->virt_base + (pos << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+int dma_alloc_from_coherent(struct device *dev, ssize_t size,
+				       dma_addr_t *dma_handle, void **ret)
+{
+	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+	int order = get_order(size);
+
+	if (mem) {
+		int page = bitmap_find_free_region(mem->bitmap, mem->size,
+						     order);
+		if (page >= 0) {
+			*dma_handle = mem->device_base + (page << PAGE_SHIFT);
+			*ret = mem->virt_base + (page << PAGE_SHIFT);
+			memset(*ret, 0, size);
+		}
+		if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+			*ret = NULL;
+	}
+	return (mem != NULL);
+}
+
+int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
+{
+	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+
+	if (mem && vaddr >= mem->virt_base && vaddr <
+		   (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+		int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
+		bitmap_release_region(mem->bitmap, page, order);
+		return 1;
+	}
+	return 0;
+}
-- 
cgit v1.2.3


From dd7a1e5615b1719c0fdffee1ea5a7820ac8141a6 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@saeurebad.de>
Date: Fri, 27 Jun 2008 15:07:21 +0200
Subject: softlockup: fix watchdog task wakeup frequency

Updating the timestamp more often is pointless as we print the warnings
only if we exceed the threshold.  And the check for hung tasks relies on
the last timestamp, so it will keep working correctly, too.

Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 97977ecc3171..d53ab702a7db 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -130,8 +130,11 @@ void softlockup_tick(void)
 
 	now = get_timestamp(this_cpu);
 
-	/* Wake up the high-prio watchdog task every second: */
-	if (now > (touch_timestamp + 1))
+	/*
+	 * Wake up the high-prio watchdog task twice per
+	 * threshold timespan.
+	 */
+	if (now > touch_timestamp + softlockup_thresh/2)
 		wake_up_process(per_cpu(watchdog_task, this_cpu));
 
 	/* Warn about unreasonable delays: */
-- 
cgit v1.2.3


From 619b0488038224391e64fa03854651ca0f5efe56 Mon Sep 17 00:00:00 2001
From: Raistlin <raistlin@linux.it>
Date: Thu, 26 Jun 2008 18:54:09 +0200
Subject: sched: fix divide error when trying to configure rt_period to zero

Here it is another little Oops we found while configuring invalid values
via cgroups:

echo 0 > /dev/cgroups/0/cpu.rt_period_us
or
echo 4294967296 > /dev/cgroups/0/cpu.rt_period_us

[  205.509825] divide error: 0000 [#1]
[  205.510151] Modules linked in:
[  205.510151]
[  205.510151] Pid: 2339, comm: bash Not tainted (2.6.26-rc8 #33)
[  205.510151] EIP: 0060:[<c030c6ef>] EFLAGS: 00000293 CPU: 0
[  205.510151] EIP is at div64_u64+0x5f/0x70
[  205.510151] EAX: 0000389f EBX: 00000000 ECX: 00000000 EDX: 00000000
[  205.510151] ESI: d9800000 EDI: 00000000 EBP: c6cede60 ESP: c6cede50
[  205.510151]  DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
[  205.510151] Process bash (pid: 2339, ti=c6cec000 task=c79be370 task.ti=c6cec000)
[  205.510151] Stack: d9800000 0000389f c05971a0 d9800000 c6cedeb4 c0214dbd 00000000 00000000
[  205.510151]        c6cede88 c0242bd8 c05377c0 c7a41b40 00000000 00000000 00000000 c05971a0
[  205.510151]        c780ed20 c7508494 c7a41b40 00000000 00000002 c6cedebc c05971a0 ffffffea
[  205.510151] Call Trace:
[  205.510151]  [<c0214dbd>] ? __rt_schedulable+0x1cd/0x240
[  205.510151]  [<c0242bd8>] ? cgroup_file_open+0x18/0xe0
[  205.510151]  [<c0214fe4>] ? tg_set_bandwidth+0xa4/0xf0
[  205.510151]  [<c0215066>] ? sched_group_set_rt_period+0x36/0x50
[  205.510151]  [<c021508e>] ? cpu_rt_period_write_uint+0xe/0x10
[  205.510151]  [<c0242dc5>] ? cgroup_file_write+0x125/0x160
[  205.510151]  [<c0232c15>] ? hrtimer_interrupt+0x155/0x190
[  205.510151]  [<c02f047f>] ? security_file_permission+0xf/0x20
[  205.510151]  [<c0277ad8>] ? rw_verify_area+0x48/0xc0
[  205.510151]  [<c0283744>] ? dupfd+0x104/0x130
[  205.510151]  [<c027838c>] ? vfs_write+0x9c/0x160
[  205.510151]  [<c0242ca0>] ? cgroup_file_write+0x0/0x160
[  205.510151]  [<c027850d>] ? sys_write+0x3d/0x70
[  205.510151]  [<c0203019>] ? sysenter_past_esp+0x6a/0x91
[  205.510151]  =======================
[  205.510151] Code: 0f 45 de 31 f6 0f ad d0 d3 ea f6 c1 20 0f 45 c2 0f 45 d6 89 45 f0 89 55 f4 8b 55 f4 31 c9 8b 45 f0 39 d3 89 c6 77 08 89 d0 31 d2 <f7> f3 89 c1 83 c4 08 89 f0 f7 f3 89 ca 5b 5e 5d c3 55 89 e5 56
[  205.510151] EIP: [<c030c6ef>] div64_u64+0x5f/0x70 SS:ESP 0068:c6cede50

The attached patch solves the issue for me.

I'm checking as soon as possible for the period not being zero since, if
it is, going ahead is useless. This way we also save a mutex_lock() and
a read_lock() wrt doing it inside tg_set_bandwidth() or
__rt_schedulable().

Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <trimarchimichael@yahoo.it>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a66e85639de2..94ead43eda62 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8502,6 +8502,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+	if (rt_period == 0)
+		return -EINVAL;
+
 	return tg_set_bandwidth(tg, rt_period, rt_runtime);
 }
 
-- 
cgit v1.2.3


From 3e2f69fdd1b00166e7d589bce56b2d36a9e74374 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@saeurebad.de>
Date: Tue, 1 Jul 2008 09:12:04 +0200
Subject: softlockup: fix watchdog task wakeup frequency

The print_timestamp can never be bigger than the touch_timestamp, at
maximum it can be equal.  And if it is, the second check for
touch_timestamp + 1 bigger print_timestamp is always true, too.

The check for equality is sufficient as we proceed in one-second-steps
and are at least one second away from the last print-out if we have
another timestamp.

Signed-off-by: Johannes Weiner <hannes@saeurebad.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d53ab702a7db..7bd8d1aadd5d 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -116,11 +116,8 @@ void softlockup_tick(void)
 	print_timestamp = per_cpu(print_timestamp, this_cpu);
 
 	/* report at most once a second */
-	if ((print_timestamp >= touch_timestamp &&
-			print_timestamp < (touch_timestamp + 1)) ||
-			did_panic) {
+	if (print_timestamp == touch_timestamp || did_panic)
 		return;
-	}
 
 	/* do not print during early bootup: */
 	if (unlikely(system_state != SYSTEM_RUNNING)) {
-- 
cgit v1.2.3


From 8558f8f81680a43d383abd1b5f23d3501fedfa65 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Fri, 27 Jun 2008 10:17:38 +0530
Subject: rcu: fix hotplug vs rcu race

Dhaval Giani reported this warning during cpu hotplug stress-tests:

| On running kernel compiles in parallel with cpu hotplug:
|
| WARNING: at arch/x86/kernel/smp.c:118
| native_smp_send_reschedule+0x21/0x36()
| Modules linked in:
| Pid: 27483, comm: cc1 Not tainted 2.6.26-rc7 #1
| [...]
|  [<c0110355>] native_smp_send_reschedule+0x21/0x36
|  [<c014fe8f>] force_quiescent_state+0x47/0x57
|  [<c014fef0>] call_rcu+0x51/0x6d
|  [<c01713b3>] __fput+0x130/0x158
|  [<c0171231>] fput+0x17/0x19
|  [<c016fd99>] filp_close+0x4d/0x57
|  [<c016fdff>] sys_close+0x5c/0x97

IMHO the warning is a spurious one.

cpu_online_map is updated by the _cpu_down() using stop_machine_run().
Since force_quiescent_state is invoked from irqs disabled section,
stop_machine_run() won't be executing while a cpu is executing
force_quiescent_state(). Hence the cpu_online_map is stable while we're
in the irq disabled section.

However, a cpu might have been offlined _just_ before we disabled irqs
while entering force_quiescent_state(). And rcu subsystem might not yet
have handled the CPU_DEAD notification, leading to the offlined cpu's
bit being set in the rcp->cpumask.

Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent sending
smp_reschedule() to an offlined CPU.

Here's the timeline:

CPU_A						 CPU_B
--------------------------------------------------------------
cpu_down():					.
.					   	.
.						.
stop_machine(): /* disables preemption,		.
		 * and irqs */			.
.						.
.						.
take_cpu_down();				.
.						.
.						.
.						.
cpu_disable(); /*this removes cpu 		.
		*from cpu_online_map 		.
		*/				.
.						.
.						.
restart_machine(); /* enables irqs */		.
------WINDOW DURING WHICH rcp->cpumask is stale ---------------
.						call_rcu();
.						/* disables irqs here */
.						.force_quiescent_state();
.CPU_DEAD:					.for_each_cpu(rcp->cpumask)
.						.   smp_send_reschedule();
.						.
.						.   WARN_ON() for offlined CPU!
.
.
.
rcu_cpu_notify:
.
-------- WINDOW ENDS ------------------------------------------
rcu_offline_cpu() /* Which calls cpu_quiet()
		   * which removes
		   * cpu from rcp->cpumask.
		   */

If a new batch was started just before calling stop_machine_run(), the
"tobe-offlined" cpu is still present in rcp-cpumask.

During a cpu-offline, from take_cpu_down(), we queue an rt-prio idle
task as the next task to be picked by the scheduler. We also call
cpu_disable() which will disable any further interrupts and remove the
cpu's bit from the cpu_online_map.

Once the stop_machine_run() successfully calls take_cpu_down(), it calls
schedule(). That's the last time a schedule is called on the offlined
cpu, and hence the last time when rdp->passed_quiesc will be set to 1
through rcu_qsctr_inc().

But the cpu_quiet() will be on this cpu will be called only when the
next RCU_SOFTIRQ occurs on this CPU. So at this time, the offlined CPU
is still set in rcp->cpumask.

Now coming back to the idle_task which truely offlines the CPU, it does
check for a pending RCU and raises the softirq, since it will find
rdp->passed_quiesc to be 0 in this case. However, since the cpu is
offline I am not sure if the softirq will trigger on the CPU.

Even if it doesn't the rcu_offline_cpu() will find that rcp->completed
is not the same as rcp->cur, which means that our cpu could be holding
up the grace period progression. Hence we call cpu_quiet() and move
ahead.

But because of the window explained in the timeline, we could still have
a call_rcu() before the RCU subsystem executes it's CPU_DEAD
notification, and we send smp_send_reschedule() to offlined cpu while
trying to force the quiescent states. The appended patch adds comments
and prevents checking for offlined cpu everytime.

cpu_online_map is updated by the _cpu_down() using stop_machine_run().
Since force_quiescent_state is invoked from irqs disabled section,
stop_machine_run() won't be executing while a cpu is executing
force_quiescent_state(). Hence the cpu_online_map is stable while we're
in the irq disabled section.

Reported-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russel <rusty@rustcorp.com.au>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..a38895a5b8e2 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -89,8 +89,22 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		/*
 		 * Don't send IPI to itself. With irqs disabled,
 		 * rdp->cpu is the current cpu.
+		 *
+		 * cpu_online_map is updated by the _cpu_down()
+		 * using stop_machine_run(). Since we're in irqs disabled
+		 * section, stop_machine_run() is not exectuting, hence
+		 * the cpu_online_map is stable.
+		 *
+		 * However,  a cpu might have been offlined _just_ before
+		 * we disabled irqs while entering here.
+		 * And rcu subsystem might not yet have handled the CPU_DEAD
+		 * notification, leading to the offlined cpu's bit
+		 * being set in the rcp->cpumask.
+		 *
+		 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
+		 * sending smp_reschedule() to an offlined CPU.
 		 */
-		cpumask = rcp->cpumask;
+		cpus_and(cpumask, rcp->cpumask, cpu_online_map);
 		cpu_clear(rdp->cpu, cpumask);
 		for_each_cpu_mask(cpu, cpumask)
 			smp_send_reschedule(cpu);
-- 
cgit v1.2.3


From 3e61e0c976532a542b23bbb74c8f631815171078 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Mon, 30 Jun 2008 23:48:37 +0300
Subject: mmiotrace broken in linux-next (8-bit writes only)

The moment mmiotrace is enabled, I hit a NULL deref in:

IP: [<ffffffff80256e71>] __trace_special+0x17c/0x23a
Call Trace:
 [<ffffffff802573cc>] ftrace_special+0x6f/0x9a
 [<ffffffff8023e3e4>] down+0x19/0x4a
 [<ffffffff80228adc>] acquire_console_sem+0x42/0x58
 [<ffffffff8035d273>] con_flush_chars+0x28/0x43
 [<ffffffff80354a70>] write_chan+0x22e/0x334
 [<ffffffff802244e9>] ? default_wake_function+0x0/0xf
 [<ffffffff8035236d>] tty_write+0x195/0x228
 [<ffffffff80354842>] ? write_chan+0x0/0x334
 [<ffffffff8027c23a>] vfs_write+0xae/0x137
 [<ffffffff8027c6e3>] sys_write+0x47/0x70
 [<ffffffff8020b1db>] system_call_after_swapgs+0x7b/0x80

which means 'entry' in __trace_special() is NULL.

[ mingo@elte.hu: that ftrace_special() was a leftover. ]

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: proski@gnu.org
Cc: "Vegard Nossum" <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/semaphore.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 1a064adab658..aaaeae8244e7 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -54,7 +54,6 @@ void down(struct semaphore *sem)
 {
 	unsigned long flags;
 
-	ftrace_special(sem->count, 0, __LINE__);
 	spin_lock_irqsave(&sem->lock, flags);
 	if (likely(sem->count > 0))
 		sem->count--;
-- 
cgit v1.2.3


From db26e64dc3f0d51c4db1a625c248a81f7850eee9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 20 May 2008 19:16:33 +0200
Subject: pm_qos_params: BKL pushdown

[jmc: added <linux/smp_lock.h>]

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 kernel/pm_qos_params.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0afe32be4c85..8cb757026386 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,6 +29,7 @@
 
 #include <linux/pm_qos_params.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/time.h>
@@ -358,15 +359,19 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
 	int ret;
 	long pm_qos_class;
 
+	lock_kernel();
 	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
 	if (pm_qos_class >= 0) {
 		filp->private_data = (void *)pm_qos_class;
 		sprintf(name, "process_%d", current->pid);
 		ret = pm_qos_add_requirement(pm_qos_class, name,
 					PM_QOS_DEFAULT_VALUE);
-		if (ret >= 0)
+		if (ret >= 0) {
+			unlock_kernel();
 			return 0;
+		}
 	}
+	unlock_kernel();
 
 	return -EPERM;
 }
-- 
cgit v1.2.3


From da9cbc87395308a21465bd25441297bbba0477e1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 30 Jun 2008 20:42:08 +0200
Subject: block: blkdev.h cleanup, move iocontext stuff to iocontext.h

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/exit.c | 1 +
 kernel/fork.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 8f6185e69b69..ceb258782835 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,6 +13,7 @@
 #include <linux/personality.h>
 #include <linux/tty.h>
 #include <linux/mnt_namespace.h>
+#include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..b71ccd09fc8d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
 #include <linux/sem.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
-- 
cgit v1.2.3


From 98a05ed4bd7774f533ab185fe0bf2fdc58292d7c Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Thu, 26 Jun 2008 22:51:51 +0530
Subject: ftrace: prevent ftrace modifications while being kprobe'd, v2

add two missing chunks for ftrace+kprobe.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 85e841335417..0f271c45cd02 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -502,8 +502,12 @@ static void ftrace_replace_code(int enable)
 				continue;
 
 			/* ignore updates to this record's mcount site */
-			if (get_kprobe((void *)rec->ip))
+			if (get_kprobe((void *)rec->ip)) {
+				freeze_record(rec);
 				continue;
+			} else {
+				unfreeze_record(rec);
+			}
 
 			failed = __ftrace_replace_code(rec, old, new, enable);
 			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
@@ -740,7 +744,10 @@ static int __ftrace_update_code(void *ignore)
 				ftrace_del_hash(p);
 				INIT_HLIST_NODE(&p->node);
 				hlist_add_head(&p->node, &temp_list);
+				freeze_record(p);
 				continue;
+			} else {
+				unfreeze_record(p);
 			}
 
 			/* convert record (i.e, patch mcount-call with NOP) */
-- 
cgit v1.2.3


From ee3ece830f6db9837f7ac67008f532a8c1e755f4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 3 Jul 2008 14:31:26 -0400
Subject: hrtimer: prevent migration for raising softirq

Due to a possible deadlock, the waking of the softirq was pushed outside
of the hrtimer base locks. See commit 0c96c5979a522c3323c30a078a70120e29b5bdbc

Unfortunately this allows the task to migrate after setting up the softirq
and raising it. Since softirqs run a queue that is per-cpu we may raise the
softirq on the wrong CPU and this will keep the queued softirq task from
running.

To solve this issue, this patch disables preemption around the releasing
of the hrtimer lock and raising of the softirq.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/hrtimer.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..ab80515008f4 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1003,10 +1003,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 	 */
 	raise = timer->state == HRTIMER_STATE_PENDING;
 
+	/*
+	 * We use preempt_disable to prevent this task from migrating after
+	 * setting up the softirq and raising it. Otherwise, if me migrate
+	 * we will raise the softirq on the wrong CPU.
+	 */
+	preempt_disable();
+
 	unlock_hrtimer_base(timer, &flags);
 
 	if (raise)
 		hrtimer_raise_softirq();
+	preempt_enable();
 
 	return ret;
 }
-- 
cgit v1.2.3


From c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 27 Jun 2008 14:29:55 -0600
Subject: sched: terminate newidle balancing once at least one task has moved
 over

Inspired by Peter Zijlstra.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Cc: npiggin@suse.de
Cc: rostedt@goodmis.org
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 677c80b9a6b5..d99aeabeb72f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3013,6 +3013,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 				max_load_move - total_load_moved,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
+
+		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
+			break;
+
 	} while (class && max_load_move > total_load_moved);
 
 	return total_load_moved > 0;
-- 
cgit v1.2.3


From 2087a1ad822cd3a68b73338457047fcc54da726b Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Fri, 27 Jun 2008 14:30:00 -0600
Subject: sched: add avg-overlap support to RT tasks

We have the notion of tracking process-coupling (a.k.a. buddy-wake) via
the p->se.last_wake / p->se.avg_overlap facilities, but it is only used
for cfs to cfs interactions.  There is no reason why an rt to cfs
interaction cannot share in establishing a relationhip in a similar
manner.

Because PREEMPT_RT runs many kernel threads as FIFO priority, we often
times have heavy interaction between RT threads waking CFS applications.
This patch offers a substantial boost (50-60%+) in perfomance under those
circumstances.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Cc: npiggin@suse.de
Cc: rostedt@goodmis.org
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 14 ++++++++++++++
 kernel/sched_fair.c | 21 ++-------------------
 2 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d99aeabeb72f..bbc40c3a0657 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1693,6 +1693,12 @@ static void set_load_weight(struct task_struct *p)
 	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
 
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	sched_info_queued(p);
@@ -1702,6 +1708,12 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
+	if (sleep && p->se.last_wakeup) {
+		update_avg(&p->se.avg_overlap,
+			   p->se.sum_exec_runtime - p->se.last_wakeup);
+		p->se.last_wakeup = 0;
+	}
+
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
 }
@@ -2313,6 +2325,8 @@ out_running:
 		p->sched_class->task_wake_up(rq, p);
 #endif
 out:
+	current->se.last_wakeup = current->se.sum_exec_runtime;
+
 	task_rq_unlock(rq, &flags);
 
 	return success;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2e43d4a748c3..f2aa987027d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -726,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 		__enqueue_entity(cfs_rq, se);
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-	s64 diff = sample - *avg;
-	*avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	if (!se->last_wakeup)
-		return;
-
-	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
-	se->last_wakeup = 0;
-}
-
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -751,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 
 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
-		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -1196,9 +1180,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && balanced && curr->sched_class == &fair_sched_class) {
+	if (sync && balanced) {
 		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-				p->se.avg_overlap < sysctl_sched_migration_cost)
+		    p->se.avg_overlap < sysctl_sched_migration_cost)
 			return 1;
 	}
 
@@ -1359,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 		return;
 	}
 
-	se->last_wakeup = se->sum_exec_runtime;
 	if (unlikely(se == pse))
 		return;
 
-- 
cgit v1.2.3


From 46ac22bab42cc868b9c1d0e915ddbc8e8065a44d Mon Sep 17 00:00:00 2001
From: Ankita Garg <ankita@in.ibm.com>
Date: Tue, 1 Jul 2008 14:30:06 +0530
Subject: sched: fix accounting in task delay accounting & migration

On Thu, Jun 19, 2008 at 12:27:14PM +0200, Peter Zijlstra wrote:
> On Thu, 2008-06-05 at 10:50 +0530, Ankita Garg wrote:
>
> > Thanks Peter for the explanation...
> >
> > I agree with the above and that is the reason why I did not see weird
> > values with cpu_time. But, run_delay still would suffer skews as the end
> > points for delta could be taken on different cpus due to migration (more
> > so on RT kernel due to the push-pull operations). With the below patch,
> > I could not reproduce the issue I had seen earlier. After every dequeue,
> > we take the delta and start wait measurements from zero when moved to a
> > different rq.
>
> OK, so task delay delay accounting is broken because it doesn't take
> migration into account.
>
> What you've done is make it symmetric wrt enqueue, and account it like
>
>   cpu0      cpu1
>
> enqueue
>  <wait-d1>
> dequeue
>             enqueue
>              <wait-d2>
>             run
>
> Where you add both d1 and d2 to the run_delay,.. right?
>

Thanks for reviewing the patch. The above is exactly what I have done.

> This seems like a good fix, however it looks like the patch will break
> compilation in !CONFIG_SCHEDSTATS && !CONFIG_TASK_DELAY_ACCT, of it
> failing to provide a stub for sched_info_dequeue() in that case.

Fixed. Pl. find the new patch below.

Signed-off-by: Ankita Garg <ankita@in.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Gregory Haskins <ghaskins@novell.com>
Cc: rostedt@goodmis.org
Cc: suresh.b.siddha@intel.com
Cc: aneesh.kumar@linux.vnet.ibm.com
Cc: dhaval@linux.vnet.ibm.com
Cc: vatsa@linux.vnet.ibm.com
Cc: David Bahi <DBahi@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       |  1 +
 kernel/sched_stats.h | 42 +++++++++++++++++++++++++++++++++---------
 2 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index bbc40c3a0657..996bc15196a5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1714,6 +1714,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 		p->se.last_wakeup = 0;
 	}
 
+	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
 }
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 80179ef7450e..8385d43987e2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 	if (rq)
 		rq->rq_sched_info.cpu_time += delta;
 }
+
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+	if (rq)
+		rq->rq_sched_info.run_delay += delta;
+}
 # define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
 # define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
 # define schedstat_set(var, val)	do { var = (val); } while (0)
@@ -126,6 +133,9 @@ static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
 {}
 static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{}
+static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
 # define schedstat_inc(rq, field)	do { } while (0)
@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 #endif
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+	t->sched_info.last_queued = 0;
+}
+
 /*
  * Called when a process is dequeued from the active array and given
  * the cpu.  We should note that with the exception of interactive
@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
  * active queue, thus delaying tasks in the expired queue from running;
  * see scheduler_tick()).
  *
- * This function is only called from sched_info_arrive(), rather than
- * dequeue_task(). Even though a task may be queued and dequeued multiple
- * times as it is shuffled about, we're really interested in knowing how
- * long it was from the *first* time it was queued to the time that it
- * finally hit a cpu.
+ * Though we are interested in knowing how long it was from the *first* time a
+ * task was queued to the time that it finally hit a cpu, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across cpus. The
+ * delta taken on each cpu would annul the skew.
  */
 static inline void sched_info_dequeued(struct task_struct *t)
 {
-	t->sched_info.last_queued = 0;
+	unsigned long long now = task_rq(t)->clock, delta = 0;
+
+	if (unlikely(sched_info_on()))
+		if (t->sched_info.last_queued)
+			delta = now - t->sched_info.last_queued;
+	sched_info_reset_dequeued(t);
+	t->sched_info.run_delay += delta;
+
+	rq_sched_info_dequeued(task_rq(t), delta);
 }
 
 /*
@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
 
 	if (t->sched_info.last_queued)
 		delta = now - t->sched_info.last_queued;
-	sched_info_dequeued(t);
+	sched_info_reset_dequeued(t);
 	t->sched_info.run_delay += delta;
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcount++;
@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 		__sched_info_switch(prev, next);
 }
 #else
-#define sched_info_queued(t)		do { } while (0)
-#define sched_info_switch(t, next)	do { } while (0)
+#define sched_info_queued(t)			do { } while (0)
+#define sched_info_reset_dequeued(t)	do { } while (0)
+#define sched_info_dequeued(t)			do { } while (0)
+#define sched_info_switch(t, next)		do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
-- 
cgit v1.2.3


From cde53535991fbb5c34a1566f25955297c1487b8d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 4 Jul 2008 09:59:22 -0700
Subject: Christoph has moved

Remove all clameter@sgi.com addresses from the kernel tree since they will
become invalid on June 27th.  Change my maintainer email address for the
slab allocators to cl@linux-foundation.org (which will be the new email
address for the future).

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29fc39f1029c..ce7799540c91 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -13,7 +13,7 @@
  *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
  *   Theodore Ts'o <tytso@mit.edu>
  *
- * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
+ * Made to use alloc_percpu by Christoph Lameter.
  */
 
 #include <linux/module.h>
-- 
cgit v1.2.3


From 086f7316f0d400806d76323beefae996bb3849b1 Mon Sep 17 00:00:00 2001
From: "Andrew G. Morgan" <morgan@kernel.org>
Date: Fri, 4 Jul 2008 09:59:58 -0700
Subject: security: filesystem capabilities: fix fragile setuid fixup code

This commit includes a bugfix for the fragile setuid fixup code in the
case that filesystem capabilities are supported (in access()).  The effect
of this fix is gated on filesystem capability support because changing
securebits is only supported when filesystem capabilities support is
configured.)

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Andrew G. Morgan <morgan@kernel.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/capability.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index cfbe44299488..901e0fdc3fff 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -121,6 +121,27 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
  * uninteresting and/or not to be changed.
  */
 
+/*
+ * Atomically modify the effective capabilities returning the original
+ * value. No permission check is performed here - it is assumed that the
+ * caller is permitted to set the desired effective capabilities.
+ */
+kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
+{
+	kernel_cap_t pE_old;
+
+	spin_lock(&task_capability_lock);
+
+	pE_old = current->cap_effective;
+	current->cap_effective = pE_new;
+
+	spin_unlock(&task_capability_lock);
+
+	return pE_old;
+}
+
+EXPORT_SYMBOL(cap_set_effective);
+
 /**
  * sys_capget - get the capabilities of a given process.
  * @header: pointer to struct that contains capability version and
-- 
cgit v1.2.3


From 3b7253238801a7b97b3929d8db2fa7a0721fb17b Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 16 Jun 2008 15:51:08 -0700
Subject: softlockup: print a module list on being stuck

Most places in the kernel that go BUG: print a module list
(which is very useful for doing statistics and finding patterns),
however the softlockup detector does not do this yet.

This patch adds the one line change to fix this gap.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c828c2339cc9..a272d78185eb 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -120,6 +120,7 @@ void softlockup_tick(void)
 	printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
 			this_cpu, now - touch_timestamp,
 			current->comm, task_pid_nr(current));
+	print_modules();
 	if (regs)
 		show_regs(regs);
 	else
-- 
cgit v1.2.3


From aa276e1cafb3ce9d01d1e837bcd67e92616013ac Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Jun 2008 19:15:00 +0200
Subject: x86, clockevents: add C1E aware idle function

C1E on AMD machines is like C3 but without control from the OS. Up to
now we disabled the local apic timer for those machines as it stops
when the CPU goes into C1E. This excludes those machines from high
resolution timers / dynamic ticks, which hurts especially X2 based
laptops.

The current boot time C1E detection has another, more serious flaw
as well: some BIOSes do not enable C1E until the ACPI processor module
is loaded. This causes systems to stop working after that point.

To work nicely with C1E enabled machines we use a separate idle
function, which checks on idle entry whether C1E was enabled in the
Interrupt Pending Message MSR. This allows us to do timer broadcasting
for C1E and covers the late enablement of C1E as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-broadcast.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 57a1f02e5ec0..67f80c261709 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -30,6 +30,7 @@
 struct tick_device tick_broadcast_device;
 static cpumask_t tick_broadcast_mask;
 static DEFINE_SPINLOCK(tick_broadcast_lock);
+static int tick_broadcast_force;
 
 #ifdef CONFIG_TICK_ONESHOT
 static void tick_broadcast_clear_oneshot(int cpu);
@@ -232,10 +233,11 @@ static void tick_do_broadcast_on_off(void *why)
 						     CLOCK_EVT_MODE_SHUTDOWN);
 		}
 		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
-			dev->features |= CLOCK_EVT_FEAT_DUMMY;
+			tick_broadcast_force = 1;
 		break;
 	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
-		if (cpu_isset(cpu, tick_broadcast_mask)) {
+		if (!tick_broadcast_force &&
+		    cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_clear(cpu, tick_broadcast_mask);
 			if (td->mode == TICKDEV_MODE_PERIODIC)
 				tick_setup_periodic(dev, 0);
-- 
cgit v1.2.3


From 076ac2af86c3b7f89ac31bc50a7508d3e035b786 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 12 May 2008 21:21:12 +0200
Subject: sched, numa: replace MAX_NUMNODES with nr_node_ids in kernel/sched.c

  * Replace usages of MAX_NUMNODES with nr_node_ids in kernel/sched.c,
    where appropriate.  This saves some allocated space as well as many
    wasted cycles going through node entries that are non-existent.

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 94ead43eda62..bcc22b569ee9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6538,9 +6538,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
 
 	min_val = INT_MAX;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
+		n = (node + i) % nr_node_ids;
 
 		if (!nr_cpus_node(n))
 			continue;
@@ -6734,7 +6734,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 		if (!sched_group_nodes)
 			continue;
 
-		for (i = 0; i < MAX_NUMNODES; i++) {
+		for (i = 0; i < nr_node_ids; i++) {
 			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
 			*nodemask = node_to_cpumask(i);
@@ -6927,7 +6927,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	/*
 	 * Allocate the per-node list of sched groups
 	 */
-	sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
+	sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
 				    GFP_KERNEL);
 	if (!sched_group_nodes) {
 		printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7066,7 +7066,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
 
 	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
 		SCHED_CPUMASK_VAR(send_covered, allmasks);
 
@@ -7090,7 +7090,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 					send_covered, tmpmask);
 	}
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < nr_node_ids; i++) {
 		/* Set up node groups */
 		struct sched_group *sg, *prev;
 		SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7129,9 +7129,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		cpus_or(*covered, *covered, *nodemask);
 		prev = sg;
 
-		for (j = 0; j < MAX_NUMNODES; j++) {
+		for (j = 0; j < nr_node_ids; j++) {
 			SCHED_CPUMASK_VAR(notcovered, allmasks);
-			int n = (i + j) % MAX_NUMNODES;
+			int n = (i + j) % nr_node_ids;
 			node_to_cpumask_ptr(pnodemask, n);
 
 			cpus_complement(*notcovered, *covered);
@@ -7184,7 +7184,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++)
+	for (i = 0; i < nr_node_ids; i++)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
 	if (sd_allnodes) {
-- 
cgit v1.2.3


From a29d1cfe9e9337aedeed505afddc8465ac709b87 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 2 Jun 2008 13:19:08 +0200
Subject: printk: export console_drivers

this symbol is needed by drivers/video/xen-fbfront.ko.

[ cherry-picked from tip/core/printk ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 028ed75d4864..1fb1382009f3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress);
 static DECLARE_MUTEX(console_sem);
 static DECLARE_MUTEX(secondary_console_sem);
 struct console *console_drivers;
+EXPORT_SYMBOL_GPL(console_drivers);
+
 /*
  * This is used for debugging the mess that is the VT code by
  * keeping track if we have the console semaphore held. It's
-- 
cgit v1.2.3


From 7683c57c489bd17795945f4ae1c1d73e7c7b38e3 Mon Sep 17 00:00:00 2001
From: Daniel Guilak <guilak@linux.vnet.ibm.com>
Date: Tue, 8 Jul 2008 15:02:06 -0700
Subject: kernel/printk.c: Made printk_recursion_bug_msg static.

Signed-off-by: Daniel Guilak <daniel@danielguilak.com>
Acked-by: Josh Triplett <josh@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..e2129e83fd75 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -666,7 +666,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
 	return retval;
 }
 
-const char printk_recursion_bug_msg [] =
+static const char printk_recursion_bug_msg [] =
 			KERN_CRIT "BUG: recent printk recursion!\n";
 static int printk_recursion_bug;
 
-- 
cgit v1.2.3


From 48627d8d23c34106c1365563604739a50343edaf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Jul 2008 07:01:13 +0200
Subject: genirq: remove extraneous checks in manage.c

In http://bugzilla.kernel.org/show_bug.cgi?id=9580 it was pointed out
that the desc->chip checks are extraneous. In fact these are left
overs from early development and can be removed safely.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 469814e9b9ee..77a51be36010 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -377,7 +377,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 
 		/* Setup the type (level, edge polarity) if configured: */
 		if (new->flags & IRQF_TRIGGER_MASK) {
-			if (desc->chip && desc->chip->set_type)
+			if (desc->chip->set_type)
 				desc->chip->set_type(irq,
 						new->flags & IRQF_TRIGGER_MASK);
 			else
@@ -387,8 +387,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 				 */
 				printk(KERN_WARNING "No IRQF_TRIGGER set_type "
 				       "function for IRQ %d (%s)\n", irq,
-				       desc->chip ? desc->chip->name :
-				       "unknown");
+				       desc->chip->name);
 		} else
 			compat_irq_chip_set_default_handler(desc);
 
-- 
cgit v1.2.3


From dc7fab8b3bb388c57c6c4a43ba68c8a32ca25204 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Thu, 10 Jul 2008 00:32:40 +0200
Subject: sched: fix cpu hotplug

I think we may have a race between try_to_wake_up() and
migrate_live_tasks() -> move_task_off_dead_cpu() when the later one
may end up looping endlessly.

Interrupts are enabled on other CPUs when migration_call(CPU_DEAD, ...) is
called so we may get a race between try_to_wake_up() and
migrate_live_tasks() -> move_task_off_dead_cpu(). The former one may push
a task out of a dead CPU causing the later one to loop endlessly.

Heiko Carstens observed:

| That's exactly what explains a dump I got yesterday. Thanks for fixing! :)

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: miaox@cn.fujitsu.com
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 94ead43eda62..9397b8710138 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5621,8 +5621,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 
 	double_rq_lock(rq_src, rq_dest);
 	/* Already moved. */
-	if (task_cpu(p) != src_cpu)
+	if (task_cpu(p) != src_cpu) {
+		ret = 1;
 		goto out;
+	}
 	/* Affinity changed (again). */
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 		goto out;
-- 
cgit v1.2.3


From 544304b200c3869bc1312bcf941c4cf04d65b56c Mon Sep 17 00:00:00 2001
From: Daniel Guilak <guilak@linux.vnet.ibm.com>
Date: Thu, 10 Jul 2008 09:38:19 -0700
Subject: kernel/kprobes.c: Made kprobe_blacklist static.

Signed-off-by: Daniel Guilak <daniel@danielguilak.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d4998f81e229..1485ca8d0e00 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -79,7 +79,7 @@ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
  *
  * For such cases, we now have a blacklist
  */
-struct kprobe_blackpoint kprobe_blacklist[] = {
+static struct kprobe_blackpoint kprobe_blacklist[] = {
 	{"preempt_schedule",},
 	{NULL}    /* Terminator */
 };
-- 
cgit v1.2.3


From 70ff05554f91a1edda1f11684da1dbde09e2feea Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Thu, 10 Jul 2008 17:25:35 +1000
Subject: Fix PREEMPT_RCU without HOTPLUG_CPU

PREEMPT_RCU without HOTPLUG_CPU is broken.  The rcu_online_cpu is called
to initially populate rcu_cpu_online_map with all online CPUs when the
hotplug event handler is installed, and also to populate the map with
CPUs as they come online.  The former case is meant to happen with and
without HOTPLUG_CPU, but without HOTPLUG_CPU, the rcu_offline_cpu
function is no-oped -- while it still gets called, it does not set the
rcu CPU map.

With a blank RCU CPU map, grace periods get to tick by completely
oblivious to active RCU read side critical sections.  This results in
free-before-grace bugs.

Fix is obvious once the problem is known. (Also, change __devinit to
__cpuinit so the function gets thrown away on !HOTPLUG_CPU kernels).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Reported-and-tested-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ Nick is my personal hero of the day - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/rcupreempt.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5e02b7740702..41d275a81df5 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -925,26 +925,22 @@ void rcu_offline_cpu(int cpu)
 	spin_unlock_irqrestore(&rdp->lock, flags);
 }
 
-void __devinit rcu_online_cpu(int cpu)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
-	cpu_set(cpu, rcu_cpu_online_map);
-	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-}
-
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
 void rcu_offline_cpu(int cpu)
 {
 }
 
-void __devinit rcu_online_cpu(int cpu)
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+void __cpuinit rcu_online_cpu(int cpu)
 {
-}
+	unsigned long flags;
 
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
+	cpu_set(cpu, rcu_cpu_online_map);
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+}
 
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
-- 
cgit v1.2.3


From b1e387348a2a70954312b102d0589c3e2ca3dba1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 10 Jul 2008 11:25:03 -0700
Subject: sched: fix cpu hotplug, cleanup

Clean up __migrate_task(): to just have separate "done" and "fail"
cases, instead of that "out" case with random error behavior.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9397b8710138..4e2f60335656 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5621,13 +5621,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 
 	double_rq_lock(rq_src, rq_dest);
 	/* Already moved. */
-	if (task_cpu(p) != src_cpu) {
-		ret = 1;
-		goto out;
-	}
+	if (task_cpu(p) != src_cpu)
+		goto done;
 	/* Affinity changed (again). */
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
-		goto out;
+		goto fail;
 
 	on_rq = p->se.on_rq;
 	if (on_rq)
@@ -5638,8 +5636,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 		activate_task(rq_dest, p, 0);
 		check_preempt_curr(rq_dest, p);
 	}
+done:
 	ret = 1;
-out:
+fail:
 	double_rq_unlock(rq_src, rq_dest);
 	return ret;
 }
-- 
cgit v1.2.3


From 857f3fd7a496ddf4329345af65a4a2b16dd25fe8 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Fri, 11 Jul 2008 11:09:22 +0200
Subject: nohz: don't stop idle tick if softirqs are pending.

In case a cpu goes idle but softirqs are pending only an error message is
printed to the console. It may take a very long time until the pending
softirqs will finally be executed. Worst case would be a hanging system.

With this patch the timer tick just continues and the softirqs will be
executed after the next interrupt. Still a delay but better than a
hanging system.

Currently we have at least two device drivers on s390 which under certain
circumstances schedule a tasklet from process context. This is a reason
why we can end up with pending softirqs when going idle. Fixing these
drivers seems to be non-trivial.
However there is no question that the drivers should be fixed.
This patch shouldn't be considered as a bug fix. It just is intended to
keep a system running even if device drivers are buggy.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jan Glauber <jan.glauber@de.ibm.com>
Cc: Stefan Weinhuber <wein@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb75394ed00e..86baa4f0dfe4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -235,6 +235,7 @@ void tick_nohz_stop_sched_tick(void)
 			       local_softirq_pending());
 			ratelimit++;
 		}
+		goto end;
 	}
 
 	ts->idle_calls++;
-- 
cgit v1.2.3


From 007c05d4d2ce42fabd58cb54ed98e0a1714d9d86 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Jul 2008 20:58:09 -0400
Subject: ftrace: move sched_switch enable after markers

We have two markers now that are enabled on sched_switch. One that records
the context switching and the other that records task wake ups. Currently
we enable the tracing first and then set the markers. This causes some
confusing traces:

# tracer: sched_switch
#
#           TASK-PID   CPU#    TIMESTAMP  FUNCTION
#              | |      |          |         |
       trace-cmd-3973  [00]   115.834817:   3973:120:R   +     3:  0:S
       trace-cmd-3973  [01]   115.834910:   3973:120:R   +     6:  0:S
       trace-cmd-3973  [02]   115.834910:   3973:120:R   +     9:  0:S
       trace-cmd-3973  [03]   115.834910:   3973:120:R   +    12:  0:S
       trace-cmd-3973  [02]   115.834910:   3973:120:R   +     9:  0:S
          <idle>-0     [02]   115.834910:      0:140:R ==>  3973:120:R

Here we see that trace-cmd with PID 3973 wakes up task 9 but the next line
shows the idle task doing a context switch to task 3973.

Enabling the tracing to _after_ the markers are set creates a much saner
output:

# tracer: sched_switch
#
#           TASK-PID   CPU#    TIMESTAMP  FUNCTION
#              | |      |          |         |
          <idle>-0     [02]  7922.634225:      0:140:R ==>  4790:120:R
       trace-cmd-4789  [03]  7922.634225:      0:140:R   +  4790:120:R

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_switch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 93a662009151..cb817a209aa0 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -227,14 +227,14 @@ void tracing_stop_cmdline_record(void)
 static void start_sched_trace(struct trace_array *tr)
 {
 	sched_switch_reset(tr);
-	tracer_enabled = 1;
 	tracing_start_cmdline_record();
+	tracer_enabled = 1;
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
-	tracing_stop_cmdline_record();
 	tracer_enabled = 0;
+	tracing_stop_cmdline_record();
 }
 
 static void sched_switch_trace_init(struct trace_array *tr)
-- 
cgit v1.2.3


From 001b6767b1d0c89e458e5ddb039245b268f569fb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Jul 2008 20:58:10 -0400
Subject: ftrace: define function trace nop

When CONFIG_FTRACE is not enabled, the tracing_start_functon_trace
and tracing_stop_function_trace should be nops.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b8bd8800d04..df840f42be39 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -216,8 +216,6 @@ void trace_function(struct trace_array *tr,
 		    unsigned long parent_ip,
 		    unsigned long flags);
 
-void tracing_start_function_trace(void);
-void tracing_stop_function_trace(void);
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
 int register_tracer(struct tracer *type);
@@ -234,6 +232,14 @@ void update_max_tr_single(struct trace_array *tr,
 
 extern cycle_t ftrace_now(int cpu);
 
+#ifdef CONFIG_FTRACE
+void tracing_start_function_trace(void);
+void tracing_stop_function_trace(void);
+#else
+# define tracing_start_function_trace()		do { } while (0)
+# define tracing_stop_function_trace()		do { } while (0)
+#endif
+
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 typedef void
 (*tracer_switch_func_t)(void *private,
-- 
cgit v1.2.3


From 1e16c0a081f6c93f04c6af784d6a160955269f91 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Jul 2008 20:58:11 -0400
Subject: ftrace: trace schedule

After the sched_clock code has been removed from sched.c we can now trace
the scheduler. The scheduler has a lot of functions that would be worth
tracing.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index ca2433e84873..480976275d98 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
-CFLAGS_REMOVE_sched.o = -pg -mno-spe
+CFLAGS_REMOVE_sched.o = -mno-spe
 
 ifdef CONFIG_FTRACE
 # Do not trace debug files and internal ftrace files
-- 
cgit v1.2.3


From b5c21b4514b38f450848feb432f7120376d01ffe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Jul 2008 20:58:12 -0400
Subject: ftrace: check proper config for preempt type

There is no CONFIG_PREEMPT_DESKTOP. Use the proper entry CONFIG_PREEMPT.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ade79369bfb..f8fdb9cedc24 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1341,7 +1341,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 		   "server",
 #elif defined(CONFIG_PREEMPT_VOLUNTARY)
 		   "desktop",
-#elif defined(CONFIG_PREEMPT_DESKTOP)
+#elif defined(CONFIG_PREEMPT)
 		   "preempt",
 #else
 		   "unknown",
-- 
cgit v1.2.3


From ad591240ceadcaf41b2a88855ca5f1c77c5a0298 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Jul 2008 20:58:13 -0400
Subject: ftrace: start wakeup tracing after setting function tracer

Enabling the wakeup tracer before enabling the function tracing causes
some strange results due to the dynamic enabling of the functions.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index bf7e91caef57..3c8d61df4474 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -352,9 +352,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
 	 */
 	smp_wmb();
 
-	tracer_enabled = 1;
 	register_ftrace_function(&trace_ops);
 
+	tracer_enabled = 1;
+
 	return;
 fail_deprobe_wake_new:
 	marker_probe_unregister("kernel_sched_wakeup_new",
-- 
cgit v1.2.3


From 26bc83f4cb911a0b4dabfe23b700aaf3235f2955 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Jul 2008 20:58:14 -0400
Subject: ftrace: use current CPU for function startup

This is more of a clean up. Currently the function tracer initializes the
tracer with which ever CPU was last used for tracing. This value isn't
realy useful for function tracing, but at least it should be something other
than a random number.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 7ee7dcd76b7d..312144897970 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -28,7 +28,10 @@ static void function_reset(struct trace_array *tr)
 
 static void start_function_trace(struct trace_array *tr)
 {
+	tr->cpu = get_cpu();
 	function_reset(tr);
+	put_cpu();
+
 	tracing_start_cmdline_record();
 	tracing_start_function_trace();
 }
-- 
cgit v1.2.3


From a2bb6a3d85ef3124cd336403a95abc0540d3fbe2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Jul 2008 20:58:15 -0400
Subject: ftrace: add ftrace_kill_atomic

It has been suggested that I add a way to disable the function tracer
on an oops. This code adds a ftrace_kill_atomic. It is not meant to be
used in normal situations. It will disable the ftrace tracer, but will
not perform the nice shutdown that requires scheduling.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0f271c45cd02..1359632668a4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1601,6 +1601,21 @@ core_initcall(ftrace_dynamic_init);
 # define ftrace_force_shutdown()	do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
+/**
+ * ftrace_kill_atomic - kill ftrace from critical sections
+ *
+ * This function should be used by panic code. It stops ftrace
+ * but in a not so nice way. If you need to simply kill ftrace
+ * from a non-atomic section, use ftrace_kill.
+ */
+void ftrace_kill_atomic(void)
+{
+	ftrace_disabled = 1;
+	ftrace_enabled = 0;
+	ftraced_suspend = -1;
+	clear_ftrace_function();
+}
+
 /**
  * ftrace_kill - totally shutdown ftrace
  *
-- 
cgit v1.2.3


From 60bc080090e3bf6afa29c62cb25f913706551010 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 10 Jul 2008 20:58:16 -0400
Subject: ftrace: separate out the function enabled variable

Currently the function tracer uses the global tracer_enabled variable that
is used to keep track if the tracer is enabled or not. The function tracing
startup needs to be separated out, otherwise the internal happenings of
the tracer startup is also recorded.

This patch creates a ftrace_function_enabled variable to all the starting
of the function traces to happen after everything has been started.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8fdb9cedc24..2e37857f7dfe 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -96,6 +96,9 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
 /* tracer_enabled is used to toggle activation of a tracer */
 static int			tracer_enabled = 1;
 
+/* function tracing enabled */
+int				ftrace_function_enabled;
+
 /*
  * trace_nr_entries is the number of entries that is allocated
  * for a buffer. Note, the number of entries is always rounded
@@ -134,6 +137,7 @@ static notrace void no_trace_init(struct trace_array *tr)
 {
 	int cpu;
 
+	ftrace_function_enabled = 0;
 	if(tr->ctrl)
 		for_each_online_cpu(cpu)
 			tracing_reset(tr->data[cpu]);
@@ -985,7 +989,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 	long disabled;
 	int cpu;
 
-	if (unlikely(!tracer_enabled))
+	if (unlikely(!ftrace_function_enabled))
 		return;
 
 	if (skip_trace(ip))
@@ -1010,11 +1014,15 @@ static struct ftrace_ops trace_ops __read_mostly =
 
 void tracing_start_function_trace(void)
 {
+	ftrace_function_enabled = 0;
 	register_ftrace_function(&trace_ops);
+	if (tracer_enabled)
+		ftrace_function_enabled = 1;
 }
 
 void tracing_stop_function_trace(void)
 {
+	ftrace_function_enabled = 0;
 	unregister_ftrace_function(&trace_ops);
 }
 #endif
@@ -1850,8 +1858,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 		m->private = iter;
 
 		/* stop the trace while dumping */
-		if (iter->tr->ctrl)
+		if (iter->tr->ctrl) {
 			tracer_enabled = 0;
+			ftrace_function_enabled = 0;
+		}
 
 		if (iter->trace && iter->trace->open)
 			iter->trace->open(iter);
@@ -1884,8 +1894,14 @@ int tracing_release(struct inode *inode, struct file *file)
 		iter->trace->close(iter);
 
 	/* reenable tracing if it was previously enabled */
-	if (iter->tr->ctrl)
+	if (iter->tr->ctrl) {
 		tracer_enabled = 1;
+		/*
+		 * It is safe to enable function tracing even if it
+		 * isn't used
+		 */
+		ftrace_function_enabled = 1;
+	}
 	mutex_unlock(&trace_types_lock);
 
 	seq_release(inode, file);
-- 
cgit v1.2.3


From 62c43dd9864dbd52ff158922d1d08c75f20335af Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 7 Jul 2008 14:16:50 -0400
Subject: sched_clock: record from last tick

The sched_clock code tries to keep within the gtod time by one tick (jiffy).
The current code mistakenly keeps track of the delta jiffies between
updates of the clock, where the the delta is used to compare with the
number of jiffies that have past since an update of the gtod. The gtod is
updated at each schedule tick not each sched_clock update. After one
jiffy passes the clock is updated fine. But the delta is taken from the
last update so if the next update happens before the next tick the delta
jiffies used will be incorrect.

This patch changes the code to check the delta of jiffies between ticks
and not updates to match the comparison of the updates with the gtod.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ce05271219ab..e383bc7df6dd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -40,7 +40,7 @@ struct sched_clock_data {
 	 */
 	raw_spinlock_t		lock;
 
-	unsigned long		prev_jiffies;
+	unsigned long		tick_jiffies;
 	u64			prev_raw;
 	u64			tick_raw;
 	u64			tick_gtod;
@@ -71,7 +71,7 @@ void sched_clock_init(void)
 		struct sched_clock_data *scd = cpu_sdc(cpu);
 
 		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-		scd->prev_jiffies = now_jiffies;
+		scd->tick_jiffies = now_jiffies;
 		scd->prev_raw = 0;
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
@@ -90,7 +90,7 @@ void sched_clock_init(void)
 static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 {
 	unsigned long now_jiffies = jiffies;
-	long delta_jiffies = now_jiffies - scd->prev_jiffies;
+	long delta_jiffies = now_jiffies - scd->tick_jiffies;
 	u64 clock = scd->clock;
 	u64 min_clock, max_clock;
 	s64 delta = now - scd->prev_raw;
@@ -119,7 +119,6 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 		clock = min_clock;
 
 	scd->prev_raw = now;
-	scd->prev_jiffies = now_jiffies;
 	scd->clock = clock;
 }
 
@@ -179,6 +178,7 @@ u64 sched_clock_cpu(int cpu)
 void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd = this_scd();
+	unsigned long now_jiffies = jiffies;
 	u64 now, now_gtod;
 
 	if (unlikely(!sched_clock_running))
@@ -196,6 +196,7 @@ void sched_clock_tick(void)
 	 * already observe 1 new jiffy; adding a new tick_gtod to that would
 	 * increase the clock 2 jiffies.
 	 */
+	scd->tick_jiffies = now_jiffies;
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
 	__raw_spin_unlock(&scd->lock);
-- 
cgit v1.2.3


From f7cce27f5605b9e137b829a47949cb2d3c7e1cab Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 7 Jul 2008 14:16:51 -0400
Subject: sched_clock: widen the max and min time

With keeping the max and min sched time within one jiffy of the gtod clock
was too tight. Just before a schedule tick the max could easily be hit, as
well as just after a schedule_tick the min could be hit. This caused the
clock to jump around by a jiffy.

This patch widens the minimum to
   last gtod + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSECS

and the maximum to
    last gtod + (2 + delta_jiffies) * TICK_NSECS

This keeps the minum to gtod or if one jiffy less than delta jiffies
and the maxim 2 jiffies ahead of gtod. This may cause unstable TSCs to be
a bit more sporadic, but it helps keep a clock with a stable TSC working well.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e383bc7df6dd..42b81fa38cbd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -96,14 +96,21 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	s64 delta = now - scd->prev_raw;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
+
+	min_clock = scd->tick_gtod +
+		(delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC;
 
 	if (unlikely(delta < 0)) {
 		clock++;
 		goto out;
 	}
 
-	max_clock = min_clock + TICK_NSEC;
+	/*
+	 * The clock must stay within a jiffie of the gtod.
+	 * But since we may be at the start of a jiffy or the end of one
+	 * we add another jiffy buffer.
+	 */
+	max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
 
 	if (unlikely(clock + delta > max_clock)) {
 		if (clock < max_clock)
-- 
cgit v1.2.3


From af52a90a14cdaa54ecbfb6e6982abb13466a4b56 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 7 Jul 2008 14:16:52 -0400
Subject: sched_clock: stop maximum check on NO HZ

Working with ftrace I would get large jumps of 11 millisecs or more with
the clock tracer. This killed the latencing timings of ftrace and also
caused the irqoff self tests to fail.

What was happening is with NO_HZ the idle would stop the jiffy counter and
before the jiffy counter was updated the sched_clock would have a bad
delta jiffies to compare with the gtod with the maximum.

The jiffies would stop and the last sched_tick would record the last gtod.
On wakeup, the sched clock update would compare the gtod + delta jiffies
(which would be zero) and compare it to the TSC. The TSC would have
correctly (with a stable TSC) moved forward several jiffies. But because the
jiffies has not been updated yet the clock would be prevented from moving
forward because it would appear that the TSC jumped too far ahead.

The clock would then virtually stop, until the jiffies are updated. Then
the next sched clock update would see that the clock was very much behind
since the delta jiffies is now correct. This would then jump the clock
forward by several jiffies.

This caused ftrace to report several milliseconds of interrupts off
latency at every resume from NO_HZ idle.

This patch adds hooks into the nohz code to disable the checking of the
maximum clock update when nohz is in effect. It resumes the max check
when nohz has updated the jiffies again.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c     | 39 ++++++++++++++++++++++++++++++++++++++-
 kernel/time/tick-sched.c |  2 ++
 2 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 42b81fa38cbd..97159e225a77 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -45,6 +45,9 @@ struct sched_clock_data {
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
+#ifdef CONFIG_NO_HZ
+	int			check_max;
+#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -76,11 +79,45 @@ void sched_clock_init(void)
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
+#ifdef CONFIG_NO_HZ
+		scd->check_max = 1;
+#endif
 	}
 
 	sched_clock_running = 1;
 }
 
+#ifdef CONFIG_NO_HZ
+/*
+ * The dynamic ticks makes the delta jiffies inaccurate. This
+ * prevents us from checking the maximum time update.
+ * Disable the maximum check during stopped ticks.
+ */
+void sched_clock_tick_stop(int cpu)
+{
+	struct sched_clock_data *scd = cpu_sdc(cpu);
+
+	scd->check_max = 0;
+}
+
+void sched_clock_tick_start(int cpu)
+{
+	struct sched_clock_data *scd = cpu_sdc(cpu);
+
+	scd->check_max = 1;
+}
+
+static int check_max(struct sched_clock_data *scd)
+{
+	return scd->check_max;
+}
+#else
+static int check_max(struct sched_clock_data *scd)
+{
+	return 1;
+}
+#endif /* CONFIG_NO_HZ */
+
 /*
  * update the percpu scd from the raw @now value
  *
@@ -112,7 +149,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	 */
 	max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
 
-	if (unlikely(clock + delta > max_clock)) {
+	if (unlikely(clock + delta > max_clock) && check_max(scd)) {
 		if (clock < max_clock)
 			clock = max_clock;
 		else
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..d63008b09a4c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void)
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
 			rcu_enter_nohz();
+			sched_clock_tick_stop(cpu);
 		}
 
 		/*
@@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void)
 	select_nohz_load_balancer(0);
 	now = ktime_get();
 	tick_do_update_jiffies64(now);
+	sched_clock_tick_start(cpu);
 	cpu_clear(cpu, nohz_cpu_mask);
 
 	/*
-- 
cgit v1.2.3


From 2b8a0cf4890d7537a77b51caa8f508e4a05a0e67 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 7 Jul 2008 19:49:41 -0400
Subject: sched_clock: fix calculation of other CPU

The algorithm to calculate the 'now' of another CPU is not correct.
At each scheduler tick, each CPU records the last sched_clock and
gtod (tick_raw and tick_gtod respectively). If the TSC is somewhat the
same in speed between two clocks the algorithm would be:

  tick_gtod1 + (now1 - tick_raw1) = tick_gtod2 + (now2 - tick_raw2)

To calculate now2 we would have:

  now2 = (tick_gtod1 - tick_gtod2) + (tick_raw2 - tick_raw1) + now1

Currently the algorithm is:

  now2 = (tick_gtod1 - tick_gtod2) + (tick_raw1 - tick_raw2) + now1

This solves most of the rest of the issues I've had with timestamps in
ftace.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 97159e225a77..55fca1e9e12a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -203,8 +203,8 @@ u64 sched_clock_cpu(int cpu)
 		now -= my_scd->tick_raw;
 		now += scd->tick_raw;
 
-		now -= my_scd->tick_gtod;
-		now += scd->tick_gtod;
+		now += my_scd->tick_gtod;
+		now -= scd->tick_gtod;
 
 		__raw_spin_unlock(&my_scd->lock);
 	} else {
-- 
cgit v1.2.3


From c0c87734f125d2fa8ebc70310f3257fa6209f2b6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 9 Jul 2008 00:15:31 -0400
Subject: sched_clock: only update deltas with local reads.

Reading the CPU clock should try to stay accurate within the CPU.
By reading the CPU clock from another CPU and updating the deltas can
cause unneeded jumps when reading from the local CPU.

This patch changes the code to update the last read TSC only when read
from the local CPU.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 55fca1e9e12a..ee7cce5029ce 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -124,7 +124,7 @@ static int check_max(struct sched_clock_data *scd)
  *  - filter out backward motion
  *  - use jiffies to generate a min,max window to clip the raw values
  */
-static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
+static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
 {
 	unsigned long now_jiffies = jiffies;
 	long delta_jiffies = now_jiffies - scd->tick_jiffies;
@@ -162,8 +162,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	if (unlikely(clock < min_clock))
 		clock = min_clock;
 
-	scd->prev_raw = now;
-	scd->clock = clock;
+	if (time)
+		*time = clock;
+	else {
+		scd->prev_raw = now;
+		scd->clock = clock;
+	}
 }
 
 static void lock_double_clock(struct sched_clock_data *data1,
@@ -207,15 +211,18 @@ u64 sched_clock_cpu(int cpu)
 		now -= scd->tick_gtod;
 
 		__raw_spin_unlock(&my_scd->lock);
+
+		__update_sched_clock(scd, now, &clock);
+
+		__raw_spin_unlock(&scd->lock);
+
 	} else {
 		__raw_spin_lock(&scd->lock);
+		__update_sched_clock(scd, now, NULL);
+		clock = scd->clock;
+		__raw_spin_unlock(&scd->lock);
 	}
 
-	__update_sched_clock(scd, now);
-	clock = scd->clock;
-
-	__raw_spin_unlock(&scd->lock);
-
 	return clock;
 }
 
@@ -234,7 +241,7 @@ void sched_clock_tick(void)
 	now_gtod = ktime_to_ns(ktime_get());
 
 	__raw_spin_lock(&scd->lock);
-	__update_sched_clock(scd, now);
+	__update_sched_clock(scd, now, NULL);
 	/*
 	 * update tick_gtod after __update_sched_clock() because that will
 	 * already observe 1 new jiffy; adding a new tick_gtod to that would
-- 
cgit v1.2.3


From a83bc47c33ab182f1e48977fd5a04024d713c75e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 9 Jul 2008 00:15:32 -0400
Subject: sched_clock: record TSC after gtod

To read the gtod we need to grab the xtime lock for read. Reading the gtod
before the TSC can cause a bigger gab if the xtime lock is contended.

This patch simply reverses the order to read the TSC after the gtod.
The locking in the reading of the gtod handles any barriers one might
think is needed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ee7cce5029ce..28ff6bf5e02b 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -237,8 +237,8 @@ void sched_clock_tick(void)
 
 	WARN_ON_ONCE(!irqs_disabled());
 
-	now = sched_clock();
 	now_gtod = ktime_to_ns(ktime_get());
+	now = sched_clock();
 
 	__raw_spin_lock(&scd->lock);
 	__update_sched_clock(scd, now, NULL);
-- 
cgit v1.2.3


From c300ba252829e9325e08f0af60687add94445b25 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 9 Jul 2008 00:15:33 -0400
Subject: sched_clock: and multiplier for TSC to gtod drift

The sched_clock code currently tries to keep all CPU clocks of all CPUS
somewhat in sync. At every clock tick it records the gtod clock and
uses that and jiffies and the TSC to calculate a CPU clock that tries to
stay in sync with all the other CPUs.

ftrace depends heavily on this timer and it detects when this timer
"jumps".  One problem is that the TSC and the gtod also drift.
When the TSC is 0.1% faster or slower than the gtod it is very noticeable
in ftrace. To help compensate for this, I've added a multiplier that
tries to keep the CPU clock updating at the same rate as the gtod.

I've tried various ways to get it to be in sync and this ended up being
the most reliable. At every scheduler tick we calculate the new multiplier:

  multi = delta_gtod / delta_TSC

This means we perform a 64 bit divide at the tick (once a HZ). A shift
is used to handle the accuracy.

Other methods that failed due to dynamic HZ are:

(not used)  multi += (gtod - tsc) / delta_gtod
(not used)  multi += (gtod - (last_tsc + delta_tsc)) / delta_gtod

as well as other variants.

This code still allows for a slight drift between TSC and gtod, but
it keeps the damage down to a minimum.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 28ff6bf5e02b..8affbfd0cdb0 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -3,6 +3,9 @@
  *
  *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
+ *  Updates and enhancements:
+ *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
+ *
  * Based on code by:
  *   Ingo Molnar <mingo@redhat.com>
  *   Guillaume Chazarain <guichaz@gmail.com>
@@ -32,6 +35,11 @@
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 
+#define MULTI_SHIFT 15
+/* Max is double, Min is 1/2 */
+#define MAX_MULTI (2LL << MULTI_SHIFT)
+#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
+
 struct sched_clock_data {
 	/*
 	 * Raw spinlock - this is a special case: this might be called
@@ -45,6 +53,7 @@ struct sched_clock_data {
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
+	s64			multi;
 #ifdef CONFIG_NO_HZ
 	int			check_max;
 #endif
@@ -79,6 +88,7 @@ void sched_clock_init(void)
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
+		scd->multi = 1 << MULTI_SHIFT;
 #ifdef CONFIG_NO_HZ
 		scd->check_max = 1;
 #endif
@@ -134,8 +144,13 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim
 
 	WARN_ON_ONCE(!irqs_disabled());
 
-	min_clock = scd->tick_gtod +
-		(delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC;
+	/*
+	 * At schedule tick the clock can be just under the gtod. We don't
+	 * want to push it too prematurely.
+	 */
+	min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
+	if (min_clock > TICK_NSEC)
+		min_clock -= TICK_NSEC / 2;
 
 	if (unlikely(delta < 0)) {
 		clock++;
@@ -149,6 +164,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim
 	 */
 	max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
 
+	delta *= scd->multi;
+	delta >>= MULTI_SHIFT;
+
 	if (unlikely(clock + delta > max_clock) && check_max(scd)) {
 		if (clock < max_clock)
 			clock = max_clock;
@@ -230,6 +248,7 @@ void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd = this_scd();
 	unsigned long now_jiffies = jiffies;
+	s64 mult, delta_gtod, delta_raw;
 	u64 now, now_gtod;
 
 	if (unlikely(!sched_clock_running))
@@ -247,9 +266,23 @@ void sched_clock_tick(void)
 	 * already observe 1 new jiffy; adding a new tick_gtod to that would
 	 * increase the clock 2 jiffies.
 	 */
-	scd->tick_jiffies = now_jiffies;
+	delta_gtod = now_gtod - scd->tick_gtod;
+	delta_raw = now - scd->tick_raw;
+
+	if ((long)delta_raw > 0) {
+		mult = delta_gtod << MULTI_SHIFT;
+		do_div(mult, delta_raw);
+		scd->multi = mult;
+		if (scd->multi > MAX_MULTI)
+			scd->multi = MAX_MULTI;
+		else if (scd->multi < MIN_MULTI)
+			scd->multi = MIN_MULTI;
+	} else
+		scd->multi = 1 << MULTI_SHIFT;
+
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
+	scd->tick_jiffies = now_jiffies;
 	__raw_spin_unlock(&scd->lock);
 }
 
@@ -279,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	__raw_spin_lock(&scd->lock);
 	scd->prev_raw = now;
 	scd->clock += delta_ns;
+	scd->multi = 1 << MULTI_SHIFT;
 	__raw_spin_unlock(&scd->lock);
 
 	touch_softlockup_watchdog();
-- 
cgit v1.2.3


From b2613e370dbeb69edbff989382fa54f2395aa471 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Jul 2008 16:44:27 +0200
Subject: ftrace: build fix for ftraced_suspend

fix:

 kernel/trace/ftrace.c:1615: error: 'ftraced_suspend' undeclared (first use in this function)
 kernel/trace/ftrace.c:1615: error: (Each undeclared identifier is reported only once
 kernel/trace/ftrace.c:1615: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1359632668a4..4231a3dc224a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1612,7 +1612,9 @@ void ftrace_kill_atomic(void)
 {
 	ftrace_disabled = 1;
 	ftrace_enabled = 0;
+#ifdef CONFIG_DYNAMIC_FTRACE
 	ftraced_suspend = -1;
+#endif
 	clear_ftrace_function();
 }
 
-- 
cgit v1.2.3


From 72b1e22dfcad1daca6906148fd956ffe404bb0bc Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 10 Jul 2008 11:16:45 -0700
Subject: x64, x2apic/intr-remap: generic irq migration support from process
 context

Generic infrastructure for migrating the irq from the process context in the
presence of CONFIG_GENERIC_PENDING_IRQ.

This will be used later for migrating irq in the presence of
interrupt-remapping.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: akpm@linux-foundation.org
Cc: arjan@linux.intel.com
Cc: andi@firstfloor.org
Cc: ebiederm@xmission.com
Cc: jbarnes@virtuousgeek.org
Cc: steiner@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46d6611a33bb..628b5572a7c2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -87,7 +87,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 	set_balance_irq_affinity(irq, cpumask);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	set_pending_irq(irq, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		desc->chip->set_affinity(irq, cpumask);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	} else
+		set_pending_irq(irq, cpumask);
 #else
 	desc->affinity = cpumask;
 	desc->chip->set_affinity(irq, cpumask);
-- 
cgit v1.2.3


From 3e84050c81ffb4961ef43d20e1fb1d7607167d83 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Sun, 13 Jul 2008 02:10:29 +0200
Subject: cpusets, hotplug, scheduler: fix scheduler domain breakage

Commit f18f982ab ("sched: CPU hotplug events must not destroy scheduler
domains created by the cpusets") introduced a hotplug-related problem as
described below:

Upon CPU_DOWN_PREPARE,

  update_sched_domains() -> detach_destroy_domains(&cpu_online_map)

does the following:

/*
 * Force a reinitialization of the sched domains hierarchy. The domains
 * and groups cannot be updated in place without racing with the balancing
 * code, so we temporarily attach all running cpus to the NULL domain
 * which will prevent rebalancing while the sched domains are recalculated.
 */

The sched-domains should be rebuilt when a CPU_DOWN ops. has been
completed, effectively either upon CPU_DEAD{_FROZEN} (upon success) or
CPU_DOWN_FAILED{_FROZEN} (upon failure -- restore the things to their
initial state). That's what update_sched_domains() also does but only
for !CPUSETS case.

With f18f982ab, sched-domains' reinitialization is delegated to
CPUSETS code:

cpuset_handle_cpuhp() -> common_cpu_mem_hotplug_unplug() ->
rebuild_sched_domains()

Being called for CPU_UP_PREPARE and if its callback is called after
update_sched_domains()), it just negates all the work done by
update_sched_domains() -- i.e. a soon-to-be-offline cpu is included in
the sched-domains and that makes it visible for the load-balancer
while the CPU_DOWN ops. is in progress.

__migrate_live_tasks() moves the tasks off a 'dead' cpu (it's already
"offline" when this function is called).

try_to_wake_up() is called for one of these tasks from another CPU ->
the load-balancer (wake_idle()) picks up a "dead" CPU and places the
task on it. Then e.g. BUG_ON(rq->nr_running) detects this a bit later
-> oops.

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Tested-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Paul Menage <menage@google.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: miaox@cn.fujitsu.com
Cc: rostedt@goodmis.org
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpuset.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fceb97e989c..798b3ab054eb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1882,7 +1882,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
  * in order to minimize text size.
  */
 
-static void common_cpu_mem_hotplug_unplug(void)
+static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
 {
 	cgroup_lock();
 
@@ -1894,7 +1894,8 @@ static void common_cpu_mem_hotplug_unplug(void)
 	 * Scheduler destroys domains on hotplug events.
 	 * Rebuild them based on the current settings.
 	 */
-	rebuild_sched_domains();
+	if (rebuild_sd)
+		rebuild_sched_domains();
 
 	cgroup_unlock();
 }
@@ -1912,11 +1913,22 @@ static void common_cpu_mem_hotplug_unplug(void)
 static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
-	if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
+	switch (phase) {
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		common_cpu_mem_hotplug_unplug(1);
+		break;
+	default:
 		return NOTIFY_DONE;
+	}
 
-	common_cpu_mem_hotplug_unplug();
-	return 0;
+	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -1929,7 +1941,7 @@ static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
 
 void cpuset_track_online_nodes(void)
 {
-	common_cpu_mem_hotplug_unplug();
+	common_cpu_mem_hotplug_unplug(0);
 }
 #endif
 
-- 
cgit v1.2.3


From 199a952876adbfc2b6c13b8b07adabebf4ff54b2 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 26 Jun 2008 10:06:43 +0800
Subject: rcu classic: update qlen when cpu offline

When callbacks are moved from offline cpu to this cpu,
the qlen field of this rdp should be updated.

[ Paul E. McKenney: ]

The effect of this bug would be for force_quiescent_state() to be invoked
when it should not and vice versa -- wasting cycles in the first case
and letting RCU callbacks remain piled up in the second case.  The bug
is thus "benign" in that it does not result in premature grace-period
termination, but should of course be fixed nonetheless.

Preemption is disabled by the caller's get_cpu_var(), so we are guaranteed
to remain on the same CPU, as required.  The local_irq_disable() is indeed
needed, otherwise, an interrupt might invoke call_rcu() or call_rcu_bh(),
which could cause that interrupt's increment of ->qlen to be lost.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 214e1cde9812..529190c485fd 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -387,6 +387,10 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp,
 	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
 	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
 	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+
+	local_irq_disable();
+	this_rdp->qlen += rdp->qlen;
+	local_irq_enable();
 }
 
 static void rcu_offline_cpu(int cpu)
-- 
cgit v1.2.3


From 006ebb40d3d65338bd74abb03b945f8d60e362bd Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Mon, 19 May 2008 08:32:49 -0400
Subject: Security: split proc ptrace checking into read vs. attach

Enable security modules to distinguish reading of process state via
proc from full ptrace access by renaming ptrace_may_attach to
ptrace_may_access and adding a mode argument indicating whether only
read access or full attach access is requested.  This allows security
modules to permit access to reading process state without granting
full ptrace access.  The base DAC/capability checking remains unchanged.

Read access to /proc/pid/mem continues to apply a full ptrace attach
check since check_mem_permission() already requires the current task
to already be ptracing the target.  The other ptrace checks within
proc for elements like environ, maps, and fds are changed to pass the
read mode instead of attach.

In the SELinux case, we model such reading of process state as a
reading of a proc file labeled with the target process' label.  This
enables SELinux policy to permit such reading of process state without
permitting control or manipulation of the target process, as there are
a number of cases where programs probe for such information via proc
but do not need to be able to control the target (e.g. procps,
lsof, PolicyKit, ConsoleKit).  At present we have to choose between
allowing full ptrace in policy (more permissive than required/desired)
or breaking functionality (or in some cases just silencing the denials
via dontaudit rules but this can hide genuine attacks).

This version of the patch incorporates comments from Casey Schaufler
(change/replace existing ptrace_may_attach interface, pass access
mode), and Chris Wright (provide greater consistency in the checking).

Note that like their predecessors __ptrace_may_attach and
ptrace_may_attach, the __ptrace_may_access and ptrace_may_access
interfaces use different return value conventions from each other (0
or -errno vs. 1 or 0).  I retained this difference to avoid any
changes to the caller logic but made the difference clearer by
changing the latter interface to return a bool rather than an int and
by adding a comment about it to ptrace.h for any future callers.

Signed-off-by:  Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/ptrace.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6c19e94fd0a5..e337390fce01 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -121,7 +121,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	return ret;
 }
 
-int __ptrace_may_attach(struct task_struct *task)
+int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	/* May we inspect the given task?
 	 * This check is used both for attaching with ptrace
@@ -148,16 +148,16 @@ int __ptrace_may_attach(struct task_struct *task)
 	if (!dumpable && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
 
-	return security_ptrace(current, task);
+	return security_ptrace(current, task, mode);
 }
 
-int ptrace_may_attach(struct task_struct *task)
+bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	int err;
 	task_lock(task);
-	err = __ptrace_may_attach(task);
+	err = __ptrace_may_access(task, mode);
 	task_unlock(task);
-	return !err;
+	return (!err ? true : false);
 }
 
 int ptrace_attach(struct task_struct *task)
@@ -195,7 +195,7 @@ repeat:
 	/* the same process cannot be attached many times */
 	if (task->ptrace & PT_PTRACED)
 		goto bad;
-	retval = __ptrace_may_attach(task);
+	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
 	if (retval)
 		goto bad;
 
@@ -494,7 +494,8 @@ int ptrace_traceme(void)
 	 */
 	task_lock(current);
 	if (!(current->ptrace & PT_PTRACED)) {
-		ret = security_ptrace(current->parent, current);
+		ret = security_ptrace(current->parent, current,
+				      PTRACE_MODE_ATTACH);
 		/*
 		 * Set the ptrace bit in the process ptrace flags.
 		 */
-- 
cgit v1.2.3


From 992860e991f2015fb8c8df65aa32afa0dcbb4430 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 14 Jul 2008 10:28:38 +0200
Subject: lockdep: fix ftrace irq tracing false positive

fix this false positive:

[    0.020000] ------------[ cut here ]------------
[    0.020000] WARNING: at kernel/lockdep.c:2718 check_flags+0x14a/0x170()
[    0.020000] Modules linked in:
[    0.020000] Pid: 0, comm: swapper Not tainted 2.6.26-tip-00343-gd7e5521-dirty #14486
[    0.020000]  [<c01312e4>] warn_on_slowpath+0x54/0x80
[    0.020000]  [<c067e451>] ? _spin_unlock_irqrestore+0x61/0x70
[    0.020000]  [<c0131bb1>] ? release_console_sem+0x201/0x210
[    0.020000]  [<c0143d65>] ? __kernel_text_address+0x35/0x40
[    0.020000]  [<c010562e>] ? dump_trace+0x5e/0x140
[    0.020000]  [<c01518b5>] ? __lock_acquire+0x245/0x820
[    0.020000]  [<c015063a>] check_flags+0x14a/0x170
[    0.020000]  [<c0151ed8>] ? lock_acquire+0x48/0xc0
[    0.020000]  [<c0151ee1>] lock_acquire+0x51/0xc0
[    0.020000]  [<c014a16c>] ? down+0x2c/0x40
[    0.020000]  [<c010a609>] ? sched_clock+0x9/0x10
[    0.020000]  [<c067e7b2>] _write_lock+0x32/0x60
[    0.020000]  [<c013797f>] ? request_resource+0x1f/0xb0
[    0.020000]  [<c013797f>] request_resource+0x1f/0xb0
[    0.020000]  [<c02f89ad>] vgacon_startup+0x2bd/0x3e0
[    0.020000]  [<c094d62a>] con_init+0x19/0x22f
[    0.020000]  [<c0330c7c>] ? tty_register_ldisc+0x5c/0x70
[    0.020000]  [<c094cf49>] console_init+0x20/0x2e
[    0.020000]  [<c092a969>] start_kernel+0x20c/0x379
[    0.020000]  [<c092a516>] ? unknown_bootoption+0x0/0x1f6
[    0.020000]  [<c092a099>] __init_begin+0x99/0xa1
[    0.020000]  =======================
[    0.020000] ---[ end trace 4eaa2a86a8e2da22 ]---
[    0.020000] possible reason: unannotated irqs-on.
[    0.020000] irq event stamp: 0

which occurs if CONFIG_TRACE_IRQFLAGS=y, CONFIG_DEBUG_LOCKDEP=y,
but CONFIG_PROVE_LOCKING is disabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7553a28b99cd..fc5d5aabd77a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2688,7 +2688,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
  */
 static void check_flags(unsigned long flags)
 {
-#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
+    defined(CONFIG_TRACE_IRQFLAGS)
 	if (!debug_locks)
 		return;
 
-- 
cgit v1.2.3


From d12c1a37925a8ec386994169605fe99217295199 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 14 Jul 2008 12:09:28 +0200
Subject: lockdep: fix kernel/fork.c warning

fix:

[    0.184011] ------------[ cut here ]------------
[    0.188011] WARNING: at kernel/fork.c:918 copy_process+0x1c0/0x1084()
[    0.192011] Pid: 0, comm: swapper Not tainted 2.6.26-tip-00351-g01d4a50-dirty #14521
[    0.196011]  [<c0135d48>] warn_on_slowpath+0x3c/0x60
[    0.200012]  [<c016f805>] ? __alloc_pages_internal+0x92/0x36b
[    0.208012]  [<c033de5e>] ? __spin_lock_init+0x24/0x4a
[    0.212012]  [<c01347e3>] copy_process+0x1c0/0x1084
[    0.216013]  [<c013575f>] do_fork+0xb8/0x1ad
[    0.220013]  [<c034f75e>] ? acpi_os_release_lock+0x8/0xa
[    0.228013]  [<c034ff7a>] ? acpi_os_vprintf+0x20/0x24
[    0.232014]  [<c01129ee>] kernel_thread+0x75/0x7d
[    0.236014]  [<c0a491eb>] ? kernel_init+0x0/0x24a
[    0.240014]  [<c0a491eb>] ? kernel_init+0x0/0x24a
[    0.244014]  [<c01151b0>] ? kernel_thread_helper+0x0/0x10
[    0.252015]  [<c06c6ac0>] rest_init+0x14/0x50
[    0.256015]  [<c0a498ce>] start_kernel+0x2b9/0x2c0
[    0.260015]  [<c0a4904f>] __init_begin+0x4f/0x57
[    0.264016]  =======================
[    0.268016] ---[ end trace 4eaa2a86a8e2da22 ]---
[    0.272016] enabled ExtINT on CPU#0

which occurs if CONFIG_TRACE_IRQFLAGS=y, CONFIG_DEBUG_LOCKDEP=y,
but CONFIG_PROVE_LOCKING is disabled.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..cdb1f82d3bd2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#ifdef CONFIG_PROVE_LOCKING
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
-- 
cgit v1.2.3


From 63cf13b77ab785e87c867defa8545e6d4a989774 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 15 Jul 2008 13:22:49 -0700
Subject: generic ipi function calls: wait on alloc failure fallback

When a GFP_ATOMIC allocation fails, it falls back to allocating the
data on the stack and converting it to a waiting call.

Make sure we actually wait in this case.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/smp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index ab10793b0707..462c785ca1ee 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -312,6 +312,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
 	if (!data) {
 		data = &d;
 		data->csd.flags = CSD_FLAG_WAIT;
+		wait = 1;
 	}
 
 	spin_lock_init(&data->lock);
-- 
cgit v1.2.3


From 0775b3dbcb6d17b531b36df520ddab735647f3f7 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 8 Jul 2008 15:07:08 -0700
Subject: suspend, xen: enable PM_SLEEP for CONFIG_XEN

Xen save/restore requires PM_SLEEP to be set without requiring
SUSPEND or HIBERNATION.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/power/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b45da40e8d25..1436c47cb39b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
 
 config PM_SLEEP
 	bool
-	depends on SUSPEND || HIBERNATION
+	depends on SUSPEND || HIBERNATION || XEN
 	default y
 
 config SUSPEND
-- 
cgit v1.2.3


From 6717ef1aa750b54ddb9d8854b91707ee21f0ad23 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 9 Jul 2008 22:17:01 +0200
Subject: Revert "suspend, xen: enable PM_SLEEP for CONFIG_XEN"

This reverts commit 6fbbec428c8e7bb617da2e8a589af2e97bcf3bc4.

Rafael doesnt like it - it breaks various assumptions.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/power/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 1436c47cb39b..b45da40e8d25 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
 
 config PM_SLEEP
 	bool
-	depends on SUSPEND || HIBERNATION || XEN
+	depends on SUSPEND || HIBERNATION
 	default y
 
 config SUSPEND
-- 
cgit v1.2.3


From b62b8ef906cdf7115af579ce7378886ce3e0ce00 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 29 Apr 2008 02:35:56 -0400
Subject: force offline the processor during hot-removal

The ACPI device node for the cpu has already been unregistered
when acpi_processor_handle_eject is called.
Thus we should offline the cpu and continue, rather than a failure here.
http://bugzilla.kernel.org/show_bug.cgi?id=9772

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 kernel/cpu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index b11f06dc149a..cfb1d43ab801 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -299,6 +299,7 @@ int __ref cpu_down(unsigned int cpu)
 	cpu_maps_update_done();
 	return err;
 }
+EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /* Requires cpu_add_remove_lock to be held */
-- 
cgit v1.2.3


From ebb12db51f6c13b30752fcf506baad4c617b153c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 11 Jun 2008 22:04:29 +0200
Subject: Freezer: Introduce PF_FREEZER_NOSIG

The freezer currently attempts to distinguish kernel threads from
user space tasks by checking if their mm pointer is unset and it
does not send fake signals to kernel threads.  However, there are
kernel threads, mostly related to networking, that behave like
user space tasks and may want to be sent a fake signal to be frozen.

Introduce the new process flag PF_FREEZER_NOSIG that will be set
by default for all kernel threads and make the freezer only send
fake signals to the tasks having PF_FREEZER_NOSIG unset.  Provide
the set_freezable_with_signal() function to be called by the kernel
threads that want to be sent a fake signal for freezing.

This patch should not change the freezer's observable behavior.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/kthread.c       |  2 +-
 kernel/power/process.c | 97 ++++++++++++++++++++++----------------------------
 2 files changed, 43 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 97747cdd37c9..ac3fb7326641 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -235,7 +235,7 @@ int kthreadd(void *unused)
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed(tsk, CPU_MASK_ALL);
 
-	current->flags |= PF_NOFREEZE;
+	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index f1d0b345c9ba..5fb87652f214 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -19,9 +19,6 @@
  */
 #define TIMEOUT	(20 * HZ)
 
-#define FREEZER_KERNEL_THREADS 0
-#define FREEZER_USER_SPACE 1
-
 static inline int freezeable(struct task_struct * p)
 {
 	if ((p == current) ||
@@ -84,63 +81,53 @@ static void fake_signal_wake_up(struct task_struct *p)
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 }
 
-static int has_mm(struct task_struct *p)
+static inline bool should_send_signal(struct task_struct *p)
 {
-	return (p->mm && !(p->flags & PF_BORROWED_MM));
+	return !(p->flags & PF_FREEZER_NOSIG);
 }
 
 /**
  *	freeze_task - send a freeze request to given task
  *	@p: task to send the request to
- *	@with_mm_only: if set, the request will only be sent if the task has its
- *		own mm
- *	Return value: 0, if @with_mm_only is set and the task has no mm of its
- *		own or the task is frozen, 1, otherwise
+ *	@sig_only: if set, the request will only be sent if the task has the
+ *		PF_FREEZER_NOSIG flag unset
+ *	Return value: 'false', if @sig_only is set and the task has
+ *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
  *
- *	The freeze request is sent by seting the tasks's TIF_FREEZE flag and
+ *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and
  *	either sending a fake signal to it or waking it up, depending on whether
- *	or not it has its own mm (ie. it is a user land task).  If @with_mm_only
- *	is set and the task has no mm of its own (ie. it is a kernel thread),
- *	its TIF_FREEZE flag should not be set.
- *
- *	The task_lock() is necessary to prevent races with exit_mm() or
- *	use_mm()/unuse_mm() from occuring.
+ *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
+ *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
+ *	TIF_FREEZE flag will not be set.
  */
-static int freeze_task(struct task_struct *p, int with_mm_only)
+static bool freeze_task(struct task_struct *p, bool sig_only)
 {
-	int ret = 1;
+	/*
+	 * We first check if the task is freezing and next if it has already
+	 * been frozen to avoid the race with frozen_process() which first marks
+	 * the task as frozen and next clears its TIF_FREEZE.
+	 */
+	if (!freezing(p)) {
+		rmb();
+		if (frozen(p))
+			return false;
 
-	task_lock(p);
-	if (freezing(p)) {
-		if (has_mm(p)) {
-			if (!signal_pending(p))
-				fake_signal_wake_up(p);
-		} else {
-			if (with_mm_only)
-				ret = 0;
-			else
-				wake_up_state(p, TASK_INTERRUPTIBLE);
-		}
+		if (!sig_only || should_send_signal(p))
+			set_freeze_flag(p);
+		else
+			return false;
+	}
+
+	if (should_send_signal(p)) {
+		if (!signal_pending(p))
+			fake_signal_wake_up(p);
+	} else if (sig_only) {
+		return false;
 	} else {
-		rmb();
-		if (frozen(p)) {
-			ret = 0;
-		} else {
-			if (has_mm(p)) {
-				set_freeze_flag(p);
-				fake_signal_wake_up(p);
-			} else {
-				if (with_mm_only) {
-					ret = 0;
-				} else {
-					set_freeze_flag(p);
-					wake_up_state(p, TASK_INTERRUPTIBLE);
-				}
-			}
-		}
+		wake_up_state(p, TASK_INTERRUPTIBLE);
 	}
-	task_unlock(p);
-	return ret;
+
+	return true;
 }
 
 static void cancel_freezing(struct task_struct *p)
@@ -156,7 +143,7 @@ static void cancel_freezing(struct task_struct *p)
 	}
 }
 
-static int try_to_freeze_tasks(int freeze_user_space)
+static int try_to_freeze_tasks(bool sig_only)
 {
 	struct task_struct *g, *p;
 	unsigned long end_time;
@@ -175,7 +162,7 @@ static int try_to_freeze_tasks(int freeze_user_space)
 			if (frozen(p) || !freezeable(p))
 				continue;
 
-			if (!freeze_task(p, freeze_user_space))
+			if (!freeze_task(p, sig_only))
 				continue;
 
 			/*
@@ -235,13 +222,13 @@ int freeze_processes(void)
 	int error;
 
 	printk("Freezing user space processes ... ");
-	error = try_to_freeze_tasks(FREEZER_USER_SPACE);
+	error = try_to_freeze_tasks(true);
 	if (error)
 		goto Exit;
 	printk("done.\n");
 
 	printk("Freezing remaining freezable tasks ... ");
-	error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
+	error = try_to_freeze_tasks(false);
 	if (error)
 		goto Exit;
 	printk("done.");
@@ -251,7 +238,7 @@ int freeze_processes(void)
 	return error;
 }
 
-static void thaw_tasks(int thaw_user_space)
+static void thaw_tasks(bool nosig_only)
 {
 	struct task_struct *g, *p;
 
@@ -260,7 +247,7 @@ static void thaw_tasks(int thaw_user_space)
 		if (!freezeable(p))
 			continue;
 
-		if (!p->mm == thaw_user_space)
+		if (nosig_only && should_send_signal(p))
 			continue;
 
 		thaw_process(p);
@@ -271,8 +258,8 @@ static void thaw_tasks(int thaw_user_space)
 void thaw_processes(void)
 {
 	printk("Restarting tasks ... ");
-	thaw_tasks(FREEZER_KERNEL_THREADS);
-	thaw_tasks(FREEZER_USER_SPACE);
+	thaw_tasks(true);
+	thaw_tasks(false);
 	schedule();
 	printk("done.\n");
 }
-- 
cgit v1.2.3


From 52d11025dba32bed696eaee1822b26529e764770 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Wed, 11 Jun 2008 22:07:52 +0200
Subject: snapshot: Push BKL down into ioctl handlers

Push BKL down into ioctl handlers - snapshot device.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 kernel/power/user.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index f5512cb3aa86..658262b15994 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,6 +23,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <linux/smp_lock.h>
 
 #include <asm/uaccess.h>
 
@@ -164,8 +165,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 	return res;
 }
 
-static int snapshot_ioctl(struct inode *inode, struct file *filp,
-                          unsigned int cmd, unsigned long arg)
+static long snapshot_ioctl(struct file *filp, unsigned int cmd,
+							unsigned long arg)
 {
 	int error = 0;
 	struct snapshot_data *data;
@@ -181,6 +182,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 
 	data = filp->private_data;
 
+	lock_kernel();
+
 	switch (cmd) {
 
 	case SNAPSHOT_FREEZE:
@@ -389,7 +392,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		error = -ENOTTY;
 
 	}
-
+	unlock_kernel();
 	return error;
 }
 
@@ -399,7 +402,7 @@ static const struct file_operations snapshot_fops = {
 	.read = snapshot_read,
 	.write = snapshot_write,
 	.llseek = no_llseek,
-	.ioctl = snapshot_ioctl,
+	.unlocked_ioctl = snapshot_ioctl,
 };
 
 static struct miscdevice snapshot_device = {
-- 
cgit v1.2.3


From 25f2f3daadaf0768a61d02ee3ed3d9a21e9dc46c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 11 Jun 2008 22:09:45 +0200
Subject: snapshot: Use pm_mutex for mutual exclusion

We can avoid taking the BKL in snapshot_ioctl() if pm_mutex is used to prevent
the ioctls from being executed concurrently.

In addition, although it is only possible to open /dev/snapshot once, the task
which has done that may spawn a child that will inherit the open descriptor,
so in theory they can call snapshot_write(), snapshot_read() and
snapshot_release() concurrently.  pm_mutex can also be used for mutual
exclusion in such cases.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/user.c | 68 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index 658262b15994..a6332a313262 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -70,16 +70,22 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	struct snapshot_data *data;
 	int error;
 
-	if (!atomic_add_unless(&snapshot_device_available, -1, 0))
-		return -EBUSY;
+	mutex_lock(&pm_mutex);
+
+	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+		error = -EBUSY;
+		goto Unlock;
+	}
 
 	if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
 		atomic_inc(&snapshot_device_available);
-		return -ENOSYS;
+		error = -ENOSYS;
+		goto Unlock;
 	}
 	if(create_basic_memory_bitmaps()) {
 		atomic_inc(&snapshot_device_available);
-		return -ENOMEM;
+		error = -ENOMEM;
+		goto Unlock;
 	}
 	nonseekable_open(inode, filp);
 	data = &snapshot_state;
@@ -99,33 +105,36 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 		if (error)
 			pm_notifier_call_chain(PM_POST_HIBERNATION);
 	}
-	if (error) {
+	if (error)
 		atomic_inc(&snapshot_device_available);
-		return error;
-	}
 	data->frozen = 0;
 	data->ready = 0;
 	data->platform_support = 0;
 
-	return 0;
+ Unlock:
+	mutex_unlock(&pm_mutex);
+
+	return error;
 }
 
 static int snapshot_release(struct inode *inode, struct file *filp)
 {
 	struct snapshot_data *data;
 
+	mutex_lock(&pm_mutex);
+
 	swsusp_free();
 	free_basic_memory_bitmaps();
 	data = filp->private_data;
 	free_all_swap_pages(data->swap);
-	if (data->frozen) {
-		mutex_lock(&pm_mutex);
+	if (data->frozen)
 		thaw_processes();
-		mutex_unlock(&pm_mutex);
-	}
 	pm_notifier_call_chain(data->mode == O_WRONLY ?
 			PM_POST_HIBERNATION : PM_POST_RESTORE);
 	atomic_inc(&snapshot_device_available);
+
+	mutex_unlock(&pm_mutex);
+
 	return 0;
 }
 
@@ -135,9 +144,13 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
 	struct snapshot_data *data;
 	ssize_t res;
 
+	mutex_lock(&pm_mutex);
+
 	data = filp->private_data;
-	if (!data->ready)
-		return -ENODATA;
+	if (!data->ready) {
+		res = -ENODATA;
+		goto Unlock;
+	}
 	res = snapshot_read_next(&data->handle, count);
 	if (res > 0) {
 		if (copy_to_user(buf, data_of(data->handle), res))
@@ -145,6 +158,10 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
 		else
 			*offp = data->handle.offset;
 	}
+
+ Unlock:
+	mutex_unlock(&pm_mutex);
+
 	return res;
 }
 
@@ -154,6 +171,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 	struct snapshot_data *data;
 	ssize_t res;
 
+	mutex_lock(&pm_mutex);
+
 	data = filp->private_data;
 	res = snapshot_write_next(&data->handle, count);
 	if (res > 0) {
@@ -162,6 +181,9 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 		else
 			*offp = data->handle.offset;
 	}
+
+	mutex_unlock(&pm_mutex);
+
 	return res;
 }
 
@@ -180,16 +202,16 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	data = filp->private_data;
+	if (!mutex_trylock(&pm_mutex))
+		return -EBUSY;
 
-	lock_kernel();
+	data = filp->private_data;
 
 	switch (cmd) {
 
 	case SNAPSHOT_FREEZE:
 		if (data->frozen)
 			break;
-		mutex_lock(&pm_mutex);
 		printk("Syncing filesystems ... ");
 		sys_sync();
 		printk("done.\n");
@@ -197,7 +219,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		error = freeze_processes();
 		if (error)
 			thaw_processes();
-		mutex_unlock(&pm_mutex);
 		if (!error)
 			data->frozen = 1;
 		break;
@@ -205,9 +226,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	case SNAPSHOT_UNFREEZE:
 		if (!data->frozen || data->ready)
 			break;
-		mutex_lock(&pm_mutex);
 		thaw_processes();
-		mutex_unlock(&pm_mutex);
 		data->frozen = 0;
 		break;
 
@@ -310,16 +329,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 			error = -EPERM;
 			break;
 		}
-		if (!mutex_trylock(&pm_mutex)) {
-			error = -EBUSY;
-			break;
-		}
 		/*
 		 * Tasks are frozen and the notifiers have been called with
 		 * PM_HIBERNATION_PREPARE
 		 */
 		error = suspend_devices_and_enter(PM_SUSPEND_MEM);
-		mutex_unlock(&pm_mutex);
 		break;
 
 	case SNAPSHOT_PLATFORM_SUPPORT:
@@ -392,7 +406,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		error = -ENOTTY;
 
 	}
-	unlock_kernel();
+
+	mutex_unlock(&pm_mutex);
+
 	return error;
 }
 
-- 
cgit v1.2.3


From 98abed02007b19bbfd68b6d06a5485afc3eeb01b Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 19 Mar 2008 19:24:59 -0700
Subject: do_wait reorganization

This breaks out the guts of do_wait into three subfunctions.
The control flow is less nonobvious without so much goto.
do_wait_thread and ptrace_do_wait contain the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c | 215 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 135 insertions(+), 80 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ceb258782835..7453356a961f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1238,7 +1238,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_zombie(struct task_struct *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int options,
 			    struct siginfo __user *infop,
 			    int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1246,7 +1246,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
 
-	if (unlikely(noreap)) {
+	if (!likely(options & WEXITED))
+		return 0;
+
+	if (unlikely(options & WNOWAIT)) {
 		uid_t uid = p->uid;
 		int exit_code = p->exit_code;
 		int why, status;
@@ -1397,13 +1400,16 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
  * released the lock and the system call should return.
  */
 static int wait_task_stopped(struct task_struct *p,
-			     int noreap, struct siginfo __user *infop,
+			     int options, struct siginfo __user *infop,
 			     int __user *stat_addr, struct rusage __user *ru)
 {
 	int retval, exit_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
+	if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED))
+		return 0;
+
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
@@ -1421,7 +1427,7 @@ static int wait_task_stopped(struct task_struct *p,
 	if (!exit_code)
 		goto unlock_sig;
 
-	if (!noreap)
+	if (!unlikely(options & WNOWAIT))
 		p->exit_code = 0;
 
 	uid = p->uid;
@@ -1442,7 +1448,7 @@ unlock_sig:
 	why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 
-	if (unlikely(noreap))
+	if (unlikely(options & WNOWAIT))
 		return wait_noreap_copyout(p, pid, uid,
 					   why, exit_code,
 					   infop, ru);
@@ -1476,7 +1482,7 @@ unlock_sig:
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_continued(struct task_struct *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int options,
 			       struct siginfo __user *infop,
 			       int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1484,6 +1490,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	pid_t pid;
 	uid_t uid;
 
+	if (!unlikely(options & WCONTINUED))
+		return 0;
+
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
 		return 0;
 
@@ -1493,7 +1502,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 		spin_unlock_irq(&p->sighand->siglock);
 		return 0;
 	}
-	if (!noreap)
+	if (!unlikely(options & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 	spin_unlock_irq(&p->sighand->siglock);
 
@@ -1519,89 +1528,137 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	return retval;
 }
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent,
+			      struct task_struct *p, int *notask_error,
+			      enum pid_type type, struct pid *pid, int options,
+			      struct siginfo __user *infop,
+			      int __user *stat_addr, struct rusage __user *ru)
+{
+	int ret = eligible_child(type, pid, options, p);
+	if (ret <= 0)
+		return ret;
+
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	/*
+	 * We don't reap group leaders with subthreads.
+	 */
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+		return wait_task_zombie(p, options, infop, stat_addr, ru);
+
+	/*
+	 * It's stopped or running now, so it might
+	 * later continue, exit, or stop again.
+	 */
+	*notask_error = 0;
+
+	if (task_is_stopped_or_traced(p))
+		return wait_task_stopped(p, options, infop, stat_addr, ru);
+
+	return wait_task_continued(p, options, infop, stat_addr, ru);
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then
+ * *@notask_error is 0 if there were any eligible children, or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *notask_error,
+			  enum pid_type type, struct pid *pid, int options,
+			  struct siginfo __user *infop, int __user *stat_addr,
+			  struct rusage __user *ru)
+{
+	struct task_struct *p;
+
+	list_for_each_entry(p, &tsk->children, sibling) {
+		int ret = wait_consider_task(tsk, p, notask_error,
+					     type, pid, options,
+					     infop, stat_addr, ru);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
+			  enum pid_type type, struct pid *pid, int options,
+			  struct siginfo __user *infop, int __user *stat_addr,
+			  struct rusage __user *ru)
+{
+	struct task_struct *p;
+
+	/*
+	 * If we never saw an eligile child, check for children stolen by
+	 * ptrace.  We don't leave -ECHILD in *@notask_error if there are any,
+	 * because we will eventually be allowed to wait for them again.
+	 */
+	if (!*notask_error)
+		return 0;
+
+	list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
+		int ret = eligible_child(type, pid, options, p);
+		if (unlikely(ret < 0))
+			return ret;
+		if (ret) {
+			*notask_error = 0;
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
 static long do_wait(enum pid_type type, struct pid *pid, int options,
 		    struct siginfo __user *infop, int __user *stat_addr,
 		    struct rusage __user *ru)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
-	int flag, retval;
+	int retval;
 
 	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
-	/* If there is nothing that can match our critier just get out */
+	/*
+	 * If there is nothing that can match our critiera just get out.
+	 * We will clear @retval to zero if we see any child that might later
+	 * match our criteria, even if we are not able to reap it yet.
+	 */
 	retval = -ECHILD;
 	if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
 		goto end;
 
-	/*
-	 * We will set this flag if we see any child that might later
-	 * match our criteria, even if we are not able to reap it yet.
-	 */
-	flag = retval = 0;
 	current->state = TASK_INTERRUPTIBLE;
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
-		struct task_struct *p;
-
-		list_for_each_entry(p, &tsk->children, sibling) {
-			int ret = eligible_child(type, pid, options, p);
-			if (!ret)
-				continue;
-
-			if (unlikely(ret < 0)) {
-				retval = ret;
-			} else if (task_is_stopped_or_traced(p)) {
-				/*
-				 * It's stopped now, so it might later
-				 * continue, exit, or stop again.
-				 */
-				flag = 1;
-				if (!(p->ptrace & PT_PTRACED) &&
-				    !(options & WUNTRACED))
-					continue;
-
-				retval = wait_task_stopped(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state == EXIT_ZOMBIE &&
-					!delay_group_leader(p)) {
-				/*
-				 * We don't reap group leaders with subthreads.
-				 */
-				if (!likely(options & WEXITED))
-					continue;
-				retval = wait_task_zombie(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state != EXIT_DEAD) {
-				/*
-				 * It's running now, so it might later
-				 * exit, stop, or stop and then continue.
-				 */
-				flag = 1;
-				if (!unlikely(options & WCONTINUED))
-					continue;
-				retval = wait_task_continued(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			}
-			if (retval != 0) /* tasklist_lock released */
-				goto end;
-		}
-		if (!flag) {
-			list_for_each_entry(p, &tsk->ptrace_children,
-								ptrace_list) {
-				flag = eligible_child(type, pid, options, p);
-				if (!flag)
-					continue;
-				if (likely(flag > 0))
-					break;
-				retval = flag;
-				goto end;
-			}
+		int tsk_result = do_wait_thread(tsk, &retval,
+						type, pid, options,
+						infop, stat_addr, ru);
+		if (!tsk_result)
+			tsk_result = ptrace_do_wait(tsk, &retval,
+						    type, pid, options,
+						    infop, stat_addr, ru);
+		if (tsk_result) {
+			/*
+			 * tasklist_lock is unlocked and we have a final result.
+			 */
+			retval = tsk_result;
+			goto end;
 		}
+
 		if (options & __WNOTHREAD)
 			break;
 		tsk = next_thread(tsk);
@@ -1609,16 +1666,14 @@ repeat:
 	} while (tsk != current);
 	read_unlock(&tasklist_lock);
 
-	if (flag) {
-		if (options & WNOHANG)
-			goto end;
+	if (!retval && !(options & WNOHANG)) {
 		retval = -ERESTARTSYS;
-		if (signal_pending(current))
-			goto end;
-		schedule();
-		goto repeat;
+		if (!signal_pending(current)) {
+			schedule();
+			goto repeat;
+		}
 	}
-	retval = -ECHILD;
+
 end:
 	current->state = TASK_RUNNING;
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);
-- 
cgit v1.2.3


From f470021adb9190819c03d6d8c5c860a17480aa6d Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Mon, 24 Mar 2008 18:36:23 -0700
Subject: ptrace children revamp

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone.  Now ptrace, whether of one's own
children or another's via PTRACE_ATTACH, just uses the new ptraced
list instead.

There should be no user-visible difference that matters.  The only
change is the order in which do_wait() sees multiple stopped
children and stopped ptrace attachees.  Since wait_task_stopped()
was changed earlier so it no longer reorders the children list, we
already know this won't cause any new problems.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c   | 226 +++++++++++++++++++++++++++++---------------------------
 kernel/fork.c   |   6 +-
 kernel/ptrace.c |  37 ++++++----
 3 files changed, 146 insertions(+), 123 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 7453356a961f..1e909826a804 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -71,7 +71,7 @@ static void __unhash_process(struct task_struct *p)
 		__get_cpu_var(process_counts)--;
 	}
 	list_del_rcu(&p->thread_group);
-	remove_parent(p);
+	list_del_init(&p->sibling);
 }
 
 /*
@@ -152,6 +152,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
 
+/*
+ * Do final ptrace-related cleanup of a zombie being reaped.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_release_task(struct task_struct *p)
+{
+	BUG_ON(!list_empty(&p->ptraced));
+	ptrace_unlink(p);
+	BUG_ON(!list_empty(&p->ptrace_entry));
+}
+
 void release_task(struct task_struct * p)
 {
 	struct task_struct *leader;
@@ -160,8 +172,7 @@ repeat:
 	atomic_dec(&p->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
-	ptrace_unlink(p);
-	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+	ptrace_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -315,9 +326,8 @@ static void reparent_to_kthreadd(void)
 
 	ptrace_unlink(current);
 	/* Reparent to init */
-	remove_parent(current);
 	current->real_parent = current->parent = kthreadd_task;
-	add_parent(current);
+	list_move_tail(&current->sibling, &current->real_parent->children);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
@@ -692,37 +702,71 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-static void
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+/*
+ * Detach all tasks we were using ptrace on.
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 {
-	if (p->pdeath_signal)
-		/* We already hold the tasklist_lock here.  */
-		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+	struct task_struct *p, *n;
 
-	/* Move the child from its dying parent to the new one.  */
-	if (unlikely(traced)) {
-		/* Preserve ptrace links if someone else is tracing this child.  */
-		list_del_init(&p->ptrace_list);
-		if (ptrace_reparented(p))
-			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
-	} else {
-		/* If this child is being traced, then we're the one tracing it
-		 * anyway, so let go of it.
+	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
+		__ptrace_unlink(p);
+
+		if (p->exit_state != EXIT_ZOMBIE)
+			continue;
+
+		/*
+		 * If it's a zombie, our attachedness prevented normal
+		 * parent notification or self-reaping.  Do notification
+		 * now if it would have happened earlier.  If it should
+		 * reap itself, add it to the @dead list.  We can't call
+		 * release_task() here because we already hold tasklist_lock.
+		 *
+		 * If it's our own child, there is no notification to do.
 		 */
-		p->ptrace = 0;
-		remove_parent(p);
-		p->parent = p->real_parent;
-		add_parent(p);
+		if (!task_detached(p) && thread_group_empty(p)) {
+			if (!same_thread_group(p->real_parent, parent))
+				do_notify_parent(p, p->exit_signal);
+		}
 
-		if (task_is_traced(p)) {
+		if (task_detached(p)) {
 			/*
-			 * If it was at a trace stop, turn it into
-			 * a normal stop since it's no longer being
-			 * traced.
+			 * Mark it as in the process of being reaped.
 			 */
-			ptrace_untrace(p);
+			p->exit_state = EXIT_DEAD;
+			list_add(&p->ptrace_entry, dead);
 		}
 	}
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+			       struct list_head *dead)
+{
+	struct task_struct *p, *n;
+
+	BUG_ON(!list_empty(&parent->ptraced));
+
+	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+		list_del_init(&p->ptrace_entry);
+		release_task(p);
+	}
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+	if (p->pdeath_signal)
+		/* We already hold the tasklist_lock here.  */
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+	list_move_tail(&p->sibling, &p->real_parent->children);
 
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
@@ -737,7 +781,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
-	if (!traced && p->exit_state == EXIT_ZOMBIE &&
+	if (!ptrace_reparented(p) &&
+	    p->exit_state == EXIT_ZOMBIE &&
 	    !task_detached(p) && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
@@ -754,12 +799,15 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 static void forget_original_parent(struct task_struct *father)
 {
 	struct task_struct *p, *n, *reaper = father;
-	struct list_head ptrace_dead;
-
-	INIT_LIST_HEAD(&ptrace_dead);
+	LIST_HEAD(ptrace_dead);
 
 	write_lock_irq(&tasklist_lock);
 
+	/*
+	 * First clean up ptrace if we were using it.
+	 */
+	ptrace_exit(father, &ptrace_dead);
+
 	do {
 		reaper = next_thread(reaper);
 		if (reaper == father) {
@@ -768,58 +816,19 @@ static void forget_original_parent(struct task_struct *father)
 		}
 	} while (reaper->flags & PF_EXITING);
 
-	/*
-	 * There are only two places where our children can be:
-	 *
-	 * - in our child list
-	 * - in our ptraced child list
-	 *
-	 * Search them and reparent children.
-	 */
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
-		int ptrace;
-
-		ptrace = p->ptrace;
-
-		/* if father isn't the real parent, then ptrace must be enabled */
-		BUG_ON(father != p->real_parent && !ptrace);
-
-		if (father == p->real_parent) {
-			/* reparent with a reaper, real father it's us */
-			p->real_parent = reaper;
-			reparent_thread(p, father, 0);
-		} else {
-			/* reparent ptraced task to its real parent */
-			__ptrace_unlink (p);
-			if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
-			    thread_group_empty(p))
-				do_notify_parent(p, p->exit_signal);
-		}
-
-		/*
-		 * if the ptraced child is a detached zombie we must collect
-		 * it before we exit, or it will remain zombie forever since
-		 * we prevented it from self-reap itself while it was being
-		 * traced by us, to be able to see it in wait4.
-		 */
-		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
-			list_add(&p->ptrace_list, &ptrace_dead);
-	}
-
-	list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
 		p->real_parent = reaper;
-		reparent_thread(p, father, 1);
+		if (p->parent == father) {
+			BUG_ON(p->ptrace);
+			p->parent = p->real_parent;
+		}
+		reparent_thread(p, father);
 	}
 
 	write_unlock_irq(&tasklist_lock);
 	BUG_ON(!list_empty(&father->children));
-	BUG_ON(!list_empty(&father->ptrace_children));
-
-	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
-		list_del_init(&p->ptrace_list);
-		release_task(p);
-	}
 
+	ptrace_exit_finish(father, &ptrace_dead);
 }
 
 /*
@@ -1180,13 +1189,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 			return 0;
 	}
 
-	/*
-	 * Do not consider detached threads that are
-	 * not ptraced:
-	 */
-	if (task_detached(p) && !p->ptrace)
-		return 0;
-
 	/* Wait for all children (clone and not) if __WALL is set;
 	 * otherwise, wait for clone children *only* if __WCLONE is
 	 * set; otherwise, wait for non-clone children *only*.  (Note:
@@ -1399,7 +1401,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_stopped(struct task_struct *p,
+static int wait_task_stopped(int ptrace, struct task_struct *p,
 			     int options, struct siginfo __user *infop,
 			     int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1407,7 +1409,7 @@ static int wait_task_stopped(struct task_struct *p,
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
-	if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED))
+	if (!(options & WUNTRACED))
 		return 0;
 
 	exit_code = 0;
@@ -1416,7 +1418,7 @@ static int wait_task_stopped(struct task_struct *p,
 	if (unlikely(!task_is_stopped_or_traced(p)))
 		goto unlock_sig;
 
-	if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0)
+	if (!ptrace && p->signal->group_stop_count > 0)
 		/*
 		 * A group stop is in progress and this is the group leader.
 		 * We won't report until all threads have stopped.
@@ -1445,7 +1447,7 @@ unlock_sig:
 	 */
 	get_task_struct(p);
 	pid = task_pid_vnr(p);
-	why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
+	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 
 	if (unlikely(options & WNOWAIT))
@@ -1536,7 +1538,7 @@ static int wait_task_continued(struct task_struct *p, int options,
  * Returns zero if the search for a child should continue;
  * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
  */
-static int wait_consider_task(struct task_struct *parent,
+static int wait_consider_task(struct task_struct *parent, int ptrace,
 			      struct task_struct *p, int *notask_error,
 			      enum pid_type type, struct pid *pid, int options,
 			      struct siginfo __user *infop,
@@ -1546,6 +1548,15 @@ static int wait_consider_task(struct task_struct *parent,
 	if (ret <= 0)
 		return ret;
 
+	if (likely(!ptrace) && unlikely(p->ptrace)) {
+		/*
+		 * This child is hidden by ptrace.
+		 * We aren't allowed to see it now, but eventually we will.
+		 */
+		*notask_error = 0;
+		return 0;
+	}
+
 	if (p->exit_state == EXIT_DEAD)
 		return 0;
 
@@ -1562,7 +1573,8 @@ static int wait_consider_task(struct task_struct *parent,
 	*notask_error = 0;
 
 	if (task_is_stopped_or_traced(p))
-		return wait_task_stopped(p, options, infop, stat_addr, ru);
+		return wait_task_stopped(ptrace, p, options,
+					 infop, stat_addr, ru);
 
 	return wait_task_continued(p, options, infop, stat_addr, ru);
 }
@@ -1583,11 +1595,16 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->children, sibling) {
-		int ret = wait_consider_task(tsk, p, notask_error,
-					     type, pid, options,
-					     infop, stat_addr, ru);
-		if (ret)
-			return ret;
+		/*
+		 * Do not consider detached threads.
+		 */
+		if (!task_detached(p)) {
+			int ret = wait_consider_task(tsk, 0, p, notask_error,
+						     type, pid, options,
+						     infop, stat_addr, ru);
+			if (ret)
+				return ret;
+		}
 	}
 
 	return 0;
@@ -1601,21 +1618,16 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
 	struct task_struct *p;
 
 	/*
-	 * If we never saw an eligile child, check for children stolen by
-	 * ptrace.  We don't leave -ECHILD in *@notask_error if there are any,
-	 * because we will eventually be allowed to wait for them again.
+	 * Traditionally we see ptrace'd stopped tasks regardless of options.
 	 */
-	if (!*notask_error)
-		return 0;
+	options |= WUNTRACED;
 
-	list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
-		int ret = eligible_child(type, pid, options, p);
-		if (unlikely(ret < 0))
+	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
+		int ret = wait_consider_task(tsk, 1, p, notask_error,
+					     type, pid, options,
+					     infop, stat_addr, ru);
+		if (ret)
 			return ret;
-		if (ret) {
-			*notask_error = 0;
-			return 0;
-		}
 	}
 
 	return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4bd2f516401f..adefc1131f27 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1125,8 +1125,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
-	INIT_LIST_HEAD(&p->ptrace_children);
-	INIT_LIST_HEAD(&p->ptrace_list);
+	INIT_LIST_HEAD(&p->ptrace_entry);
+	INIT_LIST_HEAD(&p->ptraced);
 
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
@@ -1198,7 +1198,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (likely(p->pid)) {
-		add_parent(p);
+		list_add_tail(&p->sibling, &p->real_parent->children);
 		if (unlikely(p->ptrace & PT_PTRACED))
 			__ptrace_link(p, current->parent);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e337390fce01..8392a9da6450 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -33,13 +33,9 @@
  */
 void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 {
-	BUG_ON(!list_empty(&child->ptrace_list));
-	if (child->parent == new_parent)
-		return;
-	list_add(&child->ptrace_list, &child->parent->ptrace_children);
-	remove_parent(child);
+	BUG_ON(!list_empty(&child->ptrace_entry));
+	list_add(&child->ptrace_entry, &new_parent->ptraced);
 	child->parent = new_parent;
-	add_parent(child);
 }
  
 /*
@@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child)
 	BUG_ON(!child->ptrace);
 
 	child->ptrace = 0;
-	if (ptrace_reparented(child)) {
-		list_del_init(&child->ptrace_list);
-		remove_parent(child);
-		child->parent = child->real_parent;
-		add_parent(child);
-	}
+	child->parent = child->real_parent;
+	list_del_init(&child->ptrace_entry);
 
 	if (task_is_traced(child))
 		ptrace_untrace(child);
@@ -492,15 +484,34 @@ int ptrace_traceme(void)
 	/*
 	 * Are we already being traced?
 	 */
+repeat:
 	task_lock(current);
 	if (!(current->ptrace & PT_PTRACED)) {
+		/*
+		 * See ptrace_attach() comments about the locking here.
+		 */
+		unsigned long flags;
+		if (!write_trylock_irqsave(&tasklist_lock, flags)) {
+			task_unlock(current);
+			do {
+				cpu_relax();
+			} while (!write_can_lock(&tasklist_lock));
+			goto repeat;
+		}
+
 		ret = security_ptrace(current->parent, current,
 				      PTRACE_MODE_ATTACH);
+
 		/*
 		 * Set the ptrace bit in the process ptrace flags.
+		 * Then link us on our parent's ptraced list.
 		 */
-		if (!ret)
+		if (!ret) {
 			current->ptrace |= PT_PTRACED;
+			__ptrace_link(current, current->real_parent);
+		}
+
+		write_unlock_irqrestore(&tasklist_lock, flags);
 	}
 	task_unlock(current);
 	return ret;
-- 
cgit v1.2.3


From 14dd0b81414a58caf0296dbeace016bb0a5d11ab Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sun, 30 Mar 2008 18:41:25 -0700
Subject: do_wait: return security_task_wait() error code in place of -ECHILD

This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3
"do_wait: fix security checks".  That change reverted the effect of commit
73243284463a761e04d69d22c7516b2be7de096c.  The rationale for the original
commit still stands.  The inconsistent treatment of children hidden by
ptrace was an unintended omission in the original change and in no way
invalidates its purpose.

This makes do_wait return the error returned by security_task_wait()
(usually -EACCES) in place of -ECHILD when there are some children the
caller would be able to wait for if not for the permission failure.  A
permission error will give the user a clue to look for security policy
problems, rather than for mysterious wait bugs.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 1e909826a804..a2af6cac823c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1199,14 +1199,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 		return 0;
 
 	err = security_task_wait(p);
-	if (likely(!err))
-		return 1;
+	if (err)
+		return err;
 
-	if (type != PIDTYPE_PID)
-		return 0;
-	/* This child was explicitly requested, abort */
-	read_unlock(&tasklist_lock);
-	return err;
+	return 1;
 }
 
 static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1536,7 +1532,8 @@ static int wait_task_continued(struct task_struct *p, int options,
  * -ECHILD should be in *@notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue;
- * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
+ * then *@notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
  */
 static int wait_consider_task(struct task_struct *parent, int ptrace,
 			      struct task_struct *p, int *notask_error,
@@ -1545,9 +1542,21 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 			      int __user *stat_addr, struct rusage __user *ru)
 {
 	int ret = eligible_child(type, pid, options, p);
-	if (ret <= 0)
+	if (!ret)
 		return ret;
 
+	if (unlikely(ret < 0)) {
+		/*
+		 * If we have not yet seen any eligible child,
+		 * then let this error code replace -ECHILD.
+		 * A permission error will give the user a clue
+		 * to look for security policy problems, rather
+		 * than for mysterious wait bugs.
+		 */
+		if (*notask_error)
+			*notask_error = ret;
+	}
+
 	if (likely(!ptrace) && unlikely(p->ptrace)) {
 		/*
 		 * This child is hidden by ptrace.
@@ -1585,7 +1594,8 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
  * -ECHILD should be in *@notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then
- * *@notask_error is 0 if there were any eligible children, or still -ECHILD.
+ * *@notask_error is 0 if there were any eligible children,
+ * or another error from security_task_wait(), or still -ECHILD.
  */
 static int do_wait_thread(struct task_struct *tsk, int *notask_error,
 			  enum pid_type type, struct pid *pid, int options,
-- 
cgit v1.2.3


From 666f164f4fbfa78bd00fb4b74788b42a39842c64 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 8 Apr 2008 23:12:30 -0700
Subject: fix dangling zombie when new parent ignores children

This fixes an arcane bug that we think was a regression introduced
by commit b2b2cbc4b2a2f389442549399a993a8306420baf.  When a parent
ignores SIGCHLD (or uses SA_NOCLDWAIT), its children would self-reap
but they don't if it's using ptrace on them.  When the parent thread
later exits and ceases to ptrace a child but leaves other live
threads in the parent's thread group, any zombie children are left
dangling.  The fix makes them self-reap then, as they would have
done earlier if ptrace had not been in use.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index a2af6cac823c..93d2711b9381 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -702,6 +702,23 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
+/*
+ * Return nonzero if @parent's children should reap themselves.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static int ignoring_children(struct task_struct *parent)
+{
+	int ret;
+	struct sighand_struct *psig = parent->sighand;
+	unsigned long flags;
+	spin_lock_irqsave(&psig->siglock, flags);
+	ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+	       (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
+	spin_unlock_irqrestore(&psig->siglock, flags);
+	return ret;
+}
+
 /*
  * Detach all tasks we were using ptrace on.
  * Any that need to be release_task'd are put on the @dead list.
@@ -711,6 +728,7 @@ static void exit_mm(struct task_struct * tsk)
 static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 {
 	struct task_struct *p, *n;
+	int ign = -1;
 
 	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
 		__ptrace_unlink(p);
@@ -726,10 +744,18 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 		 * release_task() here because we already hold tasklist_lock.
 		 *
 		 * If it's our own child, there is no notification to do.
+		 * But if our normal children self-reap, then this child
+		 * was prevented by ptrace and we must reap it now.
 		 */
 		if (!task_detached(p) && thread_group_empty(p)) {
 			if (!same_thread_group(p->real_parent, parent))
 				do_notify_parent(p, p->exit_signal);
+			else {
+				if (ign < 0)
+					ign = ignoring_children(parent);
+				if (ign)
+					p->exit_signal = -1;
+			}
 		}
 
 		if (task_detached(p)) {
-- 
cgit v1.2.3


From c349e0a01c3e0f70913db6a5bb61ab204e0602de Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 15 Apr 2008 22:39:31 +0200
Subject: ftrace: do not trace scheduler functions

do not trace scheduler functions - it's still a bit fragile
and can lock up with:

  http://redhat.com/~mingo/misc/config-Thu_Jul_17_13_34_52_CEST_2008

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 0a7ed838984b..985ddb7da4d0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,8 +11,6 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
-CFLAGS_REMOVE_sched.o = -mno-spe
-
 ifdef CONFIG_FTRACE
 # Do not trace debug files and internal ftrace files
 CFLAGS_REMOVE_lockdep.o = -pg
@@ -21,6 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
-- 
cgit v1.2.3


From 93a0886e2368eafb9df5e2021fb185195cee88b2 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Tue, 15 Jul 2008 13:43:42 -0700
Subject: x86, xen, power: fix up config dependencies on PM

Xen save/restore needs bits of code enabled by PM_SLEEP, and PM_SLEEP
depends on PM.  So make XEN_SAVE_RESTORE depend on PM and PM_SLEEP
depend on XEN_SAVE_RESTORE.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/power/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b45da40e8d25..59dfdf1e1d20 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
 
 config PM_SLEEP
 	bool
-	depends on SUSPEND || HIBERNATION
+	depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
 	default y
 
 config SUSPEND
-- 
cgit v1.2.3


From d88c16919793a9f5dc93e7956da5bb089c7600b4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 18 Jul 2008 08:59:24 +0200
Subject: Revert parts of "ftrace: do not trace scheduler functions"

the removal of -mno-spe in the !ftrace case was not intended.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 985ddb7da4d0..15ab63ffe64d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,8 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
+CFLAGS_REMOVE_sched.o = -mno-spe
+
 ifdef CONFIG_FTRACE
 # Do not trace debug files and internal ftrace files
 CFLAGS_REMOVE_lockdep.o = -pg
-- 
cgit v1.2.3


From 13b40c1e40f3261e83ee514a08b77dbecb93021b Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 1 Jul 2008 10:32:50 -0700
Subject: sched: reduce stack size in isolated_cpu_setup()

  * Remove 16k stack requirements in isolated_cpu_setup when NR_CPUS=4096.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 99e6d850ecab..1ee18dbb4516 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6768,7 +6768,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-	int ints[NR_CPUS], i;
+	static int __initdata ints[NR_CPUS];
+	int i;
 
 	str = get_options(str, ARRAY_SIZE(ints), ints);
 	cpus_clear(cpu_isolated_map);
-- 
cgit v1.2.3


From 7ebefa8ceefed44cc321be70afc54a585a68ac0b Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Tue, 1 Jul 2008 23:32:15 +0200
Subject: sched: rework of "prioritize non-migratable tasks over migratable
 ones"

(1) handle in a generic way all cases when a newly woken-up task is
not migratable (not just a corner case when "rt_se->nr_cpus_allowed ==
1")

(2) if current is to be preempted, then make sure "p" will be picked
up by pick_next_task_rt().
i.e. move task's group at the head of its list as well.

currently, it's not a case for the group-scheduling case as described
here: http://www.ussg.iu.edu/hypermail/linux/kernel/0807.0/0134.html

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 68 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 47ceac9e8552..d3d1cccb3d7b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -599,11 +599,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
 		return;
 
-	if (rt_se->nr_cpus_allowed == 1)
-		list_add(&rt_se->run_list, queue);
-	else
-		list_add_tail(&rt_se->run_list, queue);
-
+	list_add_tail(&rt_se->run_list, queue);
 	__set_bit(rt_se_prio(rt_se), array->bitmap);
 
 	inc_rt_tasks(rt_se, rt_rq);
@@ -688,32 +684,34 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
  */
-static
-void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+static void
+requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 {
-	struct rt_prio_array *array = &rt_rq->active;
-
 	if (on_rt_rq(rt_se)) {
-		list_del_init(&rt_se->run_list);
-		list_add_tail(&rt_se->run_list,
-			      array->queue + rt_se_prio(rt_se));
+		struct rt_prio_array *array = &rt_rq->active;
+		struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+		if (head)
+			list_move(&rt_se->run_list, queue);
+		else
+			list_move_tail(&rt_se->run_list, queue);
 	}
 }
 
-static void requeue_task_rt(struct rq *rq, struct task_struct *p)
+static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
 	struct rt_rq *rt_rq;
 
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
-		requeue_rt_entity(rt_rq, rt_se);
+		requeue_rt_entity(rt_rq, rt_se, head);
 	}
 }
 
 static void yield_task_rt(struct rq *rq)
 {
-	requeue_task_rt(rq, rq->curr);
+	requeue_task_rt(rq, rq->curr, 0);
 }
 
 #ifdef CONFIG_SMP
@@ -753,6 +751,30 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 	 */
 	return task_cpu(p);
 }
+
+static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+{
+	cpumask_t mask;
+
+	if (rq->curr->rt.nr_cpus_allowed == 1)
+		return;
+
+	if (p->rt.nr_cpus_allowed != 1
+	    && cpupri_find(&rq->rd->cpupri, p, &mask))
+		return;
+
+	if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+		return;
+
+	/*
+	 * There appears to be other cpus that can accept
+	 * current and none to run 'p', so lets reschedule
+	 * to try and push current away:
+	 */
+	requeue_task_rt(rq, p, 1);
+	resched_task(rq->curr);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -778,18 +800,8 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 	 * to move current somewhere else, making room for our non-migratable
 	 * task.
 	 */
-	if((p->prio == rq->curr->prio)
-	   && p->rt.nr_cpus_allowed == 1
-	   && rq->curr->rt.nr_cpus_allowed != 1) {
-		cpumask_t mask;
-
-		if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
-			/*
-			 * There appears to be other cpus that can accept
-			 * current, so lets reschedule to try and push it away
-			 */
-			resched_task(rq->curr);
-	}
+	if (p->prio == rq->curr->prio && !need_resched())
+		check_preempt_equal_prio(rq, p);
 #endif
 }
 
@@ -1415,7 +1427,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	 * on the queue:
 	 */
 	if (p->rt.run_list.prev != p->rt.run_list.next) {
-		requeue_task_rt(rq, p);
+		requeue_task_rt(rq, p, 0);
 		set_tsk_need_resched(p);
 	}
 }
-- 
cgit v1.2.3


From e761b7725234276a802322549cee5255305a0930 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Tue, 15 Jul 2008 04:43:49 -0700
Subject: cpu hotplug, sched: Introduce cpu_active_map and redo sched domain
 managment (take 2)

This is based on Linus' idea of creating cpu_active_map that prevents
scheduler load balancer from migrating tasks to the cpu that is going
down.

It allows us to simplify domain management code and avoid unecessary
domain rebuilds during cpu hotplug event handling.

Please ignore the cpusets part for now. It needs some more work in order
to avoid crazy lock nesting. Although I did simplfy and unify domain
reinitialization logic. We now simply call partition_sched_domains() in
all the cases. This means that we're using exact same code paths as in
cpusets case and hence the test below cover cpusets too.
Cpuset changes to make rebuild_sched_domains() callable from various
contexts are in the separate patch (right next after this one).

This not only boots but also easily handles
	while true; do make clean; make -j 8; done
and
	while true; do on-off-cpu 1; done
at the same time.
(on-off-cpu 1 simple does echo 0/1 > /sys/.../cpu1/online thing).

Suprisingly the box (dual-core Core2) is quite usable. In fact I'm typing
this on right now in gnome-terminal and things are moving just fine.

Also this is running with most of the debug features enabled (lockdep,
mutex, etc) no BUG_ONs or lockdep complaints so far.

I believe I addressed all of the Dmitry's comments for original Linus'
version. I changed both fair and rt balancer to mask out non-active cpus.
And replaced cpu_is_offline() with !cpu_active() in the main scheduler
code where it made sense (to me).

Signed-off-by: Max Krasnyanskiy <maxk@qualcomm.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Cc: dmitry.adamushko@gmail.com
Cc: pj@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c        |  30 ++++++++++++---
 kernel/cpuset.c     |   2 +-
 kernel/sched.c      | 108 ++++++++++++++++++++++------------------------------
 kernel/sched_fair.c |   3 ++
 kernel/sched_rt.c   |   7 ++++
 5 files changed, 80 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index cfb1d43ab801..a1ac7ea245d7 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
 	cpu_hotplug.refcount = 0;
 }
 
+cpumask_t cpu_active_map;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu)
 	int err = 0;
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_down(cpu, 0);
+		goto out;
+	}
+
+	cpu_clear(cpu, cpu_active_map);
+
+	err = _cpu_down(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
 
+out:
 	cpu_maps_update_done();
 	return err;
 }
@@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu)
 	}
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_up(cpu, 0);
+		goto out;
+	}
 
+	err = _cpu_up(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
+
+out:
 	cpu_maps_update_done();
 	return err;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..3c3ef02f65f1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  *	partition_sched_domains().
  */
 
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
 	struct kfifo *q;	/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ee18dbb4516..c237624a8a04 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 
 	rq = task_rq_lock(p, &flags);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
-	    || unlikely(cpu_is_offline(dest_cpu)))
+	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
 	/* force the process onto the specified CPU */
@@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick)
 		/*
 		 * If we are going offline and still the leader, give up!
 		 */
-		if (cpu_is_offline(cpu) &&
+		if (!cpu_active(cpu) &&
 		    atomic_read(&nohz.load_balancer) == cpu) {
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
@@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	struct rq *rq_dest, *rq_src;
 	int ret = 0, on_rq;
 
-	if (unlikely(cpu_is_offline(dest_cpu)))
+	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 
 	rq_src = cpu_rq(src_cpu);
@@ -7553,18 +7553,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 {
 }
 
-/*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-	ndoms_cur = 0;
-	if (doms_cur != &fallback_doms)
-		kfree(doms_cur);
-	doms_cur = &fallback_doms;
-}
-
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * ownership of it and will kfree it when done with it. If the caller
  * failed the kmalloc call, then it can pass in doms_new == NULL,
  * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * Call with hotplug lock held
  */
@@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL) {
-		ndoms_new = 1;
-		doms_new = &fallback_doms;
-		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
-	}
+	if (doms_new == NULL)
+		ndoms_new = 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
@@ -7677,6 +7661,14 @@ match1:
 		;
 	}
 
+	if (doms_new == NULL) {
+		ndoms_cur = 0;
+		ndoms_new = 1;
+		doms_new = &fallback_doms;
+		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+		dattr_new = NULL;
+	}
+
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
@@ -7707,17 +7699,10 @@ match2:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
-	int err;
-
 	get_online_cpus();
-	mutex_lock(&sched_domains_mutex);
-	detach_destroy_domains(&cpu_online_map);
-	free_sched_domains();
-	err = arch_init_sched_domains(&cpu_online_map);
-	mutex_unlock(&sched_domains_mutex);
+	rebuild_sched_domains();
 	put_online_cpus();
-
-	return err;
+	return 0;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7783,14 +7768,30 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+#ifndef CONFIG_CPUSETS
 /*
- * Force a reinitialization of the sched domains hierarchy. The domains
- * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
+ * Add online and remove offline CPUs from the scheduler domains.
+ * When cpusets are enabled they take over this function.
  */
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		partition_sched_domains(0, NULL, NULL);
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
 {
 	int cpu = (int)(long)hcpu;
 
@@ -7798,44 +7799,18 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		disable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		detach_destroy_domains(&cpu_online_map);
-		free_sched_domains();
 		return NOTIFY_OK;
 
-
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		enable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		/*
-		 * Fall through and re-initialise the domains.
-		 */
-		break;
+		return NOTIFY_OK;
+
 	default:
 		return NOTIFY_DONE;
 	}
-
-#ifndef CONFIG_CPUSETS
-	/*
-	 * Create default domain partitioning if cpusets are disabled.
-	 * Otherwise we let cpusets rebuild the domains based on the
-	 * current setup.
-	 */
-
-	/* The hotplug lock is already held by cpu_up/cpu_down */
-	arch_init_sched_domains(&cpu_online_map);
-#endif
-
-	return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7855,8 +7830,15 @@ void __init sched_init_smp(void)
 		cpu_set(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+	/* RT runtime code needs to handle some hotplug events */
+	hotcpu_notifier(update_runtime, 0);
+
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d6..d924c679dfac 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq)
  * not idle and an idle cpu is available.  The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
  *
  * Returns the CPU we should wake onto.
  */
@@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
+			cpus_and(tmp, tmp, cpu_active_map);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d3d1cccb3d7b..50735bb96149 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -933,6 +933,13 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return -1; /* No targets found */
 
+	/*
+	 * Only consider CPUs that are usable for migration.
+	 * I guess we might want to change cpupri_find() to ignore those
+	 * in the first place.
+	 */
+	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system.  Now we want to elect
-- 
cgit v1.2.3


From 39b0fad7121eace85770e7a4c6dc35dfd2879768 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Tue, 15 Jul 2008 20:56:26 -0700
Subject: cpu hotplug: Make cpu_active_map synchronization dependency clear

This goes on top of the cpu_active_map (take 2) patch.

Currently we depend on the stop_machine to provide nescessesary
synchronization for the cpu_active_map updates.
As Dmitry Adamushko pointed this is fragile and is not much clearer
than the previous scheme. In other words we do not want to depend on
the internal stop machine operation here.
So make the synchronization rules clear by doing synchronize_sched()
after clearing out cpu active bit.

Tested on quad-Core2 with:

   while true; do
      for i in 1 2 3; do
        echo 0 > /sys/devices/system/cpu/cpu$i/online
      done
      for i in 1 2 3; do
        echo 1 > /sys/devices/system/cpu/cpu$i/online
      done
   done
and
   stress -c 200

No lockdep, preempt or other complaints.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index a1ac7ea245d7..033603c1d7c3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -301,6 +301,16 @@ int __ref cpu_down(unsigned int cpu)
 
 	cpu_clear(cpu, cpu_active_map);
 
+	/*
+	 * Make sure the all cpus did the reschedule and are not
+	 * using stale version of the cpu_active_map.
+	 * This is not strictly necessary becuase stop_machine()
+	 * that we run down the line already provides the required
+	 * synchronization. But it's really a side effect and we do not
+	 * want to depend on the innards of the stop_machine here.
+	 */
+	synchronize_sched();
+
 	err = _cpu_down(cpu, 0);
 
 	if (cpu_online(cpu))
-- 
cgit v1.2.3


From e59494f441c834ca7aaa0e6fa6678ddbd3d72743 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 16 Jul 2008 00:13:45 -0400
Subject: ftrace: fix 4d3702b6 (post-v2.6.26): WARNING: at
 kernel/lockdep.c:2731 check_flags (ftrace)

On Wed, 16 Jul 2008, Vegard Nossum wrote:

> When booting 4d3702b6, I got this huge thing:
>
> Testing tracer wakeup: <4>------------[ cut here ]------------
> WARNING: at kernel/lockdep.c:2731 check_flags+0x123/0x160()
> Modules linked in:
> Pid: 1, comm: swapper Not tainted 2.6.26-crashing-02127-g4d3702b6 #30
>  [<c015c349>] warn_on_slowpath+0x59/0xb0
>  [<c01276c6>] ? ftrace_call+0x5/0x8
>  [<c012d800>] ? native_read_tsc+0x0/0x20
>  [<c0158de2>] ? sub_preempt_count+0x12/0xf0
>  [<c01814eb>] ? trace_hardirqs_off+0xb/0x10
>  [<c0182fbc>] ? __lock_acquire+0x2cc/0x1120
>  [<c01814eb>] ? trace_hardirqs_off+0xb/0x10
>  [<c01276af>] ? mcount_call+0x5/0xa
>  [<c017ff53>] check_flags+0x123/0x160
>  [<c0183e61>] lock_acquire+0x51/0xd0
>  [<c01276c6>] ? ftrace_call+0x5/0x8
>  [<c0613d4f>] _spin_lock_irqsave+0x5f/0xa0
>  [<c01a8d45>] ? ftrace_record_ip+0xf5/0x220
>  [<c02d5413>] ? debug_locks_off+0x3/0x50
>  [<c01a8d45>] ftrace_record_ip+0xf5/0x220
>  [<c01276af>] mcount_call+0x5/0xa
>  [<c02d5418>] ? debug_locks_off+0x8/0x50
>  [<c017ff27>] check_flags+0xf7/0x160
>  [<c0183e61>] lock_acquire+0x51/0xd0
>  [<c01276c6>] ? ftrace_call+0x5/0x8
>  [<c0613d4f>] _spin_lock_irqsave+0x5f/0xa0
>  [<c01affcd>] ? wakeup_tracer_call+0x6d/0xf0
>  [<c01625e2>] ? _local_bh_enable+0x62/0xb0
>  [<c0158ddd>] ? sub_preempt_count+0xd/0xf0
>  [<c01affcd>] wakeup_tracer_call+0x6d/0xf0
>  [<c0162724>] ? __do_softirq+0xf4/0x110
>  [<c01afff1>] ? wakeup_tracer_call+0x91/0xf0
>  [<c01276c6>] ftrace_call+0x5/0x8
>  [<c0162724>] ? __do_softirq+0xf4/0x110
>  [<c0158de2>] ? sub_preempt_count+0x12/0xf0
>  [<c01625e2>] _local_bh_enable+0x62/0xb0
>  [<c0162724>] __do_softirq+0xf4/0x110
>  [<c01627ed>] do_softirq+0xad/0xb0
>  [<c0162a15>] irq_exit+0xa5/0xb0
>  [<c013a506>] smp_apic_timer_interrupt+0x66/0xa0
>  [<c02d3fac>] ? trace_hardirqs_off_thunk+0xc/0x10
>  [<c0127449>] apic_timer_interrupt+0x2d/0x34
>  [<c018007b>] ? find_usage_backwards+0xb/0xf0
>  [<c0613a09>] ? _spin_unlock_irqrestore+0x69/0x80
>  [<c014ef32>] tg_shares_up+0x132/0x1d0
>  [<c014d2a2>] walk_tg_tree+0x62/0xa0
>  [<c014ee00>] ? tg_shares_up+0x0/0x1d0
>  [<c014a860>] ? tg_nop+0x0/0x10
>  [<c015499d>] update_shares+0x5d/0x80
>  [<c0154a2f>] try_to_wake_up+0x6f/0x280
>  [<c01a8b90>] ? __ftrace_modify_code+0x0/0xc0
>  [<c01a8b90>] ? __ftrace_modify_code+0x0/0xc0
>  [<c0154c94>] wake_up_process+0x14/0x20
>  [<c01725f6>] kthread_create+0x66/0xb0
>  [<c0195400>] ? do_stop+0x0/0x200
>  [<c0195320>] ? __stop_machine_run+0x30/0xb0
>  [<c0195340>] __stop_machine_run+0x50/0xb0
>  [<c0195400>] ? do_stop+0x0/0x200
>  [<c01a8b90>] ? __ftrace_modify_code+0x0/0xc0
>  [<c061242d>] ? mutex_unlock+0xd/0x10
>  [<c01953cc>] stop_machine_run+0x2c/0x60
>  [<c01a94d3>] unregister_ftrace_function+0x103/0x180
>  [<c01b0517>] stop_wakeup_tracer+0x17/0x60
>  [<c01b056f>] wakeup_tracer_ctrl_update+0xf/0x30
>  [<c01ab8d5>] trace_selftest_startup_wakeup+0xb5/0x130
>  [<c01ab950>] ? trace_wakeup_test_thread+0x0/0x70
>  [<c01aadf5>] register_tracer+0x135/0x1b0
>  [<c0877d02>] init_wakeup_tracer+0xd/0xf
>  [<c085d437>] kernel_init+0x1a9/0x2ce
>  [<c061397b>] ? _spin_unlock_irq+0x3b/0x60
>  [<c02d3f9c>] ? trace_hardirqs_on_thunk+0xc/0x10
>  [<c0877cf5>] ? init_wakeup_tracer+0x0/0xf
>  [<c0182646>] ? trace_hardirqs_on_caller+0x126/0x180
>  [<c02d3f9c>] ? trace_hardirqs_on_thunk+0xc/0x10
>  [<c01269c8>] ? restore_nocheck_notrace+0x0/0xe
>  [<c085d28e>] ? kernel_init+0x0/0x2ce
>  [<c085d28e>] ? kernel_init+0x0/0x2ce
>  [<c01275fb>] kernel_thread_helper+0x7/0x10
>  =======================
> ---[ end trace a7919e7f17c0a725 ]---
> irq event stamp: 579530
> hardirqs last  enabled at (579528): [<c01826ab>] trace_hardirqs_on+0xb/0x10
> hardirqs last disabled at (579529): [<c01814eb>] trace_hardirqs_off+0xb/0x10
> softirqs last  enabled at (579530): [<c0162724>] __do_softirq+0xf4/0x110
> softirqs last disabled at (579517): [<c01627ed>] do_softirq+0xad/0xb0
> irq event stamp: 579530
> hardirqs last  enabled at (579528): [<c01826ab>] trace_hardirqs_on+0xb/0x10
> hardirqs last disabled at (579529): [<c01814eb>] trace_hardirqs_off+0xb/0x10
> softirqs last  enabled at (579530): [<c0162724>] __do_softirq+0xf4/0x110
> softirqs last disabled at (579517): [<c01627ed>] do_softirq+0xad/0xb0
> PASSED
>
> Incidentally, the kernel also hung while I was typing in this report.

Things get weird between lockdep and ftrace because ftrace can be called
within lockdep internal code (via the mcount pointer) and lockdep can be
called with ftrace (via spin_locks).

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Tested-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c8d61df4474..e303ccb62cdf 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -26,7 +26,8 @@ static struct task_struct	*wakeup_task;
 static int			wakeup_cpu;
 static unsigned			wakeup_prio = -1;
 
-static DEFINE_SPINLOCK(wakeup_lock);
+static raw_spinlock_t wakeup_lock =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 
 static void __wakeup_reset(struct trace_array *tr);
 
@@ -56,7 +57,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 	if (unlikely(disabled != 1))
 		goto out;
 
-	spin_lock_irqsave(&wakeup_lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&wakeup_lock);
 
 	if (unlikely(!wakeup_task))
 		goto unlock;
@@ -71,7 +73,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 	trace_function(tr, data, ip, parent_ip, flags);
 
  unlock:
-	spin_unlock_irqrestore(&wakeup_lock, flags);
+	__raw_spin_unlock(&wakeup_lock);
+	local_irq_restore(flags);
 
  out:
 	atomic_dec(&data->disabled);
@@ -145,7 +148,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
 	if (likely(disabled != 1))
 		goto out;
 
-	spin_lock_irqsave(&wakeup_lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&wakeup_lock);
 
 	/* We could race with grabbing wakeup_lock */
 	if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -174,7 +178,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
 
 out_unlock:
 	__wakeup_reset(tr);
-	spin_unlock_irqrestore(&wakeup_lock, flags);
+	__raw_spin_unlock(&wakeup_lock);
+	local_irq_restore(flags);
 out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
@@ -209,8 +214,6 @@ static void __wakeup_reset(struct trace_array *tr)
 	struct trace_array_cpu *data;
 	int cpu;
 
-	assert_spin_locked(&wakeup_lock);
-
 	for_each_possible_cpu(cpu) {
 		data = tr->data[cpu];
 		tracing_reset(data);
@@ -229,9 +232,11 @@ static void wakeup_reset(struct trace_array *tr)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&wakeup_lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&wakeup_lock);
 	__wakeup_reset(tr);
-	spin_unlock_irqrestore(&wakeup_lock, flags);
+	__raw_spin_unlock(&wakeup_lock);
+	local_irq_restore(flags);
 }
 
 static void
@@ -252,7 +257,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
 		goto out;
 
 	/* interrupts should be off from try_to_wake_up */
-	spin_lock(&wakeup_lock);
+	__raw_spin_lock(&wakeup_lock);
 
 	/* check for races. */
 	if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -274,7 +279,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
 		       CALLER_ADDR1, CALLER_ADDR2, flags);
 
 out_locked:
-	spin_unlock(&wakeup_lock);
+	__raw_spin_unlock(&wakeup_lock);
 out:
 	atomic_dec(&tr->data[cpu]->disabled);
 }
-- 
cgit v1.2.3


From 577b4a58d2e74a4d48050eeea3e3f952ce04eb86 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Jul 2008 13:34:54 +0100
Subject: sched: fix warning in inc_rt_tasks() to not declare variable 'rq' if
 it's not needed

Fix inc_rt_tasks() to not declare variable 'rq' if it's not needed.  It is
declared if CONFIG_SMP or CONFIG_RT_GROUP_SCHED, but only used if CONFIG_SMP.

This is a consequence of patch 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 plus
patch 1100ac91b6af02d8639d518fad5b434b1bf44ed6.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 47ceac9e8552..147004c651c0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -505,7 +505,9 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+#ifdef CONFIG_SMP
 		struct rq *rq = rq_of_rt_rq(rt_rq);
+#endif
 
 		rt_rq->highest_prio = rt_se_prio(rt_se);
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 6 Jul 2008 17:23:55 +0800
Subject: rcu classic: simplify the next pending batch

use a batch number(rcp->pending) instead of a flag(rcp->next_pending)

rcu_start_batch() need to change this flag, so mb()s is needed
for memory-access safe.

but(after this patch applied) rcu_start_batch() do not change
this batch number(rcp->pending), rcp->pending is managed by
__rcu_process_callbacks only, and troublesome mb()s are eliminated.

And codes look simpler and clearer.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Gautham Shenoy <ego@in.ibm.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 16eeeaa9d618..03726eb95193 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -60,12 +60,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
 static struct rcu_ctrlblk rcu_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
+	.pending = -300,
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
 	.cpumask = CPU_MASK_NONE,
 };
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
+	.pending = -300,
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
 	.cpumask = CPU_MASK_NONE,
 };
@@ -276,14 +278,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
  */
 static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 {
-	if (rcp->next_pending &&
+	if (rcp->cur != rcp->pending &&
 			rcp->completed == rcp->cur) {
-		rcp->next_pending = 0;
-		/*
-		 * next_pending == 0 must be visible in
-		 * __rcu_process_callbacks() before it can see new value of cur.
-		 */
-		smp_wmb();
 		rcp->cur++;
 
 		/*
@@ -441,16 +437,14 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 
 		/* determine batch number */
 		rdp->batch = rcp->cur + 1;
-		/* see the comment and corresponding wmb() in
-		 * the rcu_start_batch()
-		 */
-		smp_rmb();
 
-		if (!rcp->next_pending) {
+		if (rcu_batch_after(rdp->batch, rcp->pending)) {
 			/* and start it/schedule start if it's a new batch */
 			spin_lock(&rcp->lock);
-			rcp->next_pending = 1;
-			rcu_start_batch(rcp);
+			if (rcu_batch_after(rdp->batch, rcp->pending)) {
+				rcp->pending = rdp->batch;
+				rcu_start_batch(rcp);
+			}
 			spin_unlock(&rcp->lock);
 		}
 	}
-- 
cgit v1.2.3


From 5127bed588a2f8f3a1f732de2a8a190b7df5dce3 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 6 Jul 2008 17:23:59 +0800
Subject: rcu classic: new algorithm for callbacks-processing(v2)

This is v2, it's a little deference from v1 that I
had send to lkml.
use ACCESS_ONCE
use rcu_batch_after/rcu_batch_before for batch # comparison.

rcutorture test result:
(hotplugs: do cpu-online/offline once per second)

No CONFIG_NO_HZ:           OK, 12hours
No CONFIG_NO_HZ, hotplugs: OK, 12hours
CONFIG_NO_HZ=y:            OK, 24hours
CONFIG_NO_HZ=y, hotplugs:  Failed.
(Failed also without my patch applied, exactly the same bug occurred,
http://lkml.org/lkml/2008/7/3/24)

v1's email thread:
http://lkml.org/lkml/2008/6/2/539

v1's description:

The code/algorithm of the implement of current callbacks-processing
is very efficient and technical. But when I studied it and I found
a disadvantage:

In multi-CPU systems, when a new RCU callback is being
queued(call_rcu[_bh]), this callback will be invoked after the grace
period for the batch with batch number = rcp->cur+2 has completed
very very likely in current implement. Actually, this callback can be
invoked after the grace period for the batch with
batch number = rcp->cur+1 has completed. The delay of invocation means
that latency of synchronize_rcu() is extended. But more important thing
is that the callbacks usually free memory, and these works are delayed
too! it's necessary for reclaimer to free memory as soon as
possible when left memory is few.

A very simple way can solve this problem:
a field(struct rcu_head::batch) is added to record the batch number for
the RCU callback. And when a new RCU callback is being queued, we
determine the batch number for this callback(head->batch = rcp->cur+1)
and we move this callback to rdp->donelist if we find
that head->batch <= rcp->completed when we process callbacks.
This simple way reduces the wait time for invocation a lot. (about
2.5Grace Period -> 1.5Grace Period in average in multi-CPU systems)

This is my algorithm. But I do not add any field for struct rcu_head
in my implement. We just need to memorize the last 2 batches and
their batch number, because these 2 batches include all entries that
for whom the grace period hasn't completed. So we use a special
linked-list rather than add a field.
Please see the comment of struct rcu_data.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Gautham Shenoy <ego@in.ibm.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 157 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 97 insertions(+), 60 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 03726eb95193..d3553ee55f64 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -120,6 +120,43 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
 }
 #endif
 
+static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
+		struct rcu_data *rdp)
+{
+	long batch;
+	smp_mb(); /* reads the most recently updated value of rcu->cur. */
+
+	/*
+	 * Determine the batch number of this callback.
+	 *
+	 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
+	 * local variable "batch" and emits codes like this:
+	 *	1) rdp->batch = rcp->cur + 1 # gets old value
+	 *	......
+	 *	2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
+	 * then [*nxttail[0], *nxttail[1]) may contain callbacks
+	 * that batch# = rdp->batch, see the comment of struct rcu_data.
+	 */
+	batch = ACCESS_ONCE(rcp->cur) + 1;
+
+	if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
+		/* process callbacks */
+		rdp->nxttail[0] = rdp->nxttail[1];
+		rdp->nxttail[1] = rdp->nxttail[2];
+		if (rcu_batch_after(batch - 1, rdp->batch))
+			rdp->nxttail[0] = rdp->nxttail[2];
+	}
+
+	rdp->batch = batch;
+	*rdp->nxttail[2] = head;
+	rdp->nxttail[2] = &head->next;
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
+}
+
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -135,18 +172,11 @@ void call_rcu(struct rcu_head *head,
 				void (*func)(struct rcu_head *rcu))
 {
 	unsigned long flags;
-	struct rcu_data *rdp;
 
 	head->func = func;
 	head->next = NULL;
 	local_irq_save(flags);
-	rdp = &__get_cpu_var(rcu_data);
-	*rdp->nxttail = head;
-	rdp->nxttail = &head->next;
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_ctrlblk);
-	}
+	__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
@@ -171,20 +201,11 @@ void call_rcu_bh(struct rcu_head *head,
 				void (*func)(struct rcu_head *rcu))
 {
 	unsigned long flags;
-	struct rcu_data *rdp;
 
 	head->func = func;
 	head->next = NULL;
 	local_irq_save(flags);
-	rdp = &__get_cpu_var(rcu_bh_data);
-	*rdp->nxttail = head;
-	rdp->nxttail = &head->next;
-
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
-	}
-
+	__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -213,12 +234,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 static inline void raise_rcu_softirq(void)
 {
 	raise_softirq(RCU_SOFTIRQ);
-	/*
-	 * The smp_mb() here is required to ensure that this cpu's
-	 * __rcu_process_callbacks() reads the most recently updated
-	 * value of rcu->cur.
-	 */
-	smp_mb();
 }
 
 /*
@@ -360,13 +375,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
  * which is dead and hence not processing interrupts.
  */
 static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-				struct rcu_head **tail)
+				struct rcu_head **tail, long batch)
 {
-	local_irq_disable();
-	*this_rdp->nxttail = list;
-	if (list)
-		this_rdp->nxttail = tail;
-	local_irq_enable();
+	if (list) {
+		local_irq_disable();
+		this_rdp->batch = batch;
+		*this_rdp->nxttail[2] = list;
+		this_rdp->nxttail[2] = tail;
+		local_irq_enable();
+	}
 }
 
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
@@ -380,9 +397,9 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp,
 	if (rcp->cur != rcp->completed)
 		cpu_quiet(rdp->cpu, rcp);
 	spin_unlock_bh(&rcp->lock);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
-	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
-	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+	/* spin_lock implies smp_mb() */
+	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
+	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
 
 	local_irq_disable();
 	this_rdp->qlen += rdp->qlen;
@@ -416,27 +433,37 @@ static void rcu_offline_cpu(int cpu)
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
-		*rdp->donetail = rdp->curlist;
-		rdp->donetail = rdp->curtail;
-		rdp->curlist = NULL;
-		rdp->curtail = &rdp->curlist;
-	}
-
-	if (rdp->nxtlist && !rdp->curlist) {
+	if (rdp->nxtlist) {
 		local_irq_disable();
-		rdp->curlist = rdp->nxtlist;
-		rdp->curtail = rdp->nxttail;
-		rdp->nxtlist = NULL;
-		rdp->nxttail = &rdp->nxtlist;
-		local_irq_enable();
 
 		/*
-		 * start the next batch of callbacks
+		 * move the other grace-period-completed entries to
+		 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
+		 */
+		if (!rcu_batch_before(rcp->completed, rdp->batch))
+			rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
+		else if (!rcu_batch_before(rcp->completed, rdp->batch - 1))
+			rdp->nxttail[0] = rdp->nxttail[1];
+
+		/*
+		 * the grace period for entries in
+		 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
+		 * move these entries to donelist
 		 */
+		if (rdp->nxttail[0] != &rdp->nxtlist) {
+			*rdp->donetail = rdp->nxtlist;
+			rdp->donetail = rdp->nxttail[0];
+			rdp->nxtlist = *rdp->nxttail[0];
+			*rdp->donetail = NULL;
+
+			if (rdp->nxttail[1] == rdp->nxttail[0])
+				rdp->nxttail[1] = &rdp->nxtlist;
+			if (rdp->nxttail[2] == rdp->nxttail[0])
+				rdp->nxttail[2] = &rdp->nxtlist;
+			rdp->nxttail[0] = &rdp->nxtlist;
+		}
 
-		/* determine batch number */
-		rdp->batch = rcp->cur + 1;
+		local_irq_enable();
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
 			/* and start it/schedule start if it's a new batch */
@@ -462,15 +489,26 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
-	/* This cpu has pending rcu entries and the grace period
-	 * for them has completed.
-	 */
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
-		return 1;
+	if (rdp->nxtlist) {
+		/*
+		 * This cpu has pending rcu entries and the grace period
+		 * for them has completed.
+		 */
+		if (!rcu_batch_before(rcp->completed, rdp->batch))
+			return 1;
+		if (!rcu_batch_before(rcp->completed, rdp->batch - 1) &&
+				rdp->nxttail[0] != rdp->nxttail[1])
+			return 1;
+		if (rdp->nxttail[0] != &rdp->nxtlist)
+			return 1;
 
-	/* This cpu has no pending entries, but there are new entries */
-	if (!rdp->curlist && rdp->nxtlist)
-		return 1;
+		/*
+		 * This cpu has pending rcu entries and the new batch
+		 * for then hasn't been started nor scheduled start
+		 */
+		if (rcu_batch_after(rdp->batch, rcp->pending))
+			return 1;
+	}
 
 	/* This cpu has finished callbacks to invoke */
 	if (rdp->donelist)
@@ -506,7 +544,7 @@ int rcu_needs_cpu(int cpu)
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
 
-	return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
+	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
 }
 
 void rcu_check_callbacks(int cpu, int user)
@@ -553,8 +591,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
 	memset(rdp, 0, sizeof(*rdp));
-	rdp->curtail = &rdp->curlist;
-	rdp->nxttail = &rdp->nxtlist;
+	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
 	rdp->donetail = &rdp->donelist;
 	rdp->quiescbatch = rcp->completed;
 	rdp->qs_pending = 0;
-- 
cgit v1.2.3


From b8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 18 Jul 2008 17:27:28 +0200
Subject: nohz: prevent tick stop outside of the idle loop

Jack Ren and Eric Miao tracked down the following long standing
problem in the NOHZ code:

	scheduler switch to idle task
	enable interrupts

Window starts here

	----> interrupt happens (does not set NEED_RESCHED)
	      	irq_exit() stops the tick

	----> interrupt happens (does set NEED_RESCHED)

	return from schedule()

	cpu_idle(): preempt_disable();

Window ends here

The interrupts can happen at any point inside the race window. The
first interrupt stops the tick, the second one causes the scheduler to
rerun and switch away from idle again and we end up with the tick
disabled.

The fact that it needs two interrupts where the first one does not set
NEED_RESCHED and the second one does made the bug obscure and extremly
hard to reproduce and analyse. Kudos to Jack and Eric.

Solution: Limit the NOHZ functionality to the idle loop to make sure
that we can not run into such a situation ever again.

cpu_idle()
{
	preempt_disable();

	while(1) {
		 tick_nohz_stop_sched_tick(1); <- tell NOHZ code that we
		 			          are in the idle loop

		 while (!need_resched())
		       halt();

		 tick_nohz_restart_sched_tick(); <- disables NOHZ mode
		 preempt_enable_no_resched();
		 schedule();
		 preempt_disable();
	}
}

In hindsight we should have done this forever, but ...

/me grabs a large brown paperbag.

Debugged-by: Jack Ren <jack.ren@marvell.com>,
Debugged-by: eric miao <eric.y.miao@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/softirq.c         |  2 +-
 kernel/time/tick-sched.c | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..05f248039d77 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -312,7 +312,7 @@ void irq_exit(void)
 #ifdef CONFIG_NO_HZ
 	/* Make sure that timer wheel updates are propagated */
 	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(0);
 	rcu_irq_exit();
 #endif
 	preempt_enable_no_resched();
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 86baa4f0dfe4..ee962d11107b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -195,7 +195,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
  * Called either from the idle loop or from irq_exit() when an idle period was
  * just interrupted by an interrupt which did not cause a reschedule.
  */
-void tick_nohz_stop_sched_tick(void)
+void tick_nohz_stop_sched_tick(int inidle)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
 	struct tick_sched *ts;
@@ -224,6 +224,11 @@ void tick_nohz_stop_sched_tick(void)
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 		goto end;
 
+	if (!inidle && !ts->inidle)
+		goto end;
+
+	ts->inidle = 1;
+
 	if (need_resched())
 		goto end;
 
@@ -372,11 +377,14 @@ void tick_nohz_restart_sched_tick(void)
 	local_irq_disable();
 	tick_nohz_stop_idle(cpu);
 
-	if (!ts->tick_stopped) {
+	if (!ts->inidle || !ts->tick_stopped) {
+		ts->inidle = 0;
 		local_irq_enable();
 		return;
 	}
 
+	ts->inidle = 0;
+
 	rcu_exit_nohz();
 
 	/* Update jiffies first */
-- 
cgit v1.2.3


From 4dca10a96041f78bed11ce9e4a5cfde813ec4ccb Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Mon, 7 Jul 2008 18:37:04 -0700
Subject: softlockup: fix invalid proc_handler for softlockup_panic

The type of softlockup_panic is int, but the proc_handler is
proc_doulongvec_minmax(). This handler is for unsigned long.

This handler should be proc_dointvec_minmax().

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3e56d2f91414..ab59ac008caf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -741,7 +741,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &softlockup_panic,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= &proc_dointvec_minmax,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one,
-- 
cgit v1.2.3


From 8df185a95c9b84fc0c3c02224e64fdc5b83bae34 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 8 Jul 2008 15:55:48 -0700
Subject: kthread: reduce stack pressure in create_kthread and kthreadd

  * Replace:

  	set_cpus_allowed(..., CPU_MASK_ALL)

    with:

  	set_cpus_allowed_ptr(..., CPU_MASK_ALL_PTR)

    to remove excessive stack requirements when NR_CPUS=4096.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kthread.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index ac3fb7326641..6111c27491b1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -106,7 +106,7 @@ static void create_kthread(struct kthread_create_info *create)
 		 */
 		sched_setscheduler(create->result, SCHED_NORMAL, &param);
 		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-		set_cpus_allowed(create->result, CPU_MASK_ALL);
+		set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
 	}
 	complete(&create->done);
 }
@@ -233,7 +233,7 @@ int kthreadd(void *unused)
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
-	set_cpus_allowed(tsk, CPU_MASK_ALL);
+	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
 
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
-- 
cgit v1.2.3


From 1e01cb0c6ff7e9ddb6547551794c6aa82785a7cb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 15 Jul 2008 09:53:37 -0400
Subject: ftrace: only trace preempt off with preempt tracer

When PREEMPT_TRACER and IRQSOFF_TRACER are both configured and irqsoff
tracer is running, the preempt_off sections might also be traced.

Thanks to Andrew Morton for pointing out my mistake of spin_lock disabling
interrupts while he was reviewing ftrace.txt. Seems that my example I used
actually hit this bug.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_irqsoff.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 421d6fe3650e..b1e4a89b08eb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -337,12 +337,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
 #ifdef CONFIG_PREEMPT_TRACER
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
-	stop_critical_timing(a0, a1);
+	if (preempt_trace())
+		stop_critical_timing(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
-	start_critical_timing(a0, a1);
+	if (preempt_trace())
+		start_critical_timing(a0, a1);
 }
 #endif /* CONFIG_PREEMPT_TRACER */
 
-- 
cgit v1.2.3


From 538c29d43ebdac2edcef96ac07982d2296a63077 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Fri, 18 Jul 2008 13:29:57 +0400
Subject: Generic dma-coherent: fix DMA_MEMORY_EXCLUSIVE

Don't rewrite successfull allocation return values
in case the memory was marked with DMA_MEMORY_EXCLUSIVE.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/dma-coherent.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 89a554cfd936..56dff5cb0f29 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -105,8 +105,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 			*dma_handle = mem->device_base + (page << PAGE_SHIFT);
 			*ret = mem->virt_base + (page << PAGE_SHIFT);
 			memset(*ret, 0, size);
-		}
-		if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+		} else if (mem->flags & DMA_MEMORY_EXCLUSIVE)
 			*ret = NULL;
 	}
 	return (mem != NULL);
-- 
cgit v1.2.3


From 65c011845316d3c1381f478ca0d8265c43b3b039 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 15 Jul 2008 14:14:30 -0700
Subject: cpumask: Replace cpumask_of_cpu with cpumask_of_cpu_ptr

  * This patch replaces the dangerous lvalue version of cpumask_of_cpu
    with new cpumask_of_cpu_ptr macros.  These are patterned after the
    node_to_cpumask_ptr macros.

    In general terms, if there is a cpumask_of_cpu_map[] then a pointer to
    the cpumask_of_cpu_map[cpu] entry is used.  The cpumask_of_cpu_map
    is provided when there is a large NR_CPUS count, reducing
    greatly the amount of code generated and stack space used for
    cpumask_of_cpu().  The pointer to the cpumask_t value is needed for
    calling set_cpus_allowed_ptr() to reduce the amount of stack space
    needed to pass the cpumask_t value.

    If there isn't a cpumask_of_cpu_map[], then a temporary variable is
    declared and filled in with value from cpumask_of_cpu(cpu) as well as
    a pointer variable pointing to this temporary variable.  Afterwards,
    the pointer is used to reference the cpumask value.  The compiler
    will optimize out the extra dereference through the pointer as well
    as the stack space used for the pointer, resulting in identical code.

    A good example of the orthogonal usages is in net/sunrpc/svc.c:

	case SVC_POOL_PERCPU:
	{
		unsigned int cpu = m->pool_to[pidx];
		cpumask_of_cpu_ptr(cpumask, cpu);

		*oldmask = current->cpus_allowed;
		set_cpus_allowed_ptr(current, cpumask);
		return 1;
	}
	case SVC_POOL_PERNODE:
	{
		unsigned int node = m->pool_to[pidx];
		node_to_cpumask_ptr(nodecpumask, node);

		*oldmask = current->cpus_allowed;
		set_cpus_allowed_ptr(current, nodecpumask);
		return 1;
	}

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/stop_machine.c        | 3 ++-
 kernel/trace/trace_sysprof.c | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba9b2054ecbd..738b411ff2d3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -33,8 +33,9 @@ static int stopmachine(void *cpu)
 {
 	int irqs_disabled = 0;
 	int prepared = 0;
+	cpumask_of_cpu_ptr(cpumask, (int)(long)cpu);
 
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
+	set_cpus_allowed_ptr(current, cpumask);
 
 	/* Ack: we are alive */
 	smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 2301e1e7c606..63528086337c 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -213,7 +213,9 @@ static void start_stack_timers(void)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+		cpumask_of_cpu_ptr(new_mask, cpu);
+
+		set_cpus_allowed_ptr(current, new_mask);
 		start_stack_timer(cpu);
 	}
 	set_cpus_allowed_ptr(current, &saved_mask);
-- 
cgit v1.2.3


From c18a41fbbc500ac0307ffd2b0ae73c2af9d0b0ab Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Tue, 15 Jul 2008 14:14:34 -0700
Subject: cpumask: Optimize cpumask_of_cpu in kernel/time/tick-common.c

  * Optimize various places where a pointer to the cpumask_of_cpu value
    will result in reducing stack pressure.

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-common.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 4f3886562b8c..bf43284d6855 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -135,7 +135,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
  */
 static void tick_setup_device(struct tick_device *td,
 			      struct clock_event_device *newdev, int cpu,
-			      cpumask_t cpumask)
+			      const cpumask_t *cpumask)
 {
 	ktime_t next_event;
 	void (*handler)(struct clock_event_device *) = NULL;
@@ -169,8 +169,8 @@ static void tick_setup_device(struct tick_device *td,
 	 * When the device is not per cpu, pin the interrupt to the
 	 * current cpu:
 	 */
-	if (!cpus_equal(newdev->cpumask, cpumask))
-		irq_set_affinity(newdev->irq, cpumask);
+	if (!cpus_equal(newdev->cpumask, *cpumask))
+		irq_set_affinity(newdev->irq, *cpumask);
 
 	/*
 	 * When global broadcasting is active, check if the current
@@ -196,20 +196,20 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	struct tick_device *td;
 	int cpu, ret = NOTIFY_OK;
 	unsigned long flags;
-	cpumask_t cpumask;
+	cpumask_of_cpu_ptr_declare(cpumask);
 
 	spin_lock_irqsave(&tick_device_lock, flags);
 
 	cpu = smp_processor_id();
+	cpumask_of_cpu_ptr_next(cpumask, cpu);
 	if (!cpu_isset(cpu, newdev->cpumask))
 		goto out_bc;
 
 	td = &per_cpu(tick_cpu_device, cpu);
 	curdev = td->evtdev;
-	cpumask = cpumask_of_cpu(cpu);
 
 	/* cpu local device ? */
-	if (!cpus_equal(newdev->cpumask, cpumask)) {
+	if (!cpus_equal(newdev->cpumask, *cpumask)) {
 
 		/*
 		 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		 * If we have a cpu local device already, do not replace it
 		 * by a non cpu local device
 		 */
-		if (curdev && cpus_equal(curdev->cpumask, cpumask))
+		if (curdev && cpus_equal(curdev->cpumask, *cpumask))
 			goto out_bc;
 	}
 
-- 
cgit v1.2.3


From 31656519e132f6612584815f128c83976a9aaaef Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 18 Jul 2008 18:01:23 +0200
Subject: sched, x86: clean up hrtick implementation

random uvesafb failures were reported against Gentoo:

  http://bugs.gentoo.org/show_bug.cgi?id=222799

and Mihai Moldovan bisected it back to:

> 8f4d37ec073c17e2d4aa8851df5837d798606d6f is first bad commit
> commit 8f4d37ec073c17e2d4aa8851df5837d798606d6f
> Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date:   Fri Jan 25 21:08:29 2008 +0100
>
>    sched: high-res preemption tick

Linus suspected it to be hrtick + vm86 interaction and observed:

> Btw, Peter, Ingo: I think that commit is doing bad things. They aren't
> _incorrect_ per se, but they are definitely bad.
>
> Why?
>
> Using random _TIF_WORK_MASK flags is really impolite for doing
> "scheduling" work. There's a reason that arch/x86/kernel/entry_32.S
> special-cases the _TIF_NEED_RESCHED flag: we don't want to exit out of
> vm86 mode unnecessarily.
>
> See the "work_notifysig_v86" label, and how it does that
> "save_v86_state()" thing etc etc.

Right, I never liked having to fiddle with those TIF flags. Initially I
needed it because the hrtimer base lock could not nest in the rq lock.
That however is fixed these days.

Currently the only reason left to fiddle with the TIF flags is remote
wakeups. We cannot program a remote cpu's hrtimer. I've been thinking
about using the new and improved IPI function call stuff to implement
hrtimer_start_on().

However that does require that smp_call_function_single(.wait=0) works
from interrupt context - /me looks at the latest series from Jens - Yes
that does seem to be supported, good.

Here's a stab at cleaning this stuff up ...

Mihai reported test success as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Mihai Moldovan <ionic@ionic.de>
Cc: Michal Januszewski <spock@gentoo.org>
Cc: Antonino Daplas <adaplas@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.hz   |   2 +-
 kernel/sched.c      | 202 ++++++++++++++++------------------------------------
 kernel/sched_fair.c |   5 +-
 3 files changed, 63 insertions(+), 146 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 526128a2e622..2a202a846757 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && X86
+	def_bool HIGH_RES_TIMERS
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ee18dbb4516..c13c75e9f9f7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -571,8 +571,10 @@ struct rq {
 #endif
 
 #ifdef CONFIG_SCHED_HRTICK
-	unsigned long hrtick_flags;
-	ktime_t hrtick_expire;
+#ifdef CONFIG_SMP
+	int hrtick_csd_pending;
+	struct call_single_data hrtick_csd;
+#endif
 	struct hrtimer hrtick_timer;
 #endif
 
@@ -983,13 +985,6 @@ static struct rq *this_rq_lock(void)
 	return rq;
 }
 
-static void __resched_task(struct task_struct *p, int tif_bit);
-
-static inline void resched_task(struct task_struct *p)
-{
-	__resched_task(p, TIF_NEED_RESCHED);
-}
-
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
@@ -1001,25 +996,6 @@ static inline void resched_task(struct task_struct *p)
  * When we get rescheduled we reprogram the hrtick_timer outside of the
  * rq->lock.
  */
-static inline void resched_hrt(struct task_struct *p)
-{
-	__resched_task(p, TIF_HRTICK_RESCHED);
-}
-
-static inline void resched_rq(struct rq *rq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	resched_task(rq->curr);
-	spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-enum {
-	HRTICK_SET,		/* re-programm hrtick_timer */
-	HRTICK_RESET,		/* not a new slice */
-	HRTICK_BLOCK,		/* stop hrtick operations */
-};
 
 /*
  * Use hrtick when:
@@ -1030,72 +1006,17 @@ static inline int hrtick_enabled(struct rq *rq)
 {
 	if (!sched_feat(HRTICK))
 		return 0;
-	if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
+	if (!cpu_online(cpu_of(rq)))
 		return 0;
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
 
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay, int reset)
-{
-	assert_spin_locked(&rq->lock);
-
-	/*
-	 * preempt at: now + delay
-	 */
-	rq->hrtick_expire =
-		ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
-	/*
-	 * indicate we need to program the timer
-	 */
-	__set_bit(HRTICK_SET, &rq->hrtick_flags);
-	if (reset)
-		__set_bit(HRTICK_RESET, &rq->hrtick_flags);
-
-	/*
-	 * New slices are called from the schedule path and don't need a
-	 * forced reschedule.
-	 */
-	if (reset)
-		resched_hrt(rq->curr);
-}
-
 static void hrtick_clear(struct rq *rq)
 {
 	if (hrtimer_active(&rq->hrtick_timer))
 		hrtimer_cancel(&rq->hrtick_timer);
 }
 
-/*
- * Update the timer from the possible pending state.
- */
-static void hrtick_set(struct rq *rq)
-{
-	ktime_t time;
-	int set, reset;
-	unsigned long flags;
-
-	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-
-	spin_lock_irqsave(&rq->lock, flags);
-	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
-	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
-	time = rq->hrtick_expire;
-	clear_thread_flag(TIF_HRTICK_RESCHED);
-	spin_unlock_irqrestore(&rq->lock, flags);
-
-	if (set) {
-		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
-		if (reset && !hrtimer_active(&rq->hrtick_timer))
-			resched_rq(rq);
-	} else
-		hrtick_clear(rq);
-}
-
 /*
  * High-resolution timer tick.
  * Runs from hardirq context with interrupts disabled.
@@ -1115,27 +1036,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 }
 
 #ifdef CONFIG_SMP
-static void hotplug_hrtick_disable(int cpu)
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	rq->hrtick_flags = 0;
-	__set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	struct rq *rq = arg;
 
-	hrtick_clear(rq);
+	spin_lock(&rq->lock);
+	hrtimer_restart(&rq->hrtick_timer);
+	rq->hrtick_csd_pending = 0;
+	spin_unlock(&rq->lock);
 }
 
-static void hotplug_hrtick_enable(int cpu)
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
+	struct hrtimer *timer = &rq->hrtick_timer;
+	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-	spin_lock_irqsave(&rq->lock, flags);
-	__clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	timer->expires = time;
+
+	if (rq == this_rq()) {
+		hrtimer_restart(timer);
+	} else if (!rq->hrtick_csd_pending) {
+		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+		rq->hrtick_csd_pending = 1;
+	}
 }
 
 static int
@@ -1150,16 +1081,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		hotplug_hrtick_disable(cpu);
-		return NOTIFY_OK;
-
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		hotplug_hrtick_enable(cpu);
+		hrtick_clear(cpu_rq(cpu));
 		return NOTIFY_OK;
 	}
 
@@ -1170,46 +1092,45 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
-#endif /* CONFIG_SMP */
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay)
+{
+	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+}
 
-static void init_rq_hrtick(struct rq *rq)
+static void init_hrtick(void)
 {
-	rq->hrtick_flags = 0;
-	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
+#endif /* CONFIG_SMP */
 
-void hrtick_resched(void)
+static void init_rq_hrtick(struct rq *rq)
 {
-	struct rq *rq;
-	unsigned long flags;
+#ifdef CONFIG_SMP
+	rq->hrtick_csd_pending = 0;
 
-	if (!test_thread_flag(TIF_HRTICK_RESCHED))
-		return;
+	rq->hrtick_csd.flags = 0;
+	rq->hrtick_csd.func = __hrtick_start;
+	rq->hrtick_csd.info = rq;
+#endif
 
-	local_irq_save(flags);
-	rq = cpu_rq(smp_processor_id());
-	hrtick_set(rq);
-	local_irq_restore(flags);
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
 {
 }
 
-static inline void hrtick_set(struct rq *rq)
-{
-}
-
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
 
-void hrtick_resched(void)
-{
-}
-
 static inline void init_hrtick(void)
 {
 }
@@ -1228,16 +1149,16 @@ static inline void init_hrtick(void)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
 	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 		return;
 
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -1303,10 +1224,10 @@ void wake_up_idle_cpu(int cpu)
 #endif /* CONFIG_NO_HZ */
 
 #else /* !CONFIG_SMP */
-static void __resched_task(struct task_struct *p, int tif_bit)
+static void resched_task(struct task_struct *p)
 {
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_thread_flag(p, tif_bit);
+	set_tsk_need_resched(p);
 }
 #endif /* CONFIG_SMP */
 
@@ -4395,7 +4316,7 @@ asmlinkage void __sched schedule(void)
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
-	int cpu, hrtick = sched_feat(HRTICK);
+	int cpu;
 
 need_resched:
 	preempt_disable();
@@ -4410,7 +4331,7 @@ need_resched_nonpreemptible:
 
 	schedule_debug(prev);
 
-	if (hrtick)
+	if (sched_feat(HRTICK))
 		hrtick_clear(rq);
 
 	/*
@@ -4457,9 +4378,6 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);
 
-	if (hrtick)
-		hrtick_set(rq);
-
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d6..6893b3ed65fe 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -878,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 #ifdef CONFIG_SCHED_HRTICK
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
-	int requeue = rq->curr == p;
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
@@ -899,10 +898,10 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		 * Don't schedule slices shorter than 10000ns, that just
 		 * doesn't make sense. Rely on vruntime for fairness.
 		 */
-		if (!requeue)
+		if (rq->curr != p)
 			delta = max(10000LL, delta);
 
-		hrtick_start(rq, delta, requeue);
+		hrtick_start(rq, delta);
 	}
 }
 #else /* !CONFIG_SCHED_HRTICK */
-- 
cgit v1.2.3


From ba42059fbd0aa1ac91b582412b5fedb1258f241f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 20 Jul 2008 11:02:06 +0200
Subject: sched: hrtick_enabled() should use cpu_active()

Peter pointed out that hrtick_enabled() should use cpu_active().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 85cf246cfdf5..62b1b8ecb5c7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1006,7 +1006,7 @@ static inline int hrtick_enabled(struct rq *rq)
 {
 	if (!sched_feat(HRTICK))
 		return 0;
-	if (!cpu_online(cpu_of(rq)))
+	if (!cpu_active(cpu_of(rq)))
 		return 0;
 	return hrtimer_is_hres_active(&rq->hrtick_timer);
 }
-- 
cgit v1.2.3


From b6d4f7e3ef25beb8c658c97867d98883e69dc544 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Sun, 20 Jul 2008 15:01:10 +0400
Subject: dma-coherent: add documentation to new interfaces

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/dma-coherent.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 56dff5cb0f29..7517115a8cce 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -92,6 +92,21 @@ void *dma_mark_declared_memory_occupied(struct device *dev,
 }
 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
 
+/**
+ * Try to allocate memory from the per-device coherent area.
+ *
+ * @dev:	device from which we allocate memory
+ * @size:	size of requested memory area
+ * @dma_handle:	This will be filled with the correct dma handle
+ * @ret:	This pointer will be filled with the virtual address
+ * 		to allocated area.
+ *
+ * This function should be only called from per-arch %dma_alloc_coherent()
+ * to support allocation from per-device coherent memory pools.
+ *
+ * Returns 0 if dma_alloc_coherent should continue with allocating from
+ * generic memory areas, or !0 if dma_alloc_coherent should return %ret.
+ */
 int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 				       dma_addr_t *dma_handle, void **ret)
 {
@@ -111,6 +126,19 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 	return (mem != NULL);
 }
 
+/**
+ * Try to free the memory allocated from per-device coherent memory pool.
+ * @dev:	device from which the memory was allocated
+ * @order:	the order of pages allocated
+ * @vaddr:	virtual address of allocated pages
+ *
+ * This checks whether the memory was allocated from the per-device
+ * coherent memory pool and if so, releases that memory.
+ *
+ * Returns 1 if we correctly released the memory, or 0 if
+ * %dma_release_coherent() should proceed with releasing memory from
+ * generic pools.
+ */
 int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
 {
 	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-- 
cgit v1.2.3


From 4a0b2b4dbe1335b8b9886ba3dc85a145d5d938ed Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 1 Jul 2008 18:48:41 +0200
Subject: sysdev: Pass the attribute to the low level sysdev show/store
 function

This allow to dynamically generate attributes and share show/store
functions between attributes. Right now most attributes are generated
by special macros and lots of duplicated code. With the attribute
passed it's instead possible to attach some data to the attribute
and then use that in shared low level functions to do different things.

I need this for the dynamically generated bank attributes in the x86
machine check code, but it'll allow some further cleanups.

I converted all users in tree to the new show/store prototype. It's a single
huge patch to avoid unbisectable sections.

Runtime tested: x86-32, x86-64
Compiled only: ia64, powerpc
Not compile tested/only grep converted: sh, arm, avr32

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/rtmutex-tester.c   | 7 ++++---
 kernel/sched.c            | 8 ++++++--
 kernel/time/clocksource.c | 8 ++++++--
 3 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 092e4c620af9..a56f629b057a 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -297,8 +297,8 @@ static int test_func(void *data)
  *
  * opcode:data
  */
-static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
-				  size_t count)
+static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
+				  const char *buf, size_t count)
 {
 	struct sched_param schedpar;
 	struct test_thread_data *td;
@@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
  * @dev:	thread to query
  * @buf:	char buffer to be filled with thread status info
  */
-static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
+static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
+				 char *buf)
 {
 	struct test_thread_data *td;
 	struct task_struct *tsk;
diff --git a/kernel/sched.c b/kernel/sched.c
index 99e6d850ecab..b1104ea5d255 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7737,11 +7737,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *page)
 {
 	return sprintf(page, "%u\n", sched_mc_power_savings);
 }
 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
+					    struct sysdev_attribute *attr,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
@@ -7751,11 +7753,13 @@ static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *page)
 {
 	return sprintf(page, "%u\n", sched_smt_power_savings);
 }
 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
+					     struct sysdev_attribute *attr,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index dadde5361f32..b1c2da81b050 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs)
  * Provides sysfs interface for listing current clocksource.
  */
 static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_current_clocksources(struct sys_device *dev,
+				struct sysdev_attribute *attr, char *buf)
 {
 	ssize_t count = 0;
 
@@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
  * clocksource selction.
  */
 static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+					  struct sysdev_attribute *attr,
 					  const char *buf, size_t count)
 {
 	struct clocksource *ovr = NULL;
@@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
  * Provides sysfs interface for listing registered clocksources
  */
 static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+sysfs_show_available_clocksources(struct sys_device *dev,
+				  struct sysdev_attribute *attr,
+				  char *buf)
 {
 	struct clocksource *src;
 	ssize_t count = 0;
-- 
cgit v1.2.3


From da39ba5e1d65e997a98f6eb93ba6e6eb505f6e3c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 22 Jul 2008 19:24:25 -0500
Subject: module: don't use stop_machine for waiting rmmod

rmmod has a little-used "-w" option, meaning that instead of failing if the
module is in use, it should block until the module becomes unused.

In this case, we don't need to use stop_machine: Max Krasnyansky
indicated that would be useful for SystemTap which loads/unloads new
modules frequently.

Cc: Max Krasnyansky <maxk@qualcomm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5f80478b746d..705e1d5d516c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -639,8 +639,8 @@ static int __try_stop_module(void *_sref)
 {
 	struct stopref *sref = _sref;
 
-	/* If it's not unused, quit unless we are told to block. */
-	if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) {
+	/* If it's not unused, quit unless we're forcing. */
+	if (module_refcount(sref->mod) != 0) {
 		if (!(*sref->forced = try_force_unload(sref->flags)))
 			return -EWOULDBLOCK;
 	}
@@ -652,9 +652,16 @@ static int __try_stop_module(void *_sref)
 
 static int try_stop_module(struct module *mod, int flags, int *forced)
 {
-	struct stopref sref = { mod, flags, forced };
+	if (flags & O_NONBLOCK) {
+		struct stopref sref = { mod, flags, forced };
 
-	return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
+		return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
+	} else {
+		/* We don't need to stop the machine for this. */
+		mod->state = MODULE_STATE_GOING;
+		synchronize_sched();
+		return 0;
+	}
 }
 
 unsigned int module_refcount(struct module *mod)
-- 
cgit v1.2.3


From dafd0940c96fec67974a88ed8e6b8ba3160394cd Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 22 Jul 2008 19:24:25 -0500
Subject: module: generic each_symbol iterator function

Introduce an each_symbol() iterator to avoid duplicating the knowledge
about the 5 different sections containing symbols.  Currently only
used by find_symbol(), but will be used by symbol_put_addr() too.

(Includes NULL ptr deref fix by Jiri Kosina <jkosina@suse.cz>)

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jiri Kosina <jkosina@suse.cz>
---
 kernel/module.c | 244 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 134 insertions(+), 110 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 705e1d5d516c..c51c089c666e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -152,156 +152,180 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
 
-/* lookup symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_symbol(const char *name,
-	const struct kernel_symbol *start,
-	const struct kernel_symbol *stop)
-{
-	const struct kernel_symbol *ks = start;
-	for (; ks < stop; ks++)
-		if (strcmp(ks->name, name) == 0)
-			return ks;
-	return NULL;
-}
-
-static bool always_ok(bool gplok, bool warn, const char *name)
-{
-	return true;
-}
-
-static bool printk_unused_warning(bool gplok, bool warn, const char *name)
-{
-	if (warn) {
-		printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
-		       "however this module is using it.\n", name);
-		printk(KERN_WARNING
-		       "This symbol will go away in the future.\n");
-		printk(KERN_WARNING
-		       "Please evalute if this is the right api to use and if "
-		       "it really is, submit a report the linux kernel "
-		       "mailinglist together with submitting your code for "
-		       "inclusion.\n");
-	}
-	return true;
-}
-
-static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
-{
-	if (!gplok)
-		return false;
-	return printk_unused_warning(gplok, warn, name);
-}
-
-static bool gpl_only(bool gplok, bool warn, const char *name)
-{
-	return gplok;
-}
-
-static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
-{
-	if (!gplok && warn) {
-		printk(KERN_WARNING "Symbol %s is being used "
-		       "by a non-GPL module, which will not "
-		       "be allowed in the future\n", name);
-		printk(KERN_WARNING "Please see the file "
-		       "Documentation/feature-removal-schedule.txt "
-		       "in the kernel source tree for more details.\n");
-	}
-	return true;
-}
-
 struct symsearch {
 	const struct kernel_symbol *start, *stop;
 	const unsigned long *crcs;
-	bool (*check)(bool gplok, bool warn, const char *name);
+	enum {
+		NOT_GPL_ONLY,
+		GPL_ONLY,
+		WILL_BE_GPL_ONLY,
+	} licence;
+	bool unused;
 };
 
-/* Look through this array of symbol tables for a symbol match which
- * passes the check function. */
-static const struct kernel_symbol *search_symarrays(const struct symsearch *arr,
-						    unsigned int num,
-						    const char *name,
-						    bool gplok,
-						    bool warn,
-						    const unsigned long **crc)
+static bool each_symbol_in_section(const struct symsearch *arr,
+				   unsigned int arrsize,
+				   struct module *owner,
+				   bool (*fn)(const struct symsearch *syms,
+					      struct module *owner,
+					      unsigned int symnum, void *data),
+				   void *data)
 {
-	unsigned int i;
-	const struct kernel_symbol *ks;
-
-	for (i = 0; i < num; i++) {
-		ks = lookup_symbol(name, arr[i].start, arr[i].stop);
-		if (!ks || !arr[i].check(gplok, warn, name))
-			continue;
+	unsigned int i, j;
 
-		if (crc)
-			*crc = symversion(arr[i].crcs, ks - arr[i].start);
-		return ks;
+	for (j = 0; j < arrsize; j++) {
+		for (i = 0; i < arr[j].stop - arr[j].start; i++)
+			if (fn(&arr[j], owner, i, data))
+				return true;
 	}
-	return NULL;
+
+	return false;
 }
 
-/* Find a symbol, return value, (optional) crc and (optional) module
- * which owns it */
-static unsigned long find_symbol(const char *name,
-				 struct module **owner,
-				 const unsigned long **crc,
-				 bool gplok,
-				 bool warn)
+/* Returns true as soon as fn returns true, otherwise false. */
+static bool each_symbol(bool (*fn)(const struct symsearch *arr,
+				   struct module *owner,
+				   unsigned int symnum, void *data),
+			void *data)
 {
 	struct module *mod;
-	const struct kernel_symbol *ks;
 	const struct symsearch arr[] = {
 		{ __start___ksymtab, __stop___ksymtab, __start___kcrctab,
-		  always_ok },
+		  NOT_GPL_ONLY, false },
 		{ __start___ksymtab_gpl, __stop___ksymtab_gpl,
-		  __start___kcrctab_gpl, gpl_only },
+		  __start___kcrctab_gpl,
+		  GPL_ONLY, false },
 		{ __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
-		  __start___kcrctab_gpl_future, warn_if_not_gpl },
+		  __start___kcrctab_gpl_future,
+		  WILL_BE_GPL_ONLY, false },
 		{ __start___ksymtab_unused, __stop___ksymtab_unused,
-		  __start___kcrctab_unused, printk_unused_warning },
+		  __start___kcrctab_unused,
+		  NOT_GPL_ONLY, true },
 		{ __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
-		  __start___kcrctab_unused_gpl, gpl_only_unused_warning },
+		  __start___kcrctab_unused_gpl,
+		  GPL_ONLY, true },
 	};
 
-	/* Core kernel first. */
-	ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc);
-	if (ks) {
-		if (owner)
-			*owner = NULL;
-		return ks->value;
-	}
+	if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
+		return true;
 
-	/* Now try modules. */
 	list_for_each_entry(mod, &modules, list) {
 		struct symsearch arr[] = {
 			{ mod->syms, mod->syms + mod->num_syms, mod->crcs,
-			  always_ok },
+			  NOT_GPL_ONLY, false },
 			{ mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
-			  mod->gpl_crcs, gpl_only },
+			  mod->gpl_crcs,
+			  GPL_ONLY, false },
 			{ mod->gpl_future_syms,
 			  mod->gpl_future_syms + mod->num_gpl_future_syms,
-			  mod->gpl_future_crcs, warn_if_not_gpl },
+			  mod->gpl_future_crcs,
+			  WILL_BE_GPL_ONLY, false },
 			{ mod->unused_syms,
 			  mod->unused_syms + mod->num_unused_syms,
-			  mod->unused_crcs, printk_unused_warning },
+			  mod->unused_crcs,
+			  NOT_GPL_ONLY, true },
 			{ mod->unused_gpl_syms,
 			  mod->unused_gpl_syms + mod->num_unused_gpl_syms,
-			  mod->unused_gpl_crcs, gpl_only_unused_warning },
+			  mod->unused_gpl_crcs,
+			  GPL_ONLY, true },
 		};
 
-		ks = search_symarrays(arr, ARRAY_SIZE(arr),
-				      name, gplok, warn, crc);
-		if (ks) {
-			if (owner)
-				*owner = mod;
-			return ks->value;
+		if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
+			return true;
+	}
+	return false;
+}
+
+struct find_symbol_arg {
+	/* Input */
+	const char *name;
+	bool gplok;
+	bool warn;
+
+	/* Output */
+	struct module *owner;
+	const unsigned long *crc;
+	unsigned long value;
+};
+
+static bool find_symbol_in_section(const struct symsearch *syms,
+				   struct module *owner,
+				   unsigned int symnum, void *data)
+{
+	struct find_symbol_arg *fsa = data;
+
+	if (strcmp(syms->start[symnum].name, fsa->name) != 0)
+		return false;
+
+	if (!fsa->gplok) {
+		if (syms->licence == GPL_ONLY)
+			return false;
+		if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
+			printk(KERN_WARNING "Symbol %s is being used "
+			       "by a non-GPL module, which will not "
+			       "be allowed in the future\n", fsa->name);
+			printk(KERN_WARNING "Please see the file "
+			       "Documentation/feature-removal-schedule.txt "
+			       "in the kernel source tree for more details.\n");
 		}
 	}
 
+	if (syms->unused && fsa->warn) {
+		printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
+		       "however this module is using it.\n", fsa->name);
+		printk(KERN_WARNING
+		       "This symbol will go away in the future.\n");
+		printk(KERN_WARNING
+		       "Please evalute if this is the right api to use and if "
+		       "it really is, submit a report the linux kernel "
+		       "mailinglist together with submitting your code for "
+		       "inclusion.\n");
+	}
+
+	fsa->owner = owner;
+	fsa->crc = symversion(syms->crcs, symnum);
+	fsa->value = syms->start[symnum].value;
+	return true;
+}
+
+/* Find a symbol, return value, (optional) crc and (optional) module
+ * which owns it */
+static unsigned long find_symbol(const char *name,
+				 struct module **owner,
+				 const unsigned long **crc,
+				 bool gplok,
+				 bool warn)
+{
+	struct find_symbol_arg fsa;
+
+	fsa.name = name;
+	fsa.gplok = gplok;
+	fsa.warn = warn;
+
+	if (each_symbol(find_symbol_in_section, &fsa)) {
+		if (owner)
+			*owner = fsa.owner;
+		if (crc)
+			*crc = fsa.crc;
+		return fsa.value;
+	}
+
 	DEBUGP("Failed to find symbol %s\n", name);
 	return -ENOENT;
 }
 
+/* lookup symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+	const struct kernel_symbol *start,
+	const struct kernel_symbol *stop)
+{
+	const struct kernel_symbol *ks = start;
+	for (; ks < stop; ks++)
+		if (strcmp(ks->name, name) == 0)
+			return ks;
+	return NULL;
+}
+
 /* Search for module by name: must hold module_mutex. */
 static struct module *find_module(const char *name)
 {
-- 
cgit v1.2.3


From f7f5b67557eac1131ba6532522e3c50eced34238 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 22 Jul 2008 19:24:26 -0500
Subject: Shrink struct module: CONFIG_UNUSED_SYMBOLS ifdefs

module.c and module.h conatains code for finding
exported symbols which are declared with EXPORT_UNUSED_SYMBOL,
and this code is compiled in even if CONFIG_UNUSED_SYMBOLS is not set
and thus there can be no EXPORT_UNUSED_SYMBOLs in modules anyway
(because EXPORT_UNUSED_SYMBOL(x) are compiled out to nothing then).

This patch adds required #ifdefs.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 49 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index c51c089c666e..ea9580521eb1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -134,17 +134,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
 extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const struct kernel_symbol __start___ksymtab_gpl_future[];
 extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
-extern const struct kernel_symbol __start___ksymtab_unused[];
-extern const struct kernel_symbol __stop___ksymtab_unused[];
-extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
-extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
 extern const struct kernel_symbol __start___ksymtab_gpl_future[];
 extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
 extern const unsigned long __start___kcrctab[];
 extern const unsigned long __start___kcrctab_gpl[];
 extern const unsigned long __start___kcrctab_gpl_future[];
+#ifdef CONFIG_UNUSED_SYMBOLS
+extern const struct kernel_symbol __start___ksymtab_unused[];
+extern const struct kernel_symbol __stop___ksymtab_unused[];
+extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
+extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
 extern const unsigned long __start___kcrctab_unused[];
 extern const unsigned long __start___kcrctab_unused_gpl[];
+#endif
 
 #ifndef CONFIG_MODVERSIONS
 #define symversion(base, idx) NULL
@@ -198,12 +200,14 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
 		{ __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
 		  __start___kcrctab_gpl_future,
 		  WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
 		{ __start___ksymtab_unused, __stop___ksymtab_unused,
 		  __start___kcrctab_unused,
 		  NOT_GPL_ONLY, true },
 		{ __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
 		  __start___kcrctab_unused_gpl,
 		  GPL_ONLY, true },
+#endif
 	};
 
 	if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
@@ -220,6 +224,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
 			  mod->gpl_future_syms + mod->num_gpl_future_syms,
 			  mod->gpl_future_crcs,
 			  WILL_BE_GPL_ONLY, false },
+#ifdef CONFIG_UNUSED_SYMBOLS
 			{ mod->unused_syms,
 			  mod->unused_syms + mod->num_unused_syms,
 			  mod->unused_crcs,
@@ -228,6 +233,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
 			  mod->unused_gpl_syms + mod->num_unused_gpl_syms,
 			  mod->unused_gpl_crcs,
 			  GPL_ONLY, true },
+#endif
 		};
 
 		if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
@@ -270,6 +276,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
 		}
 	}
 
+#ifdef CONFIG_UNUSED_SYMBOLS
 	if (syms->unused && fsa->warn) {
 		printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
 		       "however this module is using it.\n", fsa->name);
@@ -281,6 +288,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
 		       "mailinglist together with submitting your code for "
 		       "inclusion.\n");
 	}
+#endif
 
 	fsa->owner = owner;
 	fsa->crc = symversion(syms->crcs, symnum);
@@ -1476,8 +1484,10 @@ static int verify_export_symbols(struct module *mod)
 		{ mod->syms, mod->num_syms },
 		{ mod->gpl_syms, mod->num_gpl_syms },
 		{ mod->gpl_future_syms, mod->num_gpl_future_syms },
+#ifdef CONFIG_UNUSED_SYMBOLS
 		{ mod->unused_syms, mod->num_unused_syms },
 		{ mod->unused_gpl_syms, mod->num_unused_gpl_syms },
+#endif
 	};
 
 	for (i = 0; i < ARRAY_SIZE(arr); i++) {
@@ -1795,10 +1805,12 @@ static struct module *load_module(void __user *umod,
 	unsigned int gplfutureindex;
 	unsigned int gplfuturecrcindex;
 	unsigned int unwindex = 0;
+#ifdef CONFIG_UNUSED_SYMBOLS
 	unsigned int unusedindex;
 	unsigned int unusedcrcindex;
 	unsigned int unusedgplindex;
 	unsigned int unusedgplcrcindex;
+#endif
 	unsigned int markersindex;
 	unsigned int markersstringsindex;
 	struct module *mod;
@@ -1881,13 +1893,15 @@ static struct module *load_module(void __user *umod,
 	exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
 	gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
 	gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
-	unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
-	unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
 	crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
 	gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
 	gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
+#ifdef CONFIG_UNUSED_SYMBOLS
+	unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
+	unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
 	unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
 	unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
+#endif
 	setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
 	exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
 	obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -2049,14 +2063,15 @@ static struct module *load_module(void __user *umod,
 		mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
 	mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
 					sizeof(*mod->gpl_future_syms);
-	mod->num_unused_syms = sechdrs[unusedindex].sh_size /
-					sizeof(*mod->unused_syms);
-	mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
-					sizeof(*mod->unused_gpl_syms);
 	mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
 	if (gplfuturecrcindex)
 		mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
 
+#ifdef CONFIG_UNUSED_SYMBOLS
+	mod->num_unused_syms = sechdrs[unusedindex].sh_size /
+					sizeof(*mod->unused_syms);
+	mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
+					sizeof(*mod->unused_gpl_syms);
 	mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
 	if (unusedcrcindex)
 		mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
@@ -2064,13 +2079,17 @@ static struct module *load_module(void __user *umod,
 	if (unusedgplcrcindex)
 		mod->unused_gpl_crcs
 			= (void *)sechdrs[unusedgplcrcindex].sh_addr;
+#endif
 
 #ifdef CONFIG_MODVERSIONS
-	if ((mod->num_syms && !crcindex) ||
-	    (mod->num_gpl_syms && !gplcrcindex) ||
-	    (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
-	    (mod->num_unused_syms && !unusedcrcindex) ||
-	    (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
+	if ((mod->num_syms && !crcindex)
+	    || (mod->num_gpl_syms && !gplcrcindex)
+	    || (mod->num_gpl_future_syms && !gplfuturecrcindex)
+#ifdef CONFIG_UNUSED_SYMBOLS
+	    || (mod->num_unused_syms && !unusedcrcindex)
+	    || (mod->num_unused_gpl_syms && !unusedgplcrcindex)
+#endif
+		) {
 		printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
 		err = try_to_force_load(mod, "nocrc");
 		if (err)
-- 
cgit v1.2.3


From 2f0f2a334bc38b61a9afca951185cd3844ee709d Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 22 Jul 2008 19:24:27 -0500
Subject: module: turn longs into ints for module sizes

This shrinks module.o and each *.ko file.

And finally, structure members which hold length of module
code (four such members there) and count of symbols
are converted from longs to ints.

We cannot possibly have a module where 32 bits won't
be enough to hold such counts.

For one, module loading checks module size for sanity
before loading, so such insanely big module will fail
that test first.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index ea9580521eb1..5c7eb0695b3c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1567,7 +1567,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
 }
 
 /* Update size with this section: return offset. */
-static long get_offset(unsigned long *size, Elf_Shdr *sechdr)
+static long get_offset(unsigned int *size, Elf_Shdr *sechdr)
 {
 	long ret;
 
@@ -2562,7 +2562,7 @@ static int m_show(struct seq_file *m, void *p)
 	struct module *mod = list_entry(p, struct module, list);
 	char buf[8];
 
-	seq_printf(m, "%s %lu",
+	seq_printf(m, "%s %u",
 		   mod->name, mod->init_size + mod->core_size);
 	print_unload_info(m, mod);
 
-- 
cgit v1.2.3


From 3a642e99babe0617febb6f402e1e063479f489db Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 22 Jul 2008 19:24:28 -0500
Subject: modules: Take a shortcut for checking if an address is in a module

This patch keeps track of the boundaries of module allocation, in
order to speed up module_text_address().

Inspired by Arjan's version, which required arch-specific defines:

	Various pieces of the kernel (lockdep, latencytop, etc) tend
	to store backtraces, sometimes at a relatively high
	frequency. In itself this isn't a big performance deal (after
	all you're using diagnostics features), but there have been
	some complaints from people who have over 100 modules loaded
	that this is a tad too slow.

	This is due to the new backtracer code which looks at every
	slot on the stack to see if it's a kernel/module text address,
	so that's 1024 slots.  1024 times 100 modules... that's a lot
	of list walking.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5c7eb0695b3c..d8b5605132a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -70,6 +70,9 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
 static BLOCKING_NOTIFIER_HEAD(module_notify_list);
 
+/* Bounds of module allocation, for speeding __module_text_address */
+static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+
 int register_module_notifier(struct notifier_block * nb)
 {
 	return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -1779,6 +1782,20 @@ static inline void add_kallsyms(struct module *mod,
 }
 #endif /* CONFIG_KALLSYMS */
 
+static void *module_alloc_update_bounds(unsigned long size)
+{
+	void *ret = module_alloc(size);
+
+	if (ret) {
+		/* Update module bounds. */
+		if ((unsigned long)ret < module_addr_min)
+			module_addr_min = (unsigned long)ret;
+		if ((unsigned long)ret + size > module_addr_max)
+			module_addr_max = (unsigned long)ret + size;
+	}
+	return ret;
+}
+
 /* Allocate and load the module: note that size of section 0 is always
    zero, and we rely on this for optional sections. */
 static struct module *load_module(void __user *umod,
@@ -1980,7 +1997,7 @@ static struct module *load_module(void __user *umod,
 	layout_sections(mod, hdr, sechdrs, secstrings);
 
 	/* Do the allocs. */
-	ptr = module_alloc(mod->core_size);
+	ptr = module_alloc_update_bounds(mod->core_size);
 	if (!ptr) {
 		err = -ENOMEM;
 		goto free_percpu;
@@ -1988,7 +2005,7 @@ static struct module *load_module(void __user *umod,
 	memset(ptr, 0, mod->core_size);
 	mod->module_core = ptr;
 
-	ptr = module_alloc(mod->init_size);
+	ptr = module_alloc_update_bounds(mod->init_size);
 	if (!ptr && mod->init_size) {
 		err = -ENOMEM;
 		goto free_core;
@@ -2645,6 +2662,9 @@ struct module *__module_text_address(unsigned long addr)
 {
 	struct module *mod;
 
+	if (addr < module_addr_min || addr > module_addr_max)
+		return NULL;
+
 	list_for_each_entry(mod, &modules, list)
 		if (within(addr, mod->module_init, mod->init_text_size)
 		    || within(addr, mod->module_core, mod->core_text_size))
-- 
cgit v1.2.3


From a1ef5adb4cad43460ebba23c5a78cf4a55bb6a5b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 8 Jul 2008 19:00:17 +0200
Subject: remove CONFIG_KMOD from core kernel code

Always compile request_module when the kernel allows modules.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/exec_domain.c | 2 +-
 kernel/kmod.c        | 2 +-
 kernel/sysctl.c      | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index a9e6bad9f706..c1ef192aa655 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -65,7 +65,7 @@ lookup_exec_domain(u_long personality)
 				goto out;
 	}
 
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 	read_unlock(&exec_domains_lock);
 	request_module("personality-%ld", pers);
 	read_lock(&exec_domains_lock);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8df97d3dfda8..90d7af1c1655 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -42,7 +42,7 @@ extern int max_threads;
 
 static struct workqueue_struct *khelper_wq;
 
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 
 /*
 	modprobe_path is set via /proc/sys.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6b16e16428d8..b859e6b5a767 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -110,7 +110,7 @@ static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 extern char modprobe_path[];
 #endif
 #ifdef CONFIG_CHR_DEV_SG
@@ -475,7 +475,7 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &ftrace_enable_sysctl,
 	},
 #endif
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
 	{
 		.ctl_name	= KERN_MODPROBE,
 		.procname	= "modprobe",
-- 
cgit v1.2.3


From 91cd4d6ef0abb1f65e81f8fe37e7d3c10344e38c Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 21 Jul 2008 14:21:35 -0700
Subject: cpusets: fix wrong domain attr updates

Fix wrong domain attr updates, or we will always update the first sched
domain attr.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>	[2.6.26.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..d2cc67dac8b1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -679,7 +679,9 @@ restart:
 				if (apn == b->pn) {
 					cpus_or(*dp, *dp, b->cpus_allowed);
 					b->pn = -1;
-					update_domain_attr(dattr, b);
+					if (dattr)
+						update_domain_attr(dattr
+								   + nslot, b);
 				}
 			}
 			nslot++;
-- 
cgit v1.2.3


From 5f17156fc55abac476d180e480bedb0f07f01b14 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Mon, 21 Jul 2008 14:21:37 -0700
Subject: Fix build on COMPAT platforms when CONFIG_EPOLL is disabled

Add missing cond_syscall() entry for compat_sys_epoll_pwait.

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys_ni.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5b9b467de070..0fea0ee12da9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -59,6 +59,7 @@ cond_syscall(sys_epoll_create);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);
 cond_syscall(sys_epoll_pwait);
+cond_syscall(compat_sys_epoll_pwait);
 cond_syscall(sys_semget);
 cond_syscall(sys_semop);
 cond_syscall(sys_semtimedop);
-- 
cgit v1.2.3


From 422037bafde8083acc3c539ceba3dfc60a04110c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 23 Jul 2008 11:16:38 +0200
Subject: sched: fix hrtick & generic-ipi dependency

Andrew Morton reported this s390 allmodconfig build failure:

 kernel/built-in.o: In function `hrtick_start_fair':
 sched.c:(.text+0x69c6): undefined reference to `__smp_call_function_single'

the reason is that s390 is not a generic-ipi SMP platform yet, while
the hrtick code relies on it. Fix the dependency.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.hz | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..382dd5a8b2d7 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS
+	def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS
-- 
cgit v1.2.3


From 2db873211ba47ef704c301f9ecf4a33413a0b649 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <Uwe.Kleine-Koenig@digi.com>
Date: Wed, 23 Jul 2008 14:42:25 +0200
Subject: set_irq_wake: fix return code and wake status tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 15a647eba94c3da27ccc666bea72e7cca06b2d19 set_irq_wake returned -ENXIO
if another device had it already enabled.  Zero is the right value to
return in this case.  Moreover the change to desc->status was not reverted
if desc->chip->set_wake returned an error.

Signed-off-by: Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
Acked-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/manage.c | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 77a51be36010..3cfc0fefb5ee 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -217,6 +217,17 @@ void enable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(enable_irq);
 
+int set_irq_wake_real(unsigned int irq, unsigned int on)
+{
+	struct irq_desc *desc = irq_desc + irq;
+	int ret = -ENXIO;
+
+	if (desc->chip->set_wake)
+		ret = desc->chip->set_wake(irq, on);
+
+	return ret;
+}
+
 /**
  *	set_irq_wake - control irq power management wakeup
  *	@irq:	interrupt to control
@@ -233,30 +244,34 @@ int set_irq_wake(unsigned int irq, unsigned int on)
 {
 	struct irq_desc *desc = irq_desc + irq;
 	unsigned long flags;
-	int ret = -ENXIO;
-	int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
+	int ret = 0;
 
 	/* wakeup-capable irqs can be shared between drivers that
 	 * don't need to have the same sleep mode behaviors.
 	 */
 	spin_lock_irqsave(&desc->lock, flags);
 	if (on) {
-		if (desc->wake_depth++ == 0)
-			desc->status |= IRQ_WAKEUP;
-		else
-			set_wake = NULL;
+		if (desc->wake_depth++ == 0) {
+			ret = set_irq_wake_real(irq, on);
+			if (ret)
+				desc->wake_depth = 0;
+			else
+				desc->status |= IRQ_WAKEUP;
+		}
 	} else {
 		if (desc->wake_depth == 0) {
 			printk(KERN_WARNING "Unbalanced IRQ %d "
 					"wake disable\n", irq);
 			WARN_ON(1);
-		} else if (--desc->wake_depth == 0)
-			desc->status &= ~IRQ_WAKEUP;
-		else
-			set_wake = NULL;
+		} else if (--desc->wake_depth == 0) {
+			ret = set_irq_wake_real(irq, on);
+			if (ret)
+				desc->wake_depth = 1;
+			else
+				desc->status &= ~IRQ_WAKEUP;
+		}
 	}
-	if (set_wake)
-		ret = desc->chip->set_wake(irq, on);
+
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return ret;
 }
-- 
cgit v1.2.3


From 54da1174922cddd4be83d5a364b2e0fdd693f513 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 23 Jul 2008 20:52:05 +0400
Subject: posix-timers: do_schedule_next_timer: fix the setting of ->si_overrun

do_schedule_next_timer() sets info->si_overrun = timr->it_overrun_last,
this discards the already accumulated overruns.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Mark McLoughlin <markmc@redhat.com>
Cc: Oliver Pinter <oliver.pntr@gmail.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: stable@kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/posix-timers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0b..814a2bb8d2ea 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -289,7 +289,7 @@ void do_schedule_next_timer(struct siginfo *info)
 		else
 			schedule_next_timer(timr);
 
-		info->si_overrun = timr->it_overrun_last;
+		info->si_overrun += timr->it_overrun_last;
 	}
 
 	if (timr)
-- 
cgit v1.2.3


From ba661292a2bc6ddd305a212b0526e5dc22195fe7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 23 Jul 2008 20:52:05 +0400
Subject: posix-timers: fix posix_timer_event() vs dequeue_signal() race

The bug was reported and analysed by Mark McLoughlin <markmc@redhat.com>,
the patch is based on his and Roland's suggestions.

posix_timer_event() always rewrites the pre-allocated siginfo before sending
the signal. Most of the written info is the same all the time, but memset(0)
is very wrong. If ->sigq is queued we can race with collect_signal() which
can fail to find this siginfo looking at .si_signo, or copy_siginfo() can
copy the wrong .si_code/si_tid/etc.

In short, sys_timer_settime() can in fact stop the active timer, or the user
can receive the siginfo with the wrong .si_xxx values.

Move "memset(->info, 0)" from posix_timer_event() to alloc_posix_timer(),
change send_sigqueue() to set .si_overrun = 0 when ->sigq is not queued.
It would be nice to move the whole sigq->info initialization from send to
create path, but this is not easy to do without uglifying timer_create()
further.

As Roland rightly pointed out, we need more cleanups/fixes here, see the
"FIXME" comment in the patch. Hopefully this patch makes sense anyway, and
it can mask the most bad implications.

Reported-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Mark McLoughlin <markmc@redhat.com>
Cc: Oliver Pinter <oliver.pntr@gmail.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: stable@kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

 kernel/posix-timers.c |   17 +++++++++++++----
 kernel/signal.c       |    1 +
 2 files changed, 14 insertions(+), 4 deletions(-)
---
 kernel/posix-timers.c | 17 +++++++++++++----
 kernel/signal.c       |  1 +
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 814a2bb8d2ea..0ffaeb075e35 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -296,14 +296,22 @@ void do_schedule_next_timer(struct siginfo *info)
 		unlock_timer(timr, flags);
 }
 
-int posix_timer_event(struct k_itimer *timr,int si_private)
+int posix_timer_event(struct k_itimer *timr, int si_private)
 {
-	memset(&timr->sigq->info, 0, sizeof(siginfo_t));
+	/*
+	 * FIXME: if ->sigq is queued we can race with
+	 * dequeue_signal()->do_schedule_next_timer().
+	 *
+	 * If dequeue_signal() sees the "right" value of
+	 * si_sys_private it calls do_schedule_next_timer().
+	 * We re-queue ->sigq and drop ->it_lock().
+	 * do_schedule_next_timer() locks the timer
+	 * and re-schedules it while ->sigq is pending.
+	 * Not really bad, but not that we want.
+	 */
 	timr->sigq->info.si_sys_private = si_private;
-	/* Send signal to the process that owns this timer.*/
 
 	timr->sigq->info.si_signo = timr->it_sigev_signo;
-	timr->sigq->info.si_errno = 0;
 	timr->sigq->info.si_code = SI_TIMER;
 	timr->sigq->info.si_tid = timr->it_id;
 	timr->sigq->info.si_value = timr->it_sigev_value;
@@ -435,6 +443,7 @@ static struct k_itimer * alloc_posix_timer(void)
 		kmem_cache_free(posix_timers_cache, tmr);
 		tmr = NULL;
 	}
+	memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
 	return tmr;
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 72bb4f51f963..13fab9838354 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1280,6 +1280,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
 		q->info.si_overrun++;
 		goto out;
 	}
+	q->info.si_overrun = 0;
 
 	signalfd_notify(t, sig);
 	pending = group ? &t->signal->shared_pending : &t->pending;
-- 
cgit v1.2.3


From 86a1c34a929f30fde8ad01ea8245df61ddcf58b7 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Mon, 23 Jun 2008 15:37:04 -0700
Subject: x86_64 syscall audit fast-path

This adds a fast path for 64-bit syscall entry and exit when
TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing.
This path does not need to save and restore all registers as
the general case of tracing does.  Avoiding the iret return path
when syscall audit is enabled helps performance a lot.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/auditsc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c10e7aae04d7..4699950e65bd 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1476,7 +1476,8 @@ void audit_syscall_entry(int arch, int major,
 	struct audit_context *context = tsk->audit_context;
 	enum audit_state     state;
 
-	BUG_ON(!context);
+	if (unlikely(!context))
+		return;
 
 	/*
 	 * This happens only on certain architectures that make system
-- 
cgit v1.2.3


From 1986b0cb1671ea39178b4e2b00461109728fc935 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 24 Jul 2008 08:10:02 +0200
Subject: ftrace: remove latency-tracer leftover

remove the :vim=ft=help tag from trace files.

I used them years ago to syntax-highlight traces and forgot about this hack.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 868e121c8e38..fc20e09a6cb1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1203,9 +1203,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 
 	iter->pos = *pos;
 
-	if (last_ent && !ent)
-		seq_puts(m, "\n\nvim:ft=help\n");
-
 	return ent;
 }
 
-- 
cgit v1.2.3


From 58838cf3ca3337d76141c33d6c68376490263468 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 24 Jul 2008 12:43:13 +0200
Subject: sched: clean up compiler warning

Reported-by: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 147004c651c0..93ac8ee08271 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -253,7 +253,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
 
 		diff = iter->rt_runtime - iter->rt_time;
 		if (diff > 0) {
-			do_div(diff, weight);
+			diff = div_u64((u64)diff, weight);
 			if (rt_rq->rt_runtime + diff > rt_period)
 				diff = rt_period - rt_rq->rt_runtime;
 			iter->rt_runtime -= diff;
-- 
cgit v1.2.3


From c748e1340e0de3fa7fed86f8bdf499be9242afff Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Wed, 23 Jul 2008 21:27:03 -0700
Subject: mm/vmstat.c: proper externs

This patch adds proper extern declarations for five variables in
include/linux/vmstat.h

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2a7b9d88706b..1f7b3b76a166 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -43,6 +43,7 @@
 #include <linux/limits.h>
 #include <linux/dcache.h>
 #include <linux/syscalls.h>
+#include <linux/vmstat.h>
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
@@ -80,7 +81,6 @@ extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
-extern int sysctl_stat_interval;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifdef CONFIG_RCU_TORTURE_TEST
-- 
cgit v1.2.3


From a1e78772d72b2616ed20e54896e68e0e7044854e Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Wed, 23 Jul 2008 21:27:23 -0700
Subject: hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs
 mappings until fork()

This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in
a similar manner to the reservations taken for MAP_SHARED mappings.  The
reserve count is accounted both globally and on a per-VMA basis for
private mappings.  This guarantees that a process that successfully calls
mmap() will successfully fault all pages in the future unless fork() is
called.

The characteristics of private mappings of hugetlbfs files behaviour after
this patch are;

1. The process calling mmap() is guaranteed to succeed all future faults until
   it forks().
2. On fork(), the parent may die due to SIGKILL on writes to the private
   mapping if enough pages are not available for the COW. For reasonably
   reliable behaviour in the face of a small huge page pool, children of
   hugepage-aware processes should not reference the mappings; such as
   might occur when fork()ing to exec().
3. On fork(), the child VMAs inherit no reserves. Reads on pages already
   faulted by the parent will succeed. Successful writes will depend on enough
   huge pages being free in the pool.
4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper
   and at fault time otherwise.

Before this patch, all reads or writes in the child potentially needs page
allocations that can later lead to the death of the parent.  This applies
to reads and writes of uninstantiated pages as well as COW.  After the
patch it is only a write to an instantiated page that causes problems.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index adefc1131f27..552c8d8e77ad 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
 #include <linux/security.h>
+#include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -306,6 +307,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			spin_unlock(&file->f_mapping->i_mmap_lock);
 		}
 
+		/*
+		 * Clear hugetlb-related page reserves for children. This only
+		 * affects MAP_PRIVATE mappings. Faults generated by the child
+		 * are not guaranteed to succeed, even if read-only
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			reset_vma_resv_huge_pages(tmp);
+
 		/*
 		 * Link in the new vma and copy the page table entries.
 		 */
-- 
cgit v1.2.3


From e5ff215941d59f8ae6bf58f6428dc5c26745a612 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Wed, 23 Jul 2008 21:27:42 -0700
Subject: hugetlb: multiple hstates for multiple page sizes

Add basic support for more than one hstate in hugetlbfs.  This is the key
to supporting multiple hugetlbfs page sizes at once.

- Rather than a single hstate, we now have an array, with an iterator
- default_hstate continues to be the struct hstate which we use by default
- Add functions for architectures to register new hstates

[akpm@linux-foundation.org: coding-style fixes]
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1f7b3b76a166..1a8299d1fe59 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -959,7 +959,7 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_HUGETLB_PAGE
 	 {
 		.procname	= "nr_hugepages",
-		.data		= &max_huge_pages,
+		.data		= NULL,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &hugetlb_sysctl_handler,
@@ -985,10 +985,12 @@ static struct ctl_table vm_table[] = {
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "nr_overcommit_hugepages",
-		.data		= &sysctl_overcommit_huge_pages,
-		.maxlen		= sizeof(sysctl_overcommit_huge_pages),
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &hugetlb_overcommit_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
 	},
 #endif
 	{
-- 
cgit v1.2.3


From ab763c7112ce0e2559c73f921617c81dc7287ca6 Mon Sep 17 00:00:00 2001
From: "Andrew G. Morgan" <morgan@kernel.org>
Date: Wed, 23 Jul 2008 21:28:25 -0700
Subject: security: filesystem capabilities refactor kernel code

To date, we've tried hard to confine filesystem support for capabilities
to the security modules.  This has left a lot of the code in
kernel/capability.c in a state where it looks like it supports something
that filesystem support for capabilities actually suppresses when the LSM
security/commmoncap.c code runs.  What is left is a lot of code that uses
sub-optimal locking in the main kernel

With this change we refactor the main kernel code and make it explicit
which locks are needed and that the only remaining kernel races in this
area are associated with non-filesystem capability code.

Signed-off-by: Andrew G. Morgan <morgan@kernel.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/capability.c | 338 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 221 insertions(+), 117 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 901e0fdc3fff..0101e847603e 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -115,11 +115,208 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
 	return 0;
 }
 
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+
+/*
+ * Without filesystem capability support, we nominally support one process
+ * setting the capabilities of another
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+				     kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+	struct task_struct *target;
+	int ret;
+
+	spin_lock(&task_capability_lock);
+	read_lock(&tasklist_lock);
+
+	if (pid && pid != task_pid_vnr(current)) {
+		target = find_task_by_vpid(pid);
+		if (!target) {
+			ret = -ESRCH;
+			goto out;
+		}
+	} else
+		target = current;
+
+	ret = security_capget(target, pEp, pIp, pPp);
+
+out:
+	read_unlock(&tasklist_lock);
+	spin_unlock(&task_capability_lock);
+
+	return ret;
+}
+
+/*
+ * cap_set_pg - set capabilities for all processes in a given process
+ * group.  We call this holding task_capability_lock and tasklist_lock.
+ */
+static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
+			     kernel_cap_t *inheritable,
+			     kernel_cap_t *permitted)
+{
+	struct task_struct *g, *target;
+	int ret = -EPERM;
+	int found = 0;
+	struct pid *pgrp;
+
+	spin_lock(&task_capability_lock);
+	read_lock(&tasklist_lock);
+
+	pgrp = find_vpid(pgrp_nr);
+	do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
+		target = g;
+		while_each_thread(g, target) {
+			if (!security_capset_check(target, effective,
+						   inheritable, permitted)) {
+				security_capset_set(target, effective,
+						    inheritable, permitted);
+				ret = 0;
+			}
+			found = 1;
+		}
+	} while_each_pid_task(pgrp, PIDTYPE_PGID, g);
+
+	read_unlock(&tasklist_lock);
+	spin_unlock(&task_capability_lock);
+
+	if (!found)
+		ret = 0;
+	return ret;
+}
+
 /*
- * For sys_getproccap() and sys_setproccap(), any of the three
- * capability set pointers may be NULL -- indicating that that set is
- * uninteresting and/or not to be changed.
+ * cap_set_all - set capabilities for all processes other than init
+ * and self.  We call this holding task_capability_lock and tasklist_lock.
  */
+static inline int cap_set_all(kernel_cap_t *effective,
+			      kernel_cap_t *inheritable,
+			      kernel_cap_t *permitted)
+{
+	struct task_struct *g, *target;
+	int ret = -EPERM;
+	int found = 0;
+
+	spin_lock(&task_capability_lock);
+	read_lock(&tasklist_lock);
+
+	do_each_thread(g, target) {
+		if (target == current
+		    || is_container_init(target->group_leader))
+			continue;
+		found = 1;
+		if (security_capset_check(target, effective, inheritable,
+					  permitted))
+			continue;
+		ret = 0;
+		security_capset_set(target, effective, inheritable, permitted);
+	} while_each_thread(g, target);
+
+	read_unlock(&tasklist_lock);
+	spin_unlock(&task_capability_lock);
+
+	if (!found)
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * Given the target pid does not refer to the current process we
+ * need more elaborate support... (This support is not present when
+ * filesystem capabilities are configured.)
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
+					    kernel_cap_t *inheritable,
+					    kernel_cap_t *permitted)
+{
+	struct task_struct *target;
+	int ret;
+
+	if (!capable(CAP_SETPCAP))
+		return -EPERM;
+
+	if (pid == -1)	          /* all procs other than current and init */
+		return cap_set_all(effective, inheritable, permitted);
+
+	else if (pid < 0)                    /* all procs in process group */
+		return cap_set_pg(-pid, effective, inheritable, permitted);
+
+	/* target != current */
+	spin_lock(&task_capability_lock);
+	read_lock(&tasklist_lock);
+
+	target = find_task_by_vpid(pid);
+	if (!target)
+		ret = -ESRCH;
+	else {
+		ret = security_capset_check(target, effective, inheritable,
+					    permitted);
+
+		/* having verified that the proposed changes are legal,
+		   we now put them into effect. */
+		if (!ret)
+			security_capset_set(target, effective, inheritable,
+					    permitted);
+	}
+
+	read_unlock(&tasklist_lock);
+	spin_unlock(&task_capability_lock);
+
+	return ret;
+}
+
+#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
+
+/*
+ * If we have configured with filesystem capability support, then the
+ * only thing that can change the capabilities of the current process
+ * is the current process. As such, we can't be in this code at the
+ * same time as we are in the process of setting capabilities in this
+ * process. The net result is that we can limit our use of locks to
+ * when we are reading the caps of another process.
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+				     kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+	int ret;
+
+	if (pid && (pid != task_pid_vnr(current))) {
+		struct task_struct *target;
+
+		spin_lock(&task_capability_lock);
+		read_lock(&tasklist_lock);
+
+		target = find_task_by_vpid(pid);
+		if (!target)
+			ret = -ESRCH;
+		else
+			ret = security_capget(target, pEp, pIp, pPp);
+
+		read_unlock(&tasklist_lock);
+		spin_unlock(&task_capability_lock);
+	} else
+		ret = security_capget(current, pEp, pIp, pPp);
+
+	return ret;
+}
+
+/*
+ * With filesystem capability support configured, the kernel does not
+ * permit the changing of capabilities in one process by another
+ * process. (CAP_SETPCAP has much less broad semantics when configured
+ * this way.)
+ */
+static inline int do_sys_capset_other_tasks(pid_t pid,
+					    kernel_cap_t *effective,
+					    kernel_cap_t *inheritable,
+					    kernel_cap_t *permitted)
+{
+	return -EPERM;
+}
+
+#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
 
 /*
  * Atomically modify the effective capabilities returning the original
@@ -155,7 +352,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 {
 	int ret = 0;
 	pid_t pid;
-	struct task_struct *target;
 	unsigned tocopy;
 	kernel_cap_t pE, pI, pP;
 
@@ -169,23 +365,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 	if (pid < 0)
 		return -EINVAL;
 
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	if (pid && pid != task_pid_vnr(current)) {
-		target = find_task_by_vpid(pid);
-		if (!target) {
-			ret = -ESRCH;
-			goto out;
-		}
-	} else
-		target = current;
-
-	ret = security_capget(target, &pE, &pI, &pP);
-
-out:
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
+	ret = cap_get_target_pid(pid, &pE, &pI, &pP);
 
 	if (!ret) {
 		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
@@ -216,7 +396,6 @@ out:
 		 * before modification is attempted and the application
 		 * fails.
 		 */
-
 		if (copy_to_user(dataptr, kdata, tocopy
 				 * sizeof(struct __user_cap_data_struct))) {
 			return -EFAULT;
@@ -226,70 +405,8 @@ out:
 	return ret;
 }
 
-/*
- * cap_set_pg - set capabilities for all processes in a given process
- * group.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
-			      kernel_cap_t *inheritable,
-			      kernel_cap_t *permitted)
-{
-	struct task_struct *g, *target;
-	int ret = -EPERM;
-	int found = 0;
-	struct pid *pgrp;
-
-	pgrp = find_vpid(pgrp_nr);
-	do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
-		target = g;
-		while_each_thread(g, target) {
-			if (!security_capset_check(target, effective,
-							inheritable,
-							permitted)) {
-				security_capset_set(target, effective,
-							inheritable,
-							permitted);
-				ret = 0;
-			}
-			found = 1;
-		}
-	} while_each_pid_task(pgrp, PIDTYPE_PGID, g);
-
-	if (!found)
-		ret = 0;
-	return ret;
-}
-
-/*
- * cap_set_all - set capabilities for all processes other than init
- * and self.  We call this holding task_capability_lock and tasklist_lock.
- */
-static inline int cap_set_all(kernel_cap_t *effective,
-			       kernel_cap_t *inheritable,
-			       kernel_cap_t *permitted)
-{
-     struct task_struct *g, *target;
-     int ret = -EPERM;
-     int found = 0;
-
-     do_each_thread(g, target) {
-             if (target == current || is_container_init(target->group_leader))
-                     continue;
-             found = 1;
-	     if (security_capset_check(target, effective, inheritable,
-						permitted))
-		     continue;
-	     ret = 0;
-	     security_capset_set(target, effective, inheritable, permitted);
-     } while_each_thread(g, target);
-
-     if (!found)
-	     ret = 0;
-     return ret;
-}
-
 /**
- * sys_capset - set capabilities for a process or a group of processes
+ * sys_capset - set capabilities for a process or (*) a group of processes
  * @header: pointer to struct that contains capability version and
  *	target pid data
  * @data: pointer to struct that contains the effective, permitted,
@@ -313,7 +430,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
-	struct task_struct *target;
 	int ret;
 	pid_t pid;
 
@@ -324,9 +440,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
 
-	if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
-		return -EPERM;
-
 	if (copy_from_user(&kdata, data, tocopy
 			   * sizeof(struct __user_cap_data_struct))) {
 		return -EFAULT;
@@ -344,40 +457,31 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		i++;
 	}
 
-	spin_lock(&task_capability_lock);
-	read_lock(&tasklist_lock);
-
-	if (pid > 0 && pid != task_pid_vnr(current)) {
-		target = find_task_by_vpid(pid);
-		if (!target) {
-			ret = -ESRCH;
-			goto out;
-		}
-	} else
-		target = current;
-
-	ret = 0;
-
-	/* having verified that the proposed changes are legal,
-	   we now put them into effect. */
-	if (pid < 0) {
-		if (pid == -1)	/* all procs other than current and init */
-			ret = cap_set_all(&effective, &inheritable, &permitted);
+	if (pid && (pid != task_pid_vnr(current)))
+		ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
+						&permitted);
+	else {
+		/*
+		 * This lock is required even when filesystem
+		 * capability support is configured - it protects the
+		 * sys_capget() call from returning incorrect data in
+		 * the case that the targeted process is not the
+		 * current one.
+		 */
+		spin_lock(&task_capability_lock);
 
-		else		/* all procs in process group */
-			ret = cap_set_pg(-pid, &effective, &inheritable,
-					 &permitted);
-	} else {
-		ret = security_capset_check(target, &effective, &inheritable,
+		ret = security_capset_check(current, &effective, &inheritable,
 					    &permitted);
+		/*
+		 * Having verified that the proposed changes are
+		 * legal, we now put them into effect.
+		 */
 		if (!ret)
-			security_capset_set(target, &effective, &inheritable,
+			security_capset_set(current, &effective, &inheritable,
 					    &permitted);
+		spin_unlock(&task_capability_lock);
 	}
 
-out:
-	read_unlock(&tasklist_lock);
-	spin_unlock(&task_capability_lock);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 0d63081d418c73cc187c893069e0f24c4c6eecd3 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.cz>
Date: Wed, 23 Jul 2008 21:28:32 -0700
Subject: swsusp: provide users with a hint about the no_console_suspend option

Tell the user about the no_console_suspend option, so that we don't have to
tell each bug reporter personally.

[akpm@linux-foundation.org: clarify the text a little]
Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 07ad9e7f7a66..3f7a2a94583b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -933,7 +933,7 @@ void suspend_console(void)
 {
 	if (!console_suspend_enabled)
 		return;
-	printk("Suspending console(s)\n");
+	printk("Suspending console(s) (use no_console_suspend to debug)\n");
 	acquire_console_sem();
 	console_suspended = 1;
 }
-- 
cgit v1.2.3


From 77437fd4e61f87cc94d9314baa5cbf50e3ccdf54 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Wed, 23 Jul 2008 21:28:33 -0700
Subject: pm: boot time suspend selftest

Boot-time test for system suspend states (STR or standby).  The generic
RTC framework triggers wakeup alarms, which are used to exit those states.

  - Measures some aspects of suspend time ... this uses "jiffies" until
    someone converts it to use a timebase that works properly even while
    timer IRQs are disabled.

  - Triggered by a command line parameter.  By default nothing even
    vaguely troublesome will happen, but "test_suspend=mem" will give
    you a brief STR test during system boot.  (Or you may need to use
    "test_suspend=standby" instead, if your hardware needs that.)

This isn't without problems.  It fires early enough during boot that for
example both PCMCIA and MMC stacks have misbehaved.  The workaround in
those cases was to boot without such media cards inserted.

[matthltc@us.ibm.com: fix compile failure in boot time suspend selftest]
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Pavel Machek <pavel@suse.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/Kconfig |  11 +++
 kernel/power/main.c  | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 204 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 59dfdf1e1d20..dcd165f92a88 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,17 @@ config SUSPEND
 	  powered and thus its contents are preserved, such as the
 	  suspend-to-RAM state (e.g. the ACPI S3 state).
 
+config PM_TEST_SUSPEND
+	bool "Test suspend/resume and wakealarm during bootup"
+	depends on SUSPEND && PM_DEBUG && RTC_LIB=y
+	---help---
+	This option will let you suspend your machine during bootup, and
+	make it wake up a few seconds later using an RTC wakeup alarm.
+	Enable this with a kernel parameter like "test_suspend=mem".
+
+	You probably want to have your system's RTC driver statically
+	linked, ensuring that it's available when this test runs.
+
 config SUSPEND_FREEZER
 	bool "Enable freezer for suspend to RAM/standby" \
 		if ARCH_WANTS_FREEZER_CONTROL || BROKEN
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3398f4651aa1..95bff23ecdaa 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -132,6 +132,61 @@ static inline int suspend_test(int level) { return 0; }
 
 #ifdef CONFIG_SUSPEND
 
+#ifdef CONFIG_PM_TEST_SUSPEND
+
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending.  Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS	5
+
+static unsigned long suspend_test_start_time;
+
+static void suspend_test_start(void)
+{
+	/* FIXME Use better timebase than "jiffies", ideally a clocksource.
+	 * What we want is a hardware counter that will work correctly even
+	 * during the irqs-are-off stages of the suspend/resume cycle...
+	 */
+	suspend_test_start_time = jiffies;
+}
+
+static void suspend_test_finish(const char *label)
+{
+	long nj = jiffies - suspend_test_start_time;
+	unsigned msec;
+
+	msec = jiffies_to_msecs(abs(nj));
+	pr_info("PM: %s took %d.%03d seconds\n", label,
+			msec / 1000, msec % 1000);
+
+	/* Warning on suspend means the RTC alarm period needs to be
+	 * larger -- the system was sooo slooowwww to suspend that the
+	 * alarm (should have) fired before the system went to sleep!
+	 *
+	 * Warning on either suspend or resume also means the system
+	 * has some performance issues.  The stack dump of a WARN_ON
+	 * is more likely to get the right attention than a printk...
+	 */
+	WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000));
+}
+
+#else
+
+static void suspend_test_start(void)
+{
+}
+
+static void suspend_test_finish(const char *label)
+{
+}
+
+#endif
+
 /* This is just an arbitrary number */
 #define FREE_PAGE_NUMBER (100)
 
@@ -266,12 +321,13 @@ int suspend_devices_and_enter(suspend_state_t state)
 			goto Close;
 	}
 	suspend_console();
+	suspend_test_start();
 	error = device_suspend(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to suspend\n");
 		goto Recover_platform;
 	}
-
+	suspend_test_finish("suspend devices");
 	if (suspend_test(TEST_DEVICES))
 		goto Recover_platform;
 
@@ -293,7 +349,9 @@ int suspend_devices_and_enter(suspend_state_t state)
 	if (suspend_ops->finish)
 		suspend_ops->finish();
  Resume_devices:
+	suspend_test_start();
 	device_resume(PMSG_RESUME);
+	suspend_test_finish("resume devices");
 	resume_console();
  Close:
 	if (suspend_ops->end)
@@ -521,3 +579,137 @@ static int __init pm_init(void)
 }
 
 core_initcall(pm_init);
+
+
+#ifdef CONFIG_PM_TEST_SUSPEND
+
+#include <linux/rtc.h>
+
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system.  RTCs wake alarms are a common self-contained mechanism.
+ */
+
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+	static char err_readtime[] __initdata =
+		KERN_ERR "PM: can't read %s time, err %d\n";
+	static char err_wakealarm [] __initdata =
+		KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+	static char err_suspend[] __initdata =
+		KERN_ERR "PM: suspend test failed, error %d\n";
+	static char info_test[] __initdata =
+		KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+
+	unsigned long		now;
+	struct rtc_wkalrm	alm;
+	int			status;
+
+	/* this may fail if the RTC hasn't been initialized */
+	status = rtc_read_time(rtc, &alm.time);
+	if (status < 0) {
+		printk(err_readtime, rtc->dev.bus_id, status);
+		return;
+	}
+	rtc_tm_to_time(&alm.time, &now);
+
+	memset(&alm, 0, sizeof alm);
+	rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+	alm.enabled = true;
+
+	status = rtc_set_alarm(rtc, &alm);
+	if (status < 0) {
+		printk(err_wakealarm, rtc->dev.bus_id, status);
+		return;
+	}
+
+	if (state == PM_SUSPEND_MEM) {
+		printk(info_test, pm_states[state]);
+		status = pm_suspend(state);
+		if (status == -ENODEV)
+			state = PM_SUSPEND_STANDBY;
+	}
+	if (state == PM_SUSPEND_STANDBY) {
+		printk(info_test, pm_states[state]);
+		status = pm_suspend(state);
+	}
+	if (status < 0)
+		printk(err_suspend, status);
+}
+
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+	struct rtc_device *candidate = to_rtc_device(dev);
+
+	if (!candidate->ops->set_alarm)
+		return 0;
+	if (!device_may_wakeup(candidate->dev.parent))
+		return 0;
+
+	*(char **)name_ptr = dev->bus_id;
+	return 1;
+}
+
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time.  They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+
+static char warn_bad_state[] __initdata =
+	KERN_WARNING "PM: can't test '%s' suspend state\n";
+
+static int __init setup_test_suspend(char *value)
+{
+	unsigned i;
+
+	/* "=mem" ==> "mem" */
+	value++;
+	for (i = 0; i < PM_SUSPEND_MAX; i++) {
+		if (!pm_states[i])
+			continue;
+		if (strcmp(pm_states[i], value) != 0)
+			continue;
+		test_state = (__force suspend_state_t) i;
+		return 0;
+	}
+	printk(warn_bad_state, value);
+	return 0;
+}
+__setup("test_suspend", setup_test_suspend);
+
+static int __init test_suspend(void)
+{
+	static char		warn_no_rtc[] __initdata =
+		KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+
+	char			*pony = NULL;
+	struct rtc_device	*rtc = NULL;
+
+	/* PM is initialized by now; is that state testable? */
+	if (test_state == PM_SUSPEND_ON)
+		goto done;
+	if (!valid_state(test_state)) {
+		printk(warn_bad_state, pm_states[test_state]);
+		goto done;
+	}
+
+	/* RTCs have initialized by now too ... can we use one? */
+	class_find_device(rtc_class, NULL, &pony, has_wakealarm);
+	if (pony)
+		rtc = rtc_class_open(pony);
+	if (!rtc) {
+		printk(warn_no_rtc);
+		goto done;
+	}
+
+	/* go for it */
+	test_wakealarm(rtc, test_state);
+	rtc_class_close(rtc);
+done:
+	return 0;
+}
+late_initcall(test_suspend);
+
+#endif /* CONFIG_PM_TEST_SUSPEND */
-- 
cgit v1.2.3


From 0d83304c7e7bd3b05be90281b3a47841bc8f057a Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Jul 2008 21:28:38 -0700
Subject: pm: hibernation: simplify memory bitmap

This patch simplifies the memory bitmap manipulations.

- remove the member size in struct bm_block

It is not necessary for struct bm_block to have the number of bit chunks that
can be calculated by using end_pfn and start_pfn.

- use find_next_bit() for memory_bm_next_pfn

No need to invent the bitmap library only for the memory bitmap.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/snapshot.c | 88 ++++++++++++-------------------------------------
 1 file changed, 21 insertions(+), 67 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5f91a07c4eac..5d2ab836e998 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
  *	objects.  The main list's elements are of type struct zone_bitmap
  *	and each of them corresonds to one zone.  For each zone bitmap
  *	object there is a list of objects of type struct bm_block that
- *	represent each blocks of bit chunks in which information is
- *	stored.
+ *	represent each blocks of bitmap in which information is stored.
  *
  *	struct memory_bitmap contains a pointer to the main list of zone
  *	bitmap objects, a struct bm_position used for browsing the bitmap,
@@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
  *	pfns that correspond to the start and end of the represented zone.
  *
  *	struct bm_block contains a pointer to the memory page in which
- *	information is stored (in the form of a block of bit chunks
- *	of type unsigned long each).  It also contains the pfns that
- *	correspond to the start and end of the represented memory area and
- *	the number of bit chunks in the block.
+ *	information is stored (in the form of a block of bitmap)
+ *	It also contains the pfns that correspond to the start and end of
+ *	the represented memory area.
  */
 
 #define BM_END_OF_MAP	(~0UL)
 
-#define BM_CHUNKS_PER_BLOCK	(PAGE_SIZE / sizeof(long))
-#define BM_BITS_PER_CHUNK	(sizeof(long) << 3)
 #define BM_BITS_PER_BLOCK	(PAGE_SIZE << 3)
 
 struct bm_block {
 	struct bm_block *next;		/* next element of the list */
 	unsigned long start_pfn;	/* pfn represented by the first bit */
 	unsigned long end_pfn;	/* pfn represented by the last bit plus 1 */
-	unsigned int size;	/* number of bit chunks */
-	unsigned long *data;	/* chunks of bits representing pages */
+	unsigned long *data;	/* bitmap representing pages */
 };
 
+static inline unsigned long bm_block_bits(struct bm_block *bb)
+{
+	return bb->end_pfn - bb->start_pfn;
+}
+
 struct zone_bitmap {
 	struct zone_bitmap *next;	/* next element of the list */
 	unsigned long start_pfn;	/* minimal pfn in this zone */
@@ -257,7 +257,6 @@ struct zone_bitmap {
 struct bm_position {
 	struct zone_bitmap *zone_bm;
 	struct bm_block *block;
-	int chunk;
 	int bit;
 };
 
@@ -272,12 +271,6 @@ struct memory_bitmap {
 
 /* Functions that operate on memory bitmaps */
 
-static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
-{
-	bm->cur.chunk = 0;
-	bm->cur.bit = -1;
-}
-
 static void memory_bm_position_reset(struct memory_bitmap *bm)
 {
 	struct zone_bitmap *zone_bm;
@@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm)
 	zone_bm = bm->zone_bm_list;
 	bm->cur.zone_bm = zone_bm;
 	bm->cur.block = zone_bm->bm_blocks;
-	memory_bm_reset_chunk(bm);
+	bm->cur.bit = 0;
 }
 
 static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
@@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
 			bb->start_pfn = pfn;
 			if (nr >= BM_BITS_PER_BLOCK) {
 				pfn += BM_BITS_PER_BLOCK;
-				bb->size = BM_CHUNKS_PER_BLOCK;
 				nr -= BM_BITS_PER_BLOCK;
 			} else {
 				/* This is executed only once in the loop */
 				pfn += nr;
-				bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
 			}
 			bb->end_pfn = pfn;
 			bb = bb->next;
@@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 	}
 	zone_bm->cur_block = bb;
 	pfn -= bb->start_pfn;
-	*bit_nr = pfn % BM_BITS_PER_CHUNK;
-	*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+	*bit_nr = pfn;
+	*addr = bb->data;
 	return 0;
 }
 
@@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
 	return test_bit(bit, addr);
 }
 
-/* Two auxiliary functions for memory_bm_next_pfn */
-
-/* Find the first set bit in the given chunk, if there is one */
-
-static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
-{
-	bit++;
-	while (bit < BM_BITS_PER_CHUNK) {
-		if (test_bit(bit, chunk_p))
-			return bit;
-
-		bit++;
-	}
-	return -1;
-}
-
-/* Find a chunk containing some bits set in given block of bits */
-
-static inline int next_chunk_in_block(int n, struct bm_block *bb)
-{
-	n++;
-	while (n < bb->size) {
-		if (bb->data[n])
-			return n;
-
-		n++;
-	}
-	return -1;
-}
-
 /**
  *	memory_bm_next_pfn - find the pfn that corresponds to the next set bit
  *	in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is
@@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
 {
 	struct zone_bitmap *zone_bm;
 	struct bm_block *bb;
-	int chunk;
 	int bit;
 
 	do {
 		bb = bm->cur.block;
 		do {
-			chunk = bm->cur.chunk;
 			bit = bm->cur.bit;
-			do {
-				bit = next_bit_in_chunk(bit, bb->data + chunk);
-				if (bit >= 0)
-					goto Return_pfn;
-
-				chunk = next_chunk_in_block(chunk, bb);
-				bit = -1;
-			} while (chunk >= 0);
+			bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
+			if (bit < bm_block_bits(bb))
+				goto Return_pfn;
+
 			bb = bb->next;
 			bm->cur.block = bb;
-			memory_bm_reset_chunk(bm);
+			bm->cur.bit = 0;
 		} while (bb);
 		zone_bm = bm->cur.zone_bm->next;
 		if (zone_bm) {
 			bm->cur.zone_bm = zone_bm;
 			bm->cur.block = zone_bm->bm_blocks;
-			memory_bm_reset_chunk(bm);
+			bm->cur.bit = 0;
 		}
 	} while (zone_bm);
 	memory_bm_position_reset(bm);
 	return BM_END_OF_MAP;
 
  Return_pfn:
-	bm->cur.chunk = chunk;
-	bm->cur.bit = bit;
-	return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
+	bm->cur.bit = bit + 1;
+	return bb->start_pfn + bit;
 }
 
 /**
-- 
cgit v1.2.3


From c1a220e7acf8ad2c03504891f4a70cd9c32c904b Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 23 Jul 2008 21:28:39 -0700
Subject: pm: introduce new interfaces schedule_work_on() and queue_work_on()

This interface allows adding a job on a specific cpu.

Although a work struct on a cpu will be scheduled to other cpu if the cpu
dies, there is a recursion if a work task tries to offline the cpu it's
running on.  we need to schedule the task to a specific cpu in this case.
http://bugzilla.kernel.org/show_bug.cgi?id=10897

[oleg@tv-sign.ru: cleanups]
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Rus <harbour@sfinx.od.ua>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a6d36346d10a..6fd158b21026 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -140,7 +140,6 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 	wake_up(&cwq->more_work);
 }
 
-/* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
 			 struct work_struct *work)
 {
@@ -175,6 +174,31 @@ int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 }
 EXPORT_SYMBOL_GPL(queue_work);
 
+/**
+ * queue_work_on - queue work on specific cpu
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
+ *
+ * We queue the work to a specific CPU, the caller must ensure it
+ * can't go away.
+ */
+int
+queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+{
+	int ret = 0;
+
+	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
+		BUG_ON(!list_empty(&work->entry));
+		__queue_work(wq_per_cpu(wq, cpu), work);
+		ret = 1;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work_on);
+
 static void delayed_work_timer_fn(unsigned long __data)
 {
 	struct delayed_work *dwork = (struct delayed_work *)__data;
@@ -553,6 +577,19 @@ int schedule_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(schedule_work);
 
+/*
+ * schedule_work_on - put work task on a specific cpu
+ * @cpu: cpu to put the work task on
+ * @work: job to be done
+ *
+ * This puts a job on a specific cpu
+ */
+int schedule_work_on(int cpu, struct work_struct *work)
+{
+	return queue_work_on(cpu, keventd_wq, work);
+}
+EXPORT_SYMBOL(schedule_work_on);
+
 /**
  * schedule_delayed_work - put work task in global workqueue after delay
  * @dwork: job to be done
-- 
cgit v1.2.3


From 2f15fc4bdf91eb399da3f47a09c55831d9f22826 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Wed, 23 Jul 2008 21:28:40 -0700
Subject: pm: schedule sysrq poweroff on boot cpu

schedule sysrq poweroff on boot cpu.

sysrq poweroff needs to disable nonboot cpus, and we need to run this on boot
cpu to avoid any recursion.  http://bugzilla.kernel.org/show_bug.cgi?id=10897

[kosaki.motohiro@jp.fujitsu.com: build fix]
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Rus <harbour@sfinx.od.ua>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/poweroff.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 678ec736076b..72016f051477 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -10,6 +10,7 @@
 #include <linux/pm.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>
+#include <linux/cpumask.h>
 
 /*
  * When the user hits Sys-Rq o to power down the machine this is the
@@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
 
 static void handle_poweroff(int key, struct tty_struct *tty)
 {
-	schedule_work(&poweroff_work);
+	/* run sysrq poweroff on boot cpu */
+	schedule_work_on(first_cpu(cpu_online_map), &poweroff_work);
 }
 
 static struct sysrq_key_op	sysrq_poweroff_op = {
-- 
cgit v1.2.3


From f0af566da6e9a4a2f5a83c5a70f3d0a772050e21 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 23 Jul 2008 21:28:44 -0700
Subject: pm: fix try_to_freeze_tasks()'s use of do_div()

Fix try_to_freeze_tasks()'s use of do_div() on an s64 by making
elapsed_csecs64 a u64 instead and dividing that.

Possibly this should be guarded lest the interval calculation turn up
negative, but the possible negativity of the result of the division is
cast away anyway.

This was introduced by patch 438e2ce68dfd4af4cfcec2f873564fb921db4bb5.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5fb87652f214..278946aecaf0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -149,7 +149,7 @@ static int try_to_freeze_tasks(bool sig_only)
 	unsigned long end_time;
 	unsigned int todo;
 	struct timeval start, end;
-	s64 elapsed_csecs64;
+	u64 elapsed_csecs64;
 	unsigned int elapsed_csecs;
 
 	do_gettimeofday(&start);
-- 
cgit v1.2.3


From 82736f4d1d2b7063b829cc93171a6e5aea8a9c49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <Uwe.Kleine-Koenig@digi.com>
Date: Wed, 23 Jul 2008 21:28:54 -0700
Subject: generic irqs: handle failure of irqchip->set_type in setup_irq
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

set_type returns an int indicating success or failure, but up to now
setup_irq ignores that.

In my case this resulted in a machine hang:

gpio-keys requested IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, but
arm/ns9xxx can only trigger on one direction so set_type didn't touch
the configuration which happens do default to a level sensitiveness and
returned -EINVAL.  setup_irq ignored that and unmasked the irq.  This
resulted in an endless triggering of the gpio-key interrupt service
routine which effectively killed the machine.

With this patch applied setup_irq propagates the error to the caller.

Note that before in the case

	chip && !chip->set_type && !chip->name

a NULL pointer was feed to printk.  This is fixed, too.

Signed-off-by: Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/manage.c | 64 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3cfc0fefb5ee..5bc6e5ecc493 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -308,6 +308,30 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
 		desc->handle_irq = NULL;
 }
 
+static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
+		unsigned long flags)
+{
+	int ret;
+
+	if (!chip || !chip->set_type) {
+		/*
+		 * IRQF_TRIGGER_* but the PIC does not support multiple
+		 * flow-types?
+		 */
+		pr_warning("No set_type function for IRQ %d (%s)\n", irq,
+				chip ? (chip->name ? : "unknown") : "unknown");
+		return 0;
+	}
+
+	ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
+
+	if (ret)
+		pr_err("setting flow type for irq %u failed (%pF)\n",
+				irq, chip->set_type);
+
+	return ret;
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -319,6 +343,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 	const char *old_name = NULL;
 	unsigned long flags;
 	int shared = 0;
+	int ret;
 
 	if (irq >= NR_IRQS)
 		return -EINVAL;
@@ -376,35 +401,23 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		shared = 1;
 	}
 
-	*p = new;
-
-	/* Exclude IRQ from balancing */
-	if (new->flags & IRQF_NOBALANCING)
-		desc->status |= IRQ_NO_BALANCING;
-
 	if (!shared) {
 		irq_chip_set_defaults(desc->chip);
 
-#if defined(CONFIG_IRQ_PER_CPU)
-		if (new->flags & IRQF_PERCPU)
-			desc->status |= IRQ_PER_CPU;
-#endif
-
 		/* Setup the type (level, edge polarity) if configured: */
 		if (new->flags & IRQF_TRIGGER_MASK) {
-			if (desc->chip->set_type)
-				desc->chip->set_type(irq,
-						new->flags & IRQF_TRIGGER_MASK);
-			else
-				/*
-				 * IRQF_TRIGGER_* but the PIC does not support
-				 * multiple flow-types?
-				 */
-				printk(KERN_WARNING "No IRQF_TRIGGER set_type "
-				       "function for IRQ %d (%s)\n", irq,
-				       desc->chip->name);
+			ret = __irq_set_trigger(desc->chip, irq, new->flags);
+
+			if (ret) {
+				spin_unlock_irqrestore(&desc->lock, flags);
+				return ret;
+			}
 		} else
 			compat_irq_chip_set_default_handler(desc);
+#if defined(CONFIG_IRQ_PER_CPU)
+		if (new->flags & IRQF_PERCPU)
+			desc->status |= IRQ_PER_CPU;
+#endif
 
 		desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
 				  IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
@@ -423,6 +436,13 @@ int setup_irq(unsigned int irq, struct irqaction *new)
 		/* Set default affinity mask once everything is setup */
 		irq_select_affinity(irq);
 	}
+
+	*p = new;
+
+	/* Exclude IRQ from balancing */
+	if (new->flags & IRQF_NOBALANCING)
+		desc->status |= IRQ_NO_BALANCING;
+
 	/* Reset broken irq detection when installing new handler */
 	desc->irq_count = 0;
 	desc->irqs_unhandled = 0;
-- 
cgit v1.2.3


From aaca0bdca573f3f51ea03139f9c7289541e7bca3 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 23 Jul 2008 21:29:20 -0700
Subject: flag parameters: paccept

This patch is by far the most complex in the series.  It adds a new syscall
paccept.  This syscall differs from accept in that it adds (at the userlevel)
two additional parameters:

- a signal mask
- a flags value

The flags parameter can be used to set flag like SOCK_CLOEXEC.  This is
imlpemented here as well.  Some people argued that this is a property which
should be inherited from the file desriptor for the server but this is against
POSIX.  Additionally, we really want the signal mask parameter as well
(similar to pselect, ppoll, etc).  So an interface change in inevitable.

The flag value is the same as for socket and socketpair.  I think diverging
here will only create confusion.  Similar to the filesystem interfaces where
the use of the O_* constants differs, it is acceptable here.

The signal mask is handled as for pselect etc.  The mask is temporarily
installed for the thread and removed before the call returns.  I modeled the
code after pselect.  If there is a problem it's likely also in pselect.

For architectures which use socketcall I maintained this interface instead of
adding a system call.  The symmetry shouldn't be broken.

The following test must be adjusted for architectures other than x86 and
x86-64 and in case the syscall numbers changed.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/syscall.h>

#ifndef __NR_paccept
# ifdef __x86_64__
#  define __NR_paccept 288
# elif defined __i386__
#  define SYS_PACCEPT 18
#  define USE_SOCKETCALL 1
# else
#  error "need __NR_paccept"
# endif
#endif

#ifdef USE_SOCKETCALL
# define paccept(fd, addr, addrlen, mask, flags) \
  ({ long args[6] = { \
       (long) fd, (long) addr, (long) addrlen, (long) mask, 8, (long) flags }; \
     syscall (__NR_socketcall, SYS_PACCEPT, args); })
#else
# define paccept(fd, addr, addrlen, mask, flags) \
  syscall (__NR_paccept, fd, addr, addrlen, mask, 8, flags)
#endif

#define PORT 57392

#define SOCK_CLOEXEC O_CLOEXEC

static pthread_barrier_t b;

static void *
tf (void *arg)
{
  pthread_barrier_wait (&b);
  int s = socket (AF_INET, SOCK_STREAM, 0);
  struct sockaddr_in sin;
  sin.sin_family = AF_INET;
  sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK);
  sin.sin_port = htons (PORT);
  connect (s, (const struct sockaddr *) &sin, sizeof (sin));
  close (s);

  pthread_barrier_wait (&b);
  s = socket (AF_INET, SOCK_STREAM, 0);
  sin.sin_port = htons (PORT);
  connect (s, (const struct sockaddr *) &sin, sizeof (sin));
  close (s);
  pthread_barrier_wait (&b);

  pthread_barrier_wait (&b);
  sleep (2);
  pthread_kill ((pthread_t) arg, SIGUSR1);

  return NULL;
}

static void
handler (int s)
{
}

int
main (void)
{
  pthread_barrier_init (&b, NULL, 2);

  struct sockaddr_in sin;
  pthread_t th;
  if (pthread_create (&th, NULL, tf, (void *) pthread_self ()) != 0)
    {
      puts ("pthread_create failed");
      return 1;
    }

  int s = socket (AF_INET, SOCK_STREAM, 0);
  int reuse = 1;
  setsockopt (s, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof (reuse));
  sin.sin_family = AF_INET;
  sin.sin_addr.s_addr = htonl (INADDR_LOOPBACK);
  sin.sin_port = htons (PORT);
  bind (s, (struct sockaddr *) &sin, sizeof (sin));
  listen (s, SOMAXCONN);

  pthread_barrier_wait (&b);

  int s2 = paccept (s, NULL, 0, NULL, 0);
  if (s2 < 0)
    {
      puts ("paccept(0) failed");
      return 1;
    }

  int coe = fcntl (s2, F_GETFD);
  if (coe & FD_CLOEXEC)
    {
      puts ("paccept(0) set close-on-exec-flag");
      return 1;
    }
  close (s2);

  pthread_barrier_wait (&b);

  s2 = paccept (s, NULL, 0, NULL, SOCK_CLOEXEC);
  if (s2 < 0)
    {
      puts ("paccept(SOCK_CLOEXEC) failed");
      return 1;
    }

  coe = fcntl (s2, F_GETFD);
  if ((coe & FD_CLOEXEC) == 0)
    {
      puts ("paccept(SOCK_CLOEXEC) does not set close-on-exec flag");
      return 1;
    }
  close (s2);

  pthread_barrier_wait (&b);

  struct sigaction sa;
  sa.sa_handler = handler;
  sa.sa_flags = 0;
  sigemptyset (&sa.sa_mask);
  sigaction (SIGUSR1, &sa, NULL);

  sigset_t ss;
  pthread_sigmask (SIG_SETMASK, NULL, &ss);
  sigaddset (&ss, SIGUSR1);
  pthread_sigmask (SIG_SETMASK, &ss, NULL);

  sigdelset (&ss, SIGUSR1);
  alarm (4);
  pthread_barrier_wait (&b);

  errno = 0 ;
  s2 = paccept (s, NULL, 0, &ss, 0);
  if (s2 != -1 || errno != EINTR)
    {
      puts ("paccept did not fail with EINTR");
      return 1;
    }

  close (s);

  puts ("OK");

  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[akpm@linux-foundation.org: make it compile]
[akpm@linux-foundation.org: add sys_ni stub]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: <linux-arch@vger.kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Roland McGrath <roland@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys_ni.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0fea0ee12da9..2f0b8a2e600f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,6 +31,7 @@ cond_syscall(sys_socketpair);
 cond_syscall(sys_bind);
 cond_syscall(sys_listen);
 cond_syscall(sys_accept);
+cond_syscall(sys_paccept);
 cond_syscall(sys_connect);
 cond_syscall(sys_getsockname);
 cond_syscall(sys_getpeername);
-- 
cgit v1.2.3


From 9deb27baedb79759c3ab9435a7d8b841842d56e9 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 23 Jul 2008 21:29:24 -0700
Subject: flag parameters: signalfd

This patch adds the new signalfd4 syscall.  It extends the old signalfd
syscall by one parameter which is meant to hold a flag value.  In this
patch the only flag support is SFD_CLOEXEC which causes the close-on-exec
flag for the returned file descriptor to be set.

A new name SFD_CLOEXEC is introduced which in this implementation must
have the same value as O_CLOEXEC.

The following test must be adjusted for architectures other than x86 and
x86-64 and in case the syscall numbers changed.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_signalfd4
# ifdef __x86_64__
#  define __NR_signalfd4 289
# elif defined __i386__
#  define __NR_signalfd4 327
# else
#  error "need __NR_signalfd4"
# endif
#endif

#define SFD_CLOEXEC O_CLOEXEC

int
main (void)
{
  sigset_t ss;
  sigemptyset (&ss);
  sigaddset (&ss, SIGUSR1);
  int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0);
  if (fd == -1)
    {
      puts ("signalfd4(0) failed");
      return 1;
    }
  int coe = fcntl (fd, F_GETFD);
  if (coe == -1)
    {
      puts ("fcntl failed");
      return 1;
    }
  if (coe & FD_CLOEXEC)
    {
      puts ("signalfd4(0) set close-on-exec flag");
      return 1;
    }
  close (fd);

  fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC);
  if (fd == -1)
    {
      puts ("signalfd4(SFD_CLOEXEC) failed");
      return 1;
    }
  coe = fcntl (fd, F_GETFD);
  if (coe == -1)
    {
      puts ("fcntl failed");
      return 1;
    }
  if ((coe & FD_CLOEXEC) == 0)
    {
      puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag");
      return 1;
    }
  close (fd);

  puts ("OK");

  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[akpm@linux-foundation.org: add sys_ni stub]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys_ni.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2f0b8a2e600f..8627c89ae9e8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -156,6 +156,7 @@ cond_syscall(sys_ioprio_get);
 
 /* New file descriptors */
 cond_syscall(sys_signalfd);
+cond_syscall(sys_signalfd4);
 cond_syscall(compat_sys_signalfd);
 cond_syscall(sys_timerfd_create);
 cond_syscall(sys_timerfd_settime);
-- 
cgit v1.2.3


From b087498eb5605673b0f260a7620d91818cd72304 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 23 Jul 2008 21:29:25 -0700
Subject: flag parameters: eventfd

This patch adds the new eventfd2 syscall.  It extends the old eventfd
syscall by one parameter which is meant to hold a flag value.  In this
patch the only flag support is EFD_CLOEXEC which causes the close-on-exec
flag for the returned file descriptor to be set.

A new name EFD_CLOEXEC is introduced which in this implementation must
have the same value as O_CLOEXEC.

The following test must be adjusted for architectures other than x86 and
x86-64 and in case the syscall numbers changed.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_eventfd2
# ifdef __x86_64__
#  define __NR_eventfd2 290
# elif defined __i386__
#  define __NR_eventfd2 328
# else
#  error "need __NR_eventfd2"
# endif
#endif

#define EFD_CLOEXEC O_CLOEXEC

int
main (void)
{
  int fd = syscall (__NR_eventfd2, 1, 0);
  if (fd == -1)
    {
      puts ("eventfd2(0) failed");
      return 1;
    }
  int coe = fcntl (fd, F_GETFD);
  if (coe == -1)
    {
      puts ("fcntl failed");
      return 1;
    }
  if (coe & FD_CLOEXEC)
    {
      puts ("eventfd2(0) sets close-on-exec flag");
      return 1;
    }
  close (fd);

  fd = syscall (__NR_eventfd2, 1, EFD_CLOEXEC);
  if (fd == -1)
    {
      puts ("eventfd2(EFD_CLOEXEC) failed");
      return 1;
    }
  coe = fcntl (fd, F_GETFD);
  if (coe == -1)
    {
      puts ("fcntl failed");
      return 1;
    }
  if ((coe & FD_CLOEXEC) == 0)
    {
      puts ("eventfd2(EFD_CLOEXEC) does not set close-on-exec flag");
      return 1;
    }
  close (fd);

  puts ("OK");

  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[akpm@linux-foundation.org: add sys_ni stub]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys_ni.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8627c89ae9e8..2a361ccdc7ca 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -164,3 +164,4 @@ cond_syscall(sys_timerfd_gettime);
 cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
+cond_syscall(sys_eventfd2);
-- 
cgit v1.2.3


From 4006553b06306b34054529477b06b68a1c66249b Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 23 Jul 2008 21:29:32 -0700
Subject: flag parameters: inotify_init

This patch introduces the new syscall inotify_init1 (note: the 1 stands for
the one parameter the syscall takes, as opposed to no parameter before).  The
values accepted for this parameter are function-specific and defined in the
inotify.h header.  Here the values must match the O_* flags, though.  In this
patch CLOEXEC support is introduced.

The following test must be adjusted for architectures other than x86 and
x86-64 and in case the syscall numbers changed.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_inotify_init1
# ifdef __x86_64__
#  define __NR_inotify_init1 294
# elif defined __i386__
#  define __NR_inotify_init1 332
# else
#  error "need __NR_inotify_init1"
# endif
#endif

#define IN_CLOEXEC O_CLOEXEC

int
main (void)
{
  int fd;
  fd = syscall (__NR_inotify_init1, 0);
  if (fd == -1)
    {
      puts ("inotify_init1(0) failed");
      return 1;
    }
  int coe = fcntl (fd, F_GETFD);
  if (coe == -1)
    {
      puts ("fcntl failed");
      return 1;
    }
  if (coe & FD_CLOEXEC)
    {
      puts ("inotify_init1(0) set close-on-exit");
      return 1;
    }
  close (fd);

  fd = syscall (__NR_inotify_init1, IN_CLOEXEC);
  if (fd == -1)
    {
      puts ("inotify_init1(IN_CLOEXEC) failed");
      return 1;
    }
  coe = fcntl (fd, F_GETFD);
  if (coe == -1)
    {
      puts ("fcntl failed");
      return 1;
    }
  if ((coe & FD_CLOEXEC) == 0)
    {
      puts ("inotify_init1(O_CLOEXEC) does not set close-on-exit");
      return 1;
    }
  close (fd);

  puts ("OK");

  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

[akpm@linux-foundation.org: add sys_ni stub]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys_ni.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2a361ccdc7ca..bd66ac5406f3 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -96,6 +96,7 @@ cond_syscall(sys_keyctl);
 cond_syscall(compat_sys_keyctl);
 cond_syscall(compat_sys_socketcall);
 cond_syscall(sys_inotify_init);
+cond_syscall(sys_inotify_init1);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
 cond_syscall(sys_migrate_pages);
-- 
cgit v1.2.3


From be61a86d7237dd80510615f38ae21d6e1e98660c Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 23 Jul 2008 21:29:40 -0700
Subject: flag parameters: NONBLOCK in pipe

This patch adds O_NONBLOCK support to pipe2.  It is minimally more involved
than the patches for eventfd et.al but still trivial.  The interfaces of the
create_write_pipe and create_read_pipe helper functions were changed and the
one other caller as well.

The following test must be adjusted for architectures other than x86 and
x86-64 and in case the syscall numbers changed.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_pipe2
# ifdef __x86_64__
#  define __NR_pipe2 293
# elif defined __i386__
#  define __NR_pipe2 331
# else
#  error "need __NR_pipe2"
# endif
#endif

int
main (void)
{
  int fds[2];
  if (syscall (__NR_pipe2, fds, 0) == -1)
    {
      puts ("pipe2(0) failed");
      return 1;
    }
  for (int i = 0; i < 2; ++i)
    {
      int fl = fcntl (fds[i], F_GETFL);
      if (fl == -1)
        {
          puts ("fcntl failed");
          return 1;
        }
      if (fl & O_NONBLOCK)
        {
          printf ("pipe2(0) set non-blocking mode for fds[%d]\n", i);
          return 1;
        }
      close (fds[i]);
    }

  if (syscall (__NR_pipe2, fds, O_NONBLOCK) == -1)
    {
      puts ("pipe2(O_NONBLOCK) failed");
      return 1;
    }
  for (int i = 0; i < 2; ++i)
    {
      int fl = fcntl (fds[i], F_GETFL);
      if (fl == -1)
        {
          puts ("fcntl failed");
          return 1;
        }
      if ((fl & O_NONBLOCK) == 0)
        {
          printf ("pipe2(O_NONBLOCK) does not set non-blocking mode for fds[%d]\n", i);
          return 1;
        }
      close (fds[i]);
    }

  puts ("OK");

  return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 90d7af1c1655..2989f67c4446 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -417,12 +417,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
 {
 	struct file *f;
 
-	f = create_write_pipe();
+	f = create_write_pipe(0);
 	if (IS_ERR(f))
 		return PTR_ERR(f);
 	*filp = f;
 
-	f = create_read_pipe(f);
+	f = create_read_pipe(f, 0);
 	if (IS_ERR(f)) {
 		free_write_pipe(*filp);
 		return PTR_ERR(f);
-- 
cgit v1.2.3


From 5df439ef06d4173357711a04740aa8bfcf50d621 Mon Sep 17 00:00:00 2001
From: Wang Chen <wangchen@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:45:23 -0700
Subject: flag parameters: fix compile error of sys_epoll_create1

GEN     .version
  CHK     include/linux/compile.h
  UPD     include/linux/compile.h
  CC      init/version.o
  LD      init/built-in.o
  LD      vmlinux
arch/x86/kernel/built-in.o: In function `sys_call_table':
(.rodata+0x8a4): undefined reference to `sys_epoll_create1'
make: *** [vmlinux] Error 1

Signed-off-by: Wang Chen <wangchen@cn.fujitsu.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys_ni.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bd66ac5406f3..55eca1594da9 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -57,6 +57,7 @@ cond_syscall(compat_sys_set_robust_list);
 cond_syscall(sys_get_robust_list);
 cond_syscall(compat_sys_get_robust_list);
 cond_syscall(sys_epoll_create);
+cond_syscall(sys_epoll_create1);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);
 cond_syscall(sys_epoll_pwait);
-- 
cgit v1.2.3


From 2fc9c4e18f94431e7eb77d97edb2a995b46fba55 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 25 Jul 2008 01:45:34 -0700
Subject: kallsyms: fix potential overflow in binary search

This will probably never trigger... but it won't hurt to be careful.

http://googleresearch.blogspot.com/2006/06/extra-extra-read-all-about-it-nearly.html

Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Joshua Bloch <jjb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kallsyms.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6fc0040f3e3a..38fc10ac7541 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
 	high = kallsyms_num_syms;
 
 	while (high - low > 1) {
-		mid = (low + high) / 2;
+		mid = low + (high - low) / 2;
 		if (kallsyms_addresses[mid] <= addr)
 			low = mid;
 		else
-- 
cgit v1.2.3


From b03f6489f9f27dc519a4c60ebf39cc7b8a58eae7 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 01:45:35 -0700
Subject: build kernel/profile.o only when requested

Build kernel/profile.o only if CONFIG_PROFILING is enabled.

This makes CONFIG_PROFILING=n kernels smaller.

As a bonus, some profile_tick() calls and one branch from schedule() are
now eliminated with CONFIG_PROFILING=n (but I doubt these are
measurable effects).

This patch changes the effects of CONFIG_PROFILING=n, but I don't think
having more than two choices would be the better choice.

This patch also adds the name of the first parameter to the prototypes
of profile_{hits,tick}() since I anyway had to add them for the dummy
functions.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile  | 3 ++-
 kernel/profile.c | 4 ----
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 15ab63ffe64d..54f69837d35a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
+obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
 	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
+obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
diff --git a/kernel/profile.c b/kernel/profile.c
index 58926411eb2a..cd26bed4cc26 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -112,8 +112,6 @@ void __init profile_init(void)
 
 /* Profile event notifications */
 
-#ifdef CONFIG_PROFILING
-
 static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
 static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
 static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
@@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
 }
 EXPORT_SYMBOL_GPL(unregister_timer_hook);
 
-#endif /* CONFIG_PROFILING */
-
 
 #ifdef CONFIG_SMP
 /*
-- 
cgit v1.2.3


From ac331d158e198d2a91a5b0a3ec4ca9991fdb57af Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 25 Jul 2008 01:45:38 -0700
Subject: call_usermodehelper(): increase reliability

Presently call_usermodehelper_setup() uses GFP_ATOMIC.  but it can return
NULL _very_ easily.

GFP_ATOMIC is needed only when we can't sleep.  and, GFP_KERNEL is robust
and better.

thus, I add gfp_mask argument to call_usermodehelper_setup().

So, its callers pass the gfp_t as below:

call_usermodehelper() and call_usermodehelper_keys():
	depend on 'wait' argument.
call_usermodehelper_pipe():
	always GFP_KERNEL because always run under process context.
orderly_poweroff():
	pass to GFP_ATOMIC because may run under interrupt context.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: "Paul Menage" <menage@google.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c | 9 +++++----
 kernel/sys.c  | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2989f67c4446..2456d1a0befb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {}
  * @path: path to usermode executable
  * @argv: arg vector for process
  * @envp: environment for process
+ * @gfp_mask: gfp mask for memory allocation
  *
  * Returns either %NULL on allocation failure, or a subprocess_info
  * structure.  This should be passed to call_usermodehelper_exec to
  * exec the process and free the structure.
  */
-struct subprocess_info *call_usermodehelper_setup(char *path,
-						  char **argv, char **envp)
+struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
+						  char **envp, gfp_t gfp_mask)
 {
 	struct subprocess_info *sub_info;
-	sub_info = kzalloc(sizeof(struct subprocess_info),  GFP_ATOMIC);
+	sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
 	if (!sub_info)
 		goto out;
 
@@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 	struct subprocess_info *sub_info;
 	int ret;
 
-	sub_info = call_usermodehelper_setup(path, argv, envp);
+	sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
 	if (sub_info == NULL)
 		return -ENOMEM;
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 14e97282eb6c..6c2188046048 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1795,7 +1795,7 @@ int orderly_poweroff(bool force)
 		goto out;
 	}
 
-	info = call_usermodehelper_setup(argv[0], argv, envp);
+	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
 	if (info == NULL) {
 		argv_free(argv);
 		goto out;
-- 
cgit v1.2.3


From b69c49b78457f681ecfb3147bd968434ee6559c1 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Fri, 25 Jul 2008 01:45:40 -0700
Subject: clean up duplicated alloc/free_thread_info

We duplicate alloc/free_thread_info defines on many platforms (the
majority uses __get_free_pages/free_pages).  This patch defines common
defines and removes these duplicated defines.
__HAVE_ARCH_THREAD_INFO_ALLOCATOR is introduced for platforms that do
something different.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 552c8d8e77ad..5a5d6fef341d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,6 +93,23 @@ int nr_processes(void)
 static struct kmem_cache *task_struct_cachep;
 #endif
 
+#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+{
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	gfp_t mask = GFP_KERNEL | __GFP_ZERO;
+#else
+	gfp_t mask = GFP_KERNEL;
+#endif
+	return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+}
+
+static inline void free_thread_info(struct thread_info *ti)
+{
+	free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+#endif
+
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;
 
-- 
cgit v1.2.3


From a8f18b909c0a3f22630846207035c8b84bb252b8 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jul 2008 01:45:53 -0700
Subject: Add a WARN() macro; this is WARN_ON() + printk arguments

Add a WARN() macro that acts like WARN_ON(), with the added feature that it
takes a printk like argument that is printed as part of the warning message.

[akpm@linux-foundation.org: fix printk arguments]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Greg KH <greg@kroah.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/panic.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..12c5a0a6c89b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -318,6 +318,28 @@ void warn_on_slowpath(const char *file, int line)
 	add_taint(TAINT_WARN);
 }
 EXPORT_SYMBOL(warn_on_slowpath);
+
+
+void warn_slowpath(const char *file, int line, const char *fmt, ...)
+{
+	va_list args;
+	char function[KSYM_SYMBOL_LEN];
+	unsigned long caller = (unsigned long)__builtin_return_address(0);
+	sprint_symbol(function, caller);
+
+	printk(KERN_WARNING "------------[ cut here ]------------\n");
+	printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
+		line, function);
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+
+	print_modules();
+	dump_stack();
+	print_oops_end_marker();
+	add_taint(TAINT_WARN);
+}
+EXPORT_SYMBOL(warn_slowpath);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-- 
cgit v1.2.3


From 7a2c477069fbd32f91598f05334003979b987a39 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jul 2008 01:45:54 -0700
Subject: kernel/irq/manage.c: replace a printk + WARN_ON() to a WARN()

Replace a printk+WARN_ON() by a WARN(); this increases the chance of the
string making it into the bugreport (ie: it goes inside the
---[ cut here ]--- section)

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/manage.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5bc6e5ecc493..f8914b92b664 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -260,9 +260,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
 		}
 	} else {
 		if (desc->wake_depth == 0) {
-			printk(KERN_WARNING "Unbalanced IRQ %d "
-					"wake disable\n", irq);
-			WARN_ON(1);
+			WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
 		} else if (--desc->wake_depth == 0) {
 			ret = set_irq_wake_real(irq, on);
 			if (ret)
-- 
cgit v1.2.3


From 717115e1a5856b57af0f71e1df7149108294fc10 Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Fri, 25 Jul 2008 01:45:58 -0700
Subject: printk ratelimiting rewrite

All ratelimit user use same jiffies and burst params, so some messages
(callbacks) will be lost.

For example:
a call printk_ratelimit(5 * HZ, 1)
b call printk_ratelimit(5 * HZ, 1) before the 5*HZ timeout of a, then b will
will be supressed.

- rewrite __ratelimit, and use a ratelimit_state as parameter.  Thanks for
  hints from andrew.

- Add WARN_ON_RATELIMIT, update rcupreempt.h

- remove __printk_ratelimit

- use __ratelimit in net_ratelimit

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 17 +++--------------
 kernel/sysctl.c |  4 ++--
 2 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 3f7a2a94583b..a7f7559c5f6c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1308,6 +1308,8 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 }
 
 #if defined CONFIG_PRINTK
+
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
 /*
  * printk rate limiting, lifted from the networking subsystem.
  *
@@ -1315,22 +1317,9 @@ void tty_write_message(struct tty_struct *tty, char *msg)
  * every printk_ratelimit_jiffies to make a denial-of-service
  * attack impossible.
  */
-int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
-{
-	return __ratelimit(ratelimit_jiffies, ratelimit_burst);
-}
-EXPORT_SYMBOL(__printk_ratelimit);
-
-/* minimum time in jiffies between messages */
-int printk_ratelimit_jiffies = 5 * HZ;
-
-/* number of messages we send before ratelimiting */
-int printk_ratelimit_burst = 10;
-
 int printk_ratelimit(void)
 {
-	return __printk_ratelimit(printk_ratelimit_jiffies,
-				printk_ratelimit_burst);
+	return __ratelimit(&printk_ratelimit_state);
 }
 EXPORT_SYMBOL(printk_ratelimit);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1a8299d1fe59..35a50db9b6ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -624,7 +624,7 @@ static struct ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_PRINTK_RATELIMIT,
 		.procname	= "printk_ratelimit",
-		.data		= &printk_ratelimit_jiffies,
+		.data		= &printk_ratelimit_state.interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_jiffies,
@@ -633,7 +633,7 @@ static struct ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_PRINTK_RATELIMIT_BURST,
 		.procname	= "printk_ratelimit_burst",
-		.data		= &printk_ratelimit_burst,
+		.data		= &printk_ratelimit_state.burst,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
-- 
cgit v1.2.3


From ef53d9c5e4da147ecaa43c44c5e5945eb83970a2 Mon Sep 17 00:00:00 2001
From: Srinivasa D S <srinivasa@in.ibm.com>
Date: Fri, 25 Jul 2008 01:46:04 -0700
Subject: kprobes: improve kretprobe scalability with hashed locking

Currently list of kretprobe instances are stored in kretprobe object (as
used_instances,free_instances) and in kretprobe hash table.  We have one
global kretprobe lock to serialise the access to these lists.  This causes
only one kretprobe handler to execute at a time.  Hence affects system
performance, particularly on SMP systems and when return probe is set on
lot of functions (like on all systemcalls).

Solution proposed here gives fine-grain locks that performs better on SMP
system compared to present kretprobe implementation.

Solution:

 1) Instead of having one global lock to protect kretprobe instances
    present in kretprobe object and kretprobe hash table.  We will have
    two locks, one lock for protecting kretprobe hash table and another
    lock for kretporbe object.

 2) We hold lock present in kretprobe object while we modify kretprobe
    instance in kretprobe object and we hold per-hash-list lock while
    modifying kretprobe instances present in that hash list.  To prevent
    deadlock, we never grab a per-hash-list lock while holding a kretprobe
    lock.

 3) We can remove used_instances from struct kretprobe, as we can
    track used instances of kretprobe instances using kretprobe hash
    table.

Time duration for kernel compilation ("make -j 8") on a 8-way ppc64 system
with return probes set on all systemcalls looks like this.

cacheline              non-cacheline             Un-patched kernel
aligned patch 	       aligned patch
===============================================================================
real    9m46.784s       9m54.412s                  10m2.450s
user    40m5.715s       40m7.142s                  40m4.273s
sys     2m57.754s       2m58.583s                  3m17.430s
===========================================================

Time duration for kernel compilation ("make -j 8) on the same system, when
kernel is not probed.
=========================
real    9m26.389s
user    40m8.775s
sys     2m7.283s
=========================

Signed-off-by: Srinivasa DS <srinivasa@in.ibm.com>
Signed-off-by: Jim Keniston <jkenisto@us.ibm.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 127 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 89 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1485ca8d0e00..cb0b3bde3617 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
 	addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
 #endif
 
+static int kprobes_initialized;
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 static bool kprobe_enabled;
 
 DEFINE_MUTEX(kprobe_mutex);		/* Protects kprobe_table */
-DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
+static struct {
+	spinlock_t lock ____cacheline_aligned;
+} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+
+static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+{
+	return &(kretprobe_table_locks[hash].lock);
+}
 
 /*
  * Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
 	return;
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
 				struct hlist_head *head)
 {
+	struct kretprobe *rp = ri->rp;
+
 	/* remove rp inst off the rprobe_inst_table */
 	hlist_del(&ri->hlist);
-	if (ri->rp) {
-		/* remove rp inst off the used list */
-		hlist_del(&ri->uflist);
-		/* put rp inst back onto the free list */
-		INIT_HLIST_NODE(&ri->uflist);
-		hlist_add_head(&ri->uflist, &ri->rp->free_instances);
+	INIT_HLIST_NODE(&ri->hlist);
+	if (likely(rp)) {
+		spin_lock(&rp->lock);
+		hlist_add_head(&ri->hlist, &rp->free_instances);
+		spin_unlock(&rp->lock);
 	} else
 		/* Unregistering */
 		hlist_add_head(&ri->hlist, head);
 }
 
-struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
+void kretprobe_hash_lock(struct task_struct *tsk,
+			 struct hlist_head **head, unsigned long *flags)
+{
+	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+	spinlock_t *hlist_lock;
+
+	*head = &kretprobe_inst_table[hash];
+	hlist_lock = kretprobe_table_lock_ptr(hash);
+	spin_lock_irqsave(hlist_lock, *flags);
+}
+
+void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
 {
-	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+	spin_lock_irqsave(hlist_lock, *flags);
+}
+
+void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
+{
+	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
+	spinlock_t *hlist_lock;
+
+	hlist_lock = kretprobe_table_lock_ptr(hash);
+	spin_unlock_irqrestore(hlist_lock, *flags);
+}
+
+void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
+{
+	spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+	spin_unlock_irqrestore(hlist_lock, *flags);
 }
 
 /*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
 	struct kretprobe_instance *ri;
 	struct hlist_head *head, empty_rp;
 	struct hlist_node *node, *tmp;
-	unsigned long flags = 0;
+	unsigned long hash, flags = 0;
 
-	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(tk);
+	if (unlikely(!kprobes_initialized))
+		/* Early boot.  kretprobe_table_locks not yet initialized. */
+		return;
+
+	hash = hash_ptr(tk, KPROBE_HASH_BITS);
+	head = &kretprobe_inst_table[hash];
+	kretprobe_table_lock(hash, &flags);
 	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
 		if (ri->task == tk)
 			recycle_rp_inst(ri, &empty_rp);
 	}
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
-
+	kretprobe_table_unlock(hash, &flags);
+	INIT_HLIST_HEAD(&empty_rp);
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
 		hlist_del(&ri->hlist);
 		kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
 	struct kretprobe_instance *ri;
 	struct hlist_node *pos, *next;
 
-	hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
-		hlist_del(&ri->uflist);
+	hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+		hlist_del(&ri->hlist);
 		kfree(ri);
 	}
 }
 
 static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
 {
-	unsigned long flags;
+	unsigned long flags, hash;
 	struct kretprobe_instance *ri;
 	struct hlist_node *pos, *next;
+	struct hlist_head *head;
+
 	/* No race here */
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
-		ri->rp = NULL;
-		hlist_del(&ri->uflist);
+	for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
+		kretprobe_table_lock(hash, &flags);
+		head = &kretprobe_inst_table[hash];
+		hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+			if (ri->rp == rp)
+				ri->rp = NULL;
+		}
+		kretprobe_table_unlock(hash, &flags);
 	}
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
 	free_rp_inst(rp);
 }
 
@@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 					   struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
-	unsigned long flags = 0;
+	unsigned long hash, flags = 0;
+	struct kretprobe_instance *ri;
 
 	/*TODO: consider to only swap the RA after the last pre_handler fired */
-	spin_lock_irqsave(&kretprobe_lock, flags);
+	hash = hash_ptr(current, KPROBE_HASH_BITS);
+	spin_lock_irqsave(&rp->lock, flags);
 	if (!hlist_empty(&rp->free_instances)) {
-		struct kretprobe_instance *ri;
-
 		ri = hlist_entry(rp->free_instances.first,
-				 struct kretprobe_instance, uflist);
+				struct kretprobe_instance, hlist);
+		hlist_del(&ri->hlist);
+		spin_unlock_irqrestore(&rp->lock, flags);
+
 		ri->rp = rp;
 		ri->task = current;
 
 		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-			spin_unlock_irqrestore(&kretprobe_lock, flags);
+			spin_unlock_irqrestore(&rp->lock, flags);
 			return 0;
 		}
 
 		arch_prepare_kretprobe(ri, regs);
 
 		/* XXX(hch): why is there no hlist_move_head? */
-		hlist_del(&ri->uflist);
-		hlist_add_head(&ri->uflist, &ri->rp->used_instances);
-		hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
-	} else
+		INIT_HLIST_NODE(&ri->hlist);
+		kretprobe_table_lock(hash, &flags);
+		hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
+		kretprobe_table_unlock(hash, &flags);
+	} else {
 		rp->nmissed++;
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+		spin_unlock_irqrestore(&rp->lock, flags);
+	}
 	return 0;
 }
 
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
 		rp->maxactive = NR_CPUS;
 #endif
 	}
-	INIT_HLIST_HEAD(&rp->used_instances);
+	spin_lock_init(&rp->lock);
 	INIT_HLIST_HEAD(&rp->free_instances);
 	for (i = 0; i < rp->maxactive; i++) {
 		inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
 			free_rp_inst(rp);
 			return -ENOMEM;
 		}
-		INIT_HLIST_NODE(&inst->uflist);
-		hlist_add_head(&inst->uflist, &rp->free_instances);
+		INIT_HLIST_NODE(&inst->hlist);
+		hlist_add_head(&inst->hlist, &rp->free_instances);
 	}
 
 	rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		INIT_HLIST_HEAD(&kprobe_table[i]);
 		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+		spin_lock_init(&(kretprobe_table_locks[i].lock));
 	}
 
 	/*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
 	err = arch_init_kprobes();
 	if (!err)
 		err = register_die_notifier(&kprobe_exceptions_nb);
+	kprobes_initialized = (err == 0);
 
 	if (!err)
 		init_test_probes();
-- 
cgit v1.2.3


From 8b6dd986823a8d92ed9f54baa5cef8604d9d9d44 Mon Sep 17 00:00:00 2001
From: Abhishek Sagar <sagar.abhishek@gmail.com>
Date: Fri, 25 Jul 2008 01:46:05 -0700
Subject: kprobes: remove redundant config check

I noticed that there's a CONFIG_KPROBES check inside kernel/kprobes.c,
which is redundant.

Signed-off-by: Abhishek Sagar <sagar.abhishek@gmail.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index cb0b3bde3617..75bc2cd9ebc6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1337,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe);
 EXPORT_SYMBOL_GPL(unregister_jprobe);
 EXPORT_SYMBOL_GPL(register_jprobes);
 EXPORT_SYMBOL_GPL(unregister_jprobes);
-#ifdef CONFIG_KPROBES
 EXPORT_SYMBOL_GPL(jprobe_return);
-#endif
-
-#ifdef CONFIG_KPROBES
 EXPORT_SYMBOL_GPL(register_kretprobe);
 EXPORT_SYMBOL_GPL(unregister_kretprobe);
 EXPORT_SYMBOL_GPL(register_kretprobes);
 EXPORT_SYMBOL_GPL(unregister_kretprobes);
-#endif
-- 
cgit v1.2.3


From 7e9abd89cbdf9b73d327d8173343abce9022609b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:46:54 -0700
Subject: cgroup: use read lock to guard find_existing_css_set()

The function does not modify anything (except the temporary css template), so
it's sufficient to hold read lock.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4d..f50edadfdd86 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -415,11 +415,11 @@ static struct css_set *find_css_set(
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
-	write_lock(&css_set_lock);
+	read_lock(&css_set_lock);
 	res = find_existing_css_set(oldcg, cgrp, template);
 	if (res)
 		get_css_set(res);
-	write_unlock(&css_set_lock);
+	read_unlock(&css_set_lock);
 
 	if (res)
 		return res;
-- 
cgit v1.2.3


From 71cbb949d17d4d776abd547135feb7f3282405c8 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 25 Jul 2008 01:46:55 -0700
Subject: cgroup: list_for_each cleanup

--------------------------
while() {
	list_entry();
	...
}
--------------------------

is equivalent to following code.

--------------------------
list_for_each_entry(){
	...
}
--------------------------

later can review easily more.

this patch is just clean up.
it doesn't have any behavor change.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f50edadfdd86..6836a9063634 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -241,17 +241,20 @@ static int use_task_css_set_links;
  */
 static void unlink_css_set(struct css_set *cg)
 {
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
+
 	write_lock(&css_set_lock);
 	hlist_del(&cg->hlist);
 	css_set_count--;
-	while (!list_empty(&cg->cg_links)) {
-		struct cg_cgroup_link *link;
-		link = list_entry(cg->cg_links.next,
-				  struct cg_cgroup_link, cg_link_list);
+
+	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+				 cg_link_list) {
 		list_del(&link->cg_link_list);
 		list_del(&link->cgrp_link_list);
 		kfree(link);
 	}
+
 	write_unlock(&css_set_lock);
 }
 
@@ -363,15 +366,14 @@ static struct css_set *find_existing_css_set(
 static int allocate_cg_links(int count, struct list_head *tmp)
 {
 	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
 	int i;
 	INIT_LIST_HEAD(tmp);
 	for (i = 0; i < count; i++) {
 		link = kmalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			while (!list_empty(tmp)) {
-				link = list_entry(tmp->next,
-						  struct cg_cgroup_link,
-						  cgrp_link_list);
+			list_for_each_entry_safe(link, saved_link, tmp,
+						 cgrp_link_list) {
 				list_del(&link->cgrp_link_list);
 				kfree(link);
 			}
@@ -384,11 +386,10 @@ static int allocate_cg_links(int count, struct list_head *tmp)
 
 static void free_cg_links(struct list_head *tmp)
 {
-	while (!list_empty(tmp)) {
-		struct cg_cgroup_link *link;
-		link = list_entry(tmp->next,
-				  struct cg_cgroup_link,
-				  cgrp_link_list);
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
+
+	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
 		list_del(&link->cgrp_link_list);
 		kfree(link);
 	}
@@ -1093,6 +1094,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
 	int ret;
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
 
 	BUG_ON(!root);
 
@@ -1112,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	 * root cgroup
 	 */
 	write_lock(&css_set_lock);
-	while (!list_empty(&cgrp->css_sets)) {
-		struct cg_cgroup_link *link;
-		link = list_entry(cgrp->css_sets.next,
-				  struct cg_cgroup_link, cgrp_link_list);
+
+	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
+				 cgrp_link_list) {
 		list_del(&link->cg_link_list);
 		list_del(&link->cgrp_link_list);
 		kfree(link);
@@ -1756,15 +1758,11 @@ int cgroup_add_files(struct cgroup *cgrp,
 int cgroup_task_count(const struct cgroup *cgrp)
 {
 	int count = 0;
-	struct list_head *l;
+	struct cg_cgroup_link *link;
 
 	read_lock(&css_set_lock);
-	l = cgrp->css_sets.next;
-	while (l != &cgrp->css_sets) {
-		struct cg_cgroup_link *link =
-			list_entry(l, struct cg_cgroup_link, cgrp_link_list);
+	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
 		count += atomic_read(&link->cg->ref.refcount);
-		l = l->next;
 	}
 	read_unlock(&css_set_lock);
 	return count;
-- 
cgit v1.2.3


From 8947f9d5b361ce927be6d5c11fed57905b7a4100 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:46:56 -0700
Subject: cgroups: annotate two variables with __read_mostly

- need_forkexit_callback will be read only after system boot.
- use_task_css_set_links will be read only after it's set.

And these 2 variables are checked when a new process is forked.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6836a9063634..70d083c6fb6b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -118,7 +118,7 @@ static int root_count;
  * extra work in the fork/exit path if none of the subsystems need to
  * be called.
  */
-static int need_forkexit_callback;
+static int need_forkexit_callback __read_mostly;
 static int need_mm_owner_callback __read_mostly;
 
 /* convenient tests for these bits */
@@ -220,7 +220,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
  * task until after the first call to cgroup_iter_start(). This
  * reduces the fork()/exit() overhead for people who have cgroups
  * compiled into their kernel but not actually in use */
-static int use_task_css_set_links;
+static int use_task_css_set_links __read_mostly;
 
 /* When we create or destroy a css_set, the operation simply
  * takes/releases a reference count on all the cgroups referenced
-- 
cgit v1.2.3


From db3b14978abc02041046ed8353f0899cb58ffffc Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Fri, 25 Jul 2008 01:46:58 -0700
Subject: cgroup files: add write_string cgroup control file method

This patch adds a write_string() method for cgroups control files. The
semantics are that a buffer is copied from userspace to kernelspace
and the handler function invoked on that buffer.  The buffer is
guaranteed to be nul-terminated, and no longer than max_write_len
(defaulting to 64 bytes if unspecified). Later patches will convert
existing raw file write handlers in control group subsystems to use
this method.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Balbir Singh <balbir@in.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 70d083c6fb6b..3a99cc2df860 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1363,6 +1363,39 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 	return retval;
 }
 
+static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
+				   struct file *file,
+				   const char __user *userbuf,
+				   size_t nbytes, loff_t *unused_ppos)
+{
+	char local_buffer[64];
+	int retval = 0;
+	size_t max_bytes = cft->max_write_len;
+	char *buffer = local_buffer;
+
+	if (!max_bytes)
+		max_bytes = sizeof(local_buffer) - 1;
+	if (nbytes >= max_bytes)
+		return -E2BIG;
+	/* Allocate a dynamic buffer if we need one */
+	if (nbytes >= sizeof(local_buffer)) {
+		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
+		if (buffer == NULL)
+			return -ENOMEM;
+	}
+	if (nbytes && copy_from_user(buffer, userbuf, nbytes))
+		return -EFAULT;
+
+	buffer[nbytes] = 0;     /* nul-terminate */
+	strstrip(buffer);
+	retval = cft->write_string(cgrp, cft, buffer);
+	if (!retval)
+		retval = nbytes;
+	if (buffer != local_buffer)
+		kfree(buffer);
+	return retval;
+}
+
 static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
 					   struct cftype *cft,
 					   struct file *file,
@@ -1440,6 +1473,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
 	if (cft->write_u64 || cft->write_s64)
 		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->write_string)
+		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
 	if (cft->trigger) {
 		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
 		return ret ? ret : nbytes;
-- 
cgit v1.2.3


From e788e066c651b1bbf4a927dc95395c1aa13be436 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Fri, 25 Jul 2008 01:46:59 -0700
Subject: cgroup files: move the release_agent file to use typed handlers

Adds cgroup_release_agent_write() and cgroup_release_agent_show()
methods to handle writing/reading the path to a cgroup hierarchy's
release agent. As a result, cgroup_common_file_read() is now unnecessary.

As part of the change, a previously-tolerated race in
cgroup_release_agent() is avoided by copying the current
release_agent_path prior to calling call_usermode_helper().

Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 125 ++++++++++++++++++++++++++------------------------------
 1 file changed, 57 insertions(+), 68 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3a99cc2df860..0120b5d67a73 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -89,11 +89,7 @@ struct cgroupfs_root {
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
-	/* The path to use for release notifications. No locking
-	 * between setting and use - so if userspace updates this
-	 * while child cgroups exist, you could miss a
-	 * notification. We ensure that it's always a valid
-	 * NUL-terminated string */
+	/* The path to use for release notifications. */
 	char release_agent_path[PATH_MAX];
 };
 
@@ -1329,6 +1325,45 @@ enum cgroup_filetype {
 	FILE_RELEASE_AGENT,
 };
 
+/**
+ * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
+ * @cgrp: the cgroup to be checked for liveness
+ *
+ * Returns true (with lock held) on success, or false (with no lock
+ * held) on failure.
+ */
+int cgroup_lock_live_group(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_mutex);
+	if (cgroup_is_removed(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		return false;
+	}
+	return true;
+}
+
+static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
+				      const char *buffer)
+{
+	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	strcpy(cgrp->root->release_agent_path, buffer);
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
+				     struct seq_file *seq)
+{
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	seq_puts(seq, cgrp->root->release_agent_path);
+	seq_putc(seq, '\n');
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 				struct file *file,
 				const char __user *userbuf,
@@ -1443,10 +1478,6 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
 		else
 			clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 		break;
-	case FILE_RELEASE_AGENT:
-		BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-		strcpy(cgrp->root->release_agent_path, buffer);
-		break;
 	default:
 		retval = -EINVAL;
 		goto out2;
@@ -1506,49 +1537,6 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
-static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
-					  struct cftype *cft,
-					  struct file *file,
-					  char __user *buf,
-					  size_t nbytes, loff_t *ppos)
-{
-	enum cgroup_filetype type = cft->private;
-	char *page;
-	ssize_t retval = 0;
-	char *s;
-
-	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
-
-	s = page;
-
-	switch (type) {
-	case FILE_RELEASE_AGENT:
-	{
-		struct cgroupfs_root *root;
-		size_t n;
-		mutex_lock(&cgroup_mutex);
-		root = cgrp->root;
-		n = strnlen(root->release_agent_path,
-			    sizeof(root->release_agent_path));
-		n = min(n, (size_t) PAGE_SIZE);
-		strncpy(s, root->release_agent_path, n);
-		mutex_unlock(&cgroup_mutex);
-		s += n;
-		break;
-	}
-	default:
-		retval = -EINVAL;
-		goto out;
-	}
-	*s++ = '\n';
-
-	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
-out:
-	free_page((unsigned long)page);
-	return retval;
-}
-
 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 				   size_t nbytes, loff_t *ppos)
 {
@@ -1606,6 +1594,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file)
 
 static struct file_operations cgroup_seqfile_operations = {
 	.read = seq_read,
+	.write = cgroup_file_write,
 	.llseek = seq_lseek,
 	.release = cgroup_seqfile_release,
 };
@@ -2283,8 +2272,9 @@ static struct cftype files[] = {
 
 static struct cftype cft_release_agent = {
 	.name = "release_agent",
-	.read = cgroup_common_file_read,
-	.write = cgroup_common_file_write,
+	.read_seq_string = cgroup_release_agent_show,
+	.write_string = cgroup_release_agent_write,
+	.max_write_len = PATH_MAX,
 	.private = FILE_RELEASE_AGENT,
 };
 
@@ -3111,27 +3101,24 @@ static void cgroup_release_agent(struct work_struct *work)
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
 		int i;
-		char *pathbuf;
+		char *pathbuf = NULL, *agentbuf = NULL;
 		struct cgroup *cgrp = list_entry(release_list.next,
 						    struct cgroup,
 						    release_list);
 		list_del_init(&cgrp->release_list);
 		spin_unlock(&release_list_lock);
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-		if (!pathbuf) {
-			spin_lock(&release_list_lock);
-			continue;
-		}
-
-		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
-			kfree(pathbuf);
-			spin_lock(&release_list_lock);
-			continue;
-		}
+		if (!pathbuf)
+			goto continue_free;
+		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+			goto continue_free;
+		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+		if (!agentbuf)
+			goto continue_free;
 
 		i = 0;
-		argv[i++] = cgrp->root->release_agent_path;
-		argv[i++] = (char *)pathbuf;
+		argv[i++] = agentbuf;
+		argv[i++] = pathbuf;
 		argv[i] = NULL;
 
 		i = 0;
@@ -3145,8 +3132,10 @@ static void cgroup_release_agent(struct work_struct *work)
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
 		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-		kfree(pathbuf);
 		mutex_lock(&cgroup_mutex);
+ continue_free:
+		kfree(pathbuf);
+		kfree(agentbuf);
 		spin_lock(&release_list_lock);
 	}
 	spin_unlock(&release_list_lock);
-- 
cgit v1.2.3


From 84eea842886ac35020be6043e04748ed22014359 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Fri, 25 Jul 2008 01:47:00 -0700
Subject: cgroups: misc cleanups to write_string patchset

This patch contains cleanups suggested by reviewers for the recent
write_string() patchset:

- pair cgroup_lock_live_group() with cgroup_unlock() in cgroup.c for
  clarity, rather than directly unlocking cgroup_mutex.

- make the return type of cgroup_lock_live_group() a bool

- use a #define'd constant for the local buffer size in read/write functions

Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0120b5d67a73..a14122ecaa5e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1329,10 +1329,10 @@ enum cgroup_filetype {
  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
  * @cgrp: the cgroup to be checked for liveness
  *
- * Returns true (with lock held) on success, or false (with no lock
- * held) on failure.
+ * On success, returns true; the lock should be later released with
+ * cgroup_unlock(). On failure returns false with no lock held.
  */
-int cgroup_lock_live_group(struct cgroup *cgrp)
+bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	mutex_lock(&cgroup_mutex);
 	if (cgroup_is_removed(cgrp)) {
@@ -1349,7 +1349,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 	strcpy(cgrp->root->release_agent_path, buffer);
-	mutex_unlock(&cgroup_mutex);
+	cgroup_unlock();
 	return 0;
 }
 
@@ -1360,16 +1360,19 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 		return -ENODEV;
 	seq_puts(seq, cgrp->root->release_agent_path);
 	seq_putc(seq, '\n');
-	mutex_unlock(&cgroup_mutex);
+	cgroup_unlock();
 	return 0;
 }
 
+/* A buffer size big enough for numbers or short strings */
+#define CGROUP_LOCAL_BUFFER_SIZE 64
+
 static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 				struct file *file,
 				const char __user *userbuf,
 				size_t nbytes, loff_t *unused_ppos)
 {
-	char buffer[64];
+	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
 	char *end;
 
@@ -1403,7 +1406,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 				   const char __user *userbuf,
 				   size_t nbytes, loff_t *unused_ppos)
 {
-	char local_buffer[64];
+	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
 	size_t max_bytes = cft->max_write_len;
 	char *buffer = local_buffer;
@@ -1518,7 +1521,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
 			       char __user *buf, size_t nbytes,
 			       loff_t *ppos)
 {
-	char tmp[64];
+	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
 	u64 val = cft->read_u64(cgrp, cft);
 	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
 
@@ -1530,7 +1533,7 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
 			       char __user *buf, size_t nbytes,
 			       loff_t *ppos)
 {
-	char tmp[64];
+	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
 	s64 val = cft->read_s64(cgrp, cft);
 	int len = sprintf(tmp, "%lld\n", (long long) val);
 
-- 
cgit v1.2.3


From 6379c106152388f7ea45d6dda63edda0e9181fc8 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Fri, 25 Jul 2008 01:47:01 -0700
Subject: cgroup files: move notify_on_release file to separate write handler

This patch moves the write handler for the cgroups notify_on_release
file into a separate handler. This handler requires no cgroups locking
since it relies on atomic bitops for synchronization.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a14122ecaa5e..d597d3015786 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1474,13 +1474,6 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
 	case FILE_TASKLIST:
 		retval = attach_task_by_pid(cgrp, buffer);
 		break;
-	case FILE_NOTIFY_ON_RELEASE:
-		clear_bit(CGRP_RELEASABLE, &cgrp->flags);
-		if (simple_strtoul(buffer, NULL, 10) != 0)
-			set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-		else
-			clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-		break;
 	default:
 		retval = -EINVAL;
 		goto out2;
@@ -2252,6 +2245,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
 	return notify_on_release(cgrp);
 }
 
+static int cgroup_write_notify_on_release(struct cgroup *cgrp,
+					  struct cftype *cft,
+					  u64 val)
+{
+	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
+	if (val)
+		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+	else
+		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+	return 0;
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -2268,7 +2273,7 @@ static struct cftype files[] = {
 	{
 		.name = "notify_on_release",
 		.read_u64 = cgroup_read_notify_on_release,
-		.write = cgroup_common_file_write,
+		.write_u64 = cgroup_write_notify_on_release,
 		.private = FILE_NOTIFY_ON_RELEASE,
 	},
 };
-- 
cgit v1.2.3


From af351026aafc8da16518a02b41c66d3e0c1cdef4 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Fri, 25 Jul 2008 01:47:01 -0700
Subject: cgroup files: turn attach_task_by_pid directly into a cgroup write
 handler

This patch changes attach_task_by_pid() to take a u64 rather than a
string; as a result it can be called directly as a control groups
write_u64 handler, and cgroup_common_file_write() can be removed.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 80 ++++++++++-----------------------------------------------
 1 file changed, 14 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d597d3015786..86b71e714e13 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -504,10 +504,6 @@ static struct css_set *find_css_set(
  * knows that the cgroup won't be removed, as cgroup_rmdir()
  * needs that mutex.
  *
- * The cgroup_common_file_write handler for operations that modify
- * the cgroup hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cgroup modifications across the system.
- *
  * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
  * (usually) take cgroup_mutex.  These are the two most performance
  * critical pieces of code here.  The exception occurs on cgroup_exit(),
@@ -1279,18 +1275,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 }
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
- * cgroup_mutex, may take task_lock of task
+ * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
+ * held. May take task_lock of task
  */
-static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
-	pid_t pid;
 	struct task_struct *tsk;
 	int ret;
 
-	if (sscanf(pidbuf, "%d", &pid) != 1)
-		return -EIO;
-
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
@@ -1316,6 +1308,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
 	return ret;
 }
 
+static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+{
+	int ret;
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	ret = attach_task_by_pid(cgrp, pid);
+	cgroup_unlock();
+	return ret;
+}
+
 /* The various types of files and directories in a cgroup file system */
 enum cgroup_filetype {
 	FILE_ROOT,
@@ -1434,60 +1436,6 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 	return retval;
 }
 
-static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
-					   struct cftype *cft,
-					   struct file *file,
-					   const char __user *userbuf,
-					   size_t nbytes, loff_t *unused_ppos)
-{
-	enum cgroup_filetype type = cft->private;
-	char *buffer;
-	int retval = 0;
-
-	if (nbytes >= PATH_MAX)
-		return -E2BIG;
-
-	/* +1 for nul-terminator */
-	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-	if (buffer == NULL)
-		return -ENOMEM;
-
-	if (copy_from_user(buffer, userbuf, nbytes)) {
-		retval = -EFAULT;
-		goto out1;
-	}
-	buffer[nbytes] = 0;	/* nul-terminate */
-	strstrip(buffer);	/* strip -just- trailing whitespace */
-
-	mutex_lock(&cgroup_mutex);
-
-	/*
-	 * This was already checked for in cgroup_file_write(), but
-	 * check again now we're holding cgroup_mutex.
-	 */
-	if (cgroup_is_removed(cgrp)) {
-		retval = -ENODEV;
-		goto out2;
-	}
-
-	switch (type) {
-	case FILE_TASKLIST:
-		retval = attach_task_by_pid(cgrp, buffer);
-		break;
-	default:
-		retval = -EINVAL;
-		goto out2;
-	}
-
-	if (retval == 0)
-		retval = nbytes;
-out2:
-	mutex_unlock(&cgroup_mutex);
-out1:
-	kfree(buffer);
-	return retval;
-}
-
 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 						size_t nbytes, loff_t *ppos)
 {
@@ -2265,7 +2213,7 @@ static struct cftype files[] = {
 		.name = "tasks",
 		.open = cgroup_tasks_open,
 		.read = cgroup_tasks_read,
-		.write = cgroup_common_file_write,
+		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
 	},
-- 
cgit v1.2.3


From e37123953292146445c8629b3950d0513fd10ae2 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Fri, 25 Jul 2008 01:47:02 -0700
Subject: cgroup files: remove cpuset_common_file_write()

This patch tweaks the signatures of the update_cpumask() and
update_nodemask() functions so that they can be called directly as
handlers for the new cgroups write_string() method.

This allows cpuset_common_file_write() to be removed.

Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 109 ++++++++++++++++++--------------------------------------
 1 file changed, 35 insertions(+), 74 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5738910c34c..276ce7e4f1ab 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -227,10 +227,6 @@ static struct cpuset top_cpuset = {
  * The task_struct fields mems_allowed and mems_generation may only
  * be accessed in the context of that task, so require no locks.
  *
- * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cpuset modifications across the system.
- *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
@@ -772,7 +768,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
  * @cs: the cpuset to consider
  * @buf: buffer of cpu numbers written to this cpuset
  */
-static int update_cpumask(struct cpuset *cs, char *buf)
+static int update_cpumask(struct cpuset *cs, const char *buf)
 {
 	struct cpuset trialcs;
 	struct cgroup_scanner scan;
@@ -792,7 +788,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	 * that parsing.  The validate_change() call ensures that cpusets
 	 * with tasks have cpus.
 	 */
-	buf = strstrip(buf);
 	if (!*buf) {
 		cpus_clear(trialcs.cpus_allowed);
 	} else {
@@ -902,7 +897,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 
 static void *cpuset_being_rebound;
 
-static int update_nodemask(struct cpuset *cs, char *buf)
+static int update_nodemask(struct cpuset *cs, const char *buf)
 {
 	struct cpuset trialcs;
 	nodemask_t oldmem;
@@ -929,7 +924,6 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	 * that parsing.  The validate_change() call ensures that cpusets
 	 * with tasks have memory.
 	 */
-	buf = strstrip(buf);
 	if (!*buf) {
 		nodes_clear(trialcs.mems_allowed);
 	} else {
@@ -1256,72 +1250,14 @@ typedef enum {
 	FILE_SPREAD_SLAB,
 } cpuset_filetype_t;
 
-static ssize_t cpuset_common_file_write(struct cgroup *cont,
-					struct cftype *cft,
-					struct file *file,
-					const char __user *userbuf,
-					size_t nbytes, loff_t *unused_ppos)
-{
-	struct cpuset *cs = cgroup_cs(cont);
-	cpuset_filetype_t type = cft->private;
-	char *buffer;
-	int retval = 0;
-
-	/* Crude upper limit on largest legitimate cpulist user might write. */
-	if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
-		return -E2BIG;
-
-	/* +1 for nul-terminator */
-	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	if (copy_from_user(buffer, userbuf, nbytes)) {
-		retval = -EFAULT;
-		goto out1;
-	}
-	buffer[nbytes] = 0;	/* nul-terminate */
-
-	cgroup_lock();
-
-	if (cgroup_is_removed(cont)) {
-		retval = -ENODEV;
-		goto out2;
-	}
-
-	switch (type) {
-	case FILE_CPULIST:
-		retval = update_cpumask(cs, buffer);
-		break;
-	case FILE_MEMLIST:
-		retval = update_nodemask(cs, buffer);
-		break;
-	default:
-		retval = -EINVAL;
-		goto out2;
-	}
-
-	if (retval == 0)
-		retval = nbytes;
-out2:
-	cgroup_unlock();
-out1:
-	kfree(buffer);
-	return retval;
-}
-
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
 	int retval = 0;
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 
-	cgroup_lock();
-
-	if (cgroup_is_removed(cgrp)) {
-		cgroup_unlock();
+	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	}
 
 	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
@@ -1367,12 +1303,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 
-	cgroup_lock();
-
-	if (cgroup_is_removed(cgrp)) {
-		cgroup_unlock();
+	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	}
+
 	switch (type) {
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		retval = update_relax_domain_level(cs, val);
@@ -1385,6 +1318,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 	return retval;
 }
 
+/*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+				const char *buf)
+{
+	int retval = 0;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
+	switch (cft->private) {
+	case FILE_CPULIST:
+		retval = update_cpumask(cgroup_cs(cgrp), buf);
+		break;
+	case FILE_MEMLIST:
+		retval = update_nodemask(cgroup_cs(cgrp), buf);
+		break;
+	default:
+		retval = -EINVAL;
+		break;
+	}
+	cgroup_unlock();
+	return retval;
+}
+
 /*
  * These ascii lists should be read in a single call, by using a user
  * buffer large enough to hold the entire map.  If read in smaller
@@ -1504,14 +1463,16 @@ static struct cftype files[] = {
 	{
 		.name = "cpus",
 		.read = cpuset_common_file_read,
-		.write = cpuset_common_file_write,
+		.write_string = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
 		.private = FILE_CPULIST,
 	},
 
 	{
 		.name = "mems",
 		.read = cpuset_common_file_read,
-		.write = cpuset_common_file_write,
+		.write_string = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * MAX_NUMNODES),
 		.private = FILE_MEMLIST,
 	},
 
-- 
cgit v1.2.3


From 856c13aa1ff6136c1968414fdea5938ea9d5ebf2 Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Fri, 25 Jul 2008 01:47:04 -0700
Subject: cgroup files: convert res_counter_write() to be a cgroups
 write_string() handler

Currently res_counter_write() is a raw file handler even though it's
ultimately taking a number, since in some cases it wants to
pre-process the string when converting it to a number.

This patch converts res_counter_write() from a raw file handler to a
write_string() handler; this allows some of the boilerplate
copying/locking/checking to be removed, and simplies the cleanup path,
since these functions are now performed by the cgroups framework.

[lizf@cn.fujitsu.com: build fix]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/res_counter.c | 48 +++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d3c61b4ebef2..f275c8eca772 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/res_counter.h>
 #include <linux/uaccess.h>
+#include <linux/mm.h>
 
 void res_counter_init(struct res_counter *counter)
 {
@@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
 	return *res_counter_member(counter, member);
 }
 
-ssize_t res_counter_write(struct res_counter *counter, int member,
-		const char __user *userbuf, size_t nbytes, loff_t *pos,
-		int (*write_strategy)(char *st_buf, unsigned long long *val))
+int res_counter_memparse_write_strategy(const char *buf,
+					unsigned long long *res)
 {
-	int ret;
-	char *buf, *end;
-	unsigned long flags;
-	unsigned long long tmp, *val;
-
-	buf = kmalloc(nbytes + 1, GFP_KERNEL);
-	ret = -ENOMEM;
-	if (buf == NULL)
-		goto out;
+	char *end;
+	/* FIXME - make memparse() take const char* args */
+	*res = memparse((char *)buf, &end);
+	if (*end != '\0')
+		return -EINVAL;
 
-	buf[nbytes] = '\0';
-	ret = -EFAULT;
-	if (copy_from_user(buf, userbuf, nbytes))
-		goto out_free;
+	*res = PAGE_ALIGN(*res);
+	return 0;
+}
 
-	ret = -EINVAL;
+int res_counter_write(struct res_counter *counter, int member,
+		      const char *buf, write_strategy_fn write_strategy)
+{
+	char *end;
+	unsigned long flags;
+	unsigned long long tmp, *val;
 
-	strstrip(buf);
 	if (write_strategy) {
-		if (write_strategy(buf, &tmp)) {
-			goto out_free;
-		}
+		if (write_strategy(buf, &tmp))
+			return -EINVAL;
 	} else {
 		tmp = simple_strtoull(buf, &end, 10);
 		if (*end != '\0')
-			goto out_free;
+			return -EINVAL;
 	}
 	spin_lock_irqsave(&counter->lock, flags);
 	val = res_counter_member(counter, member);
 	*val = tmp;
 	spin_unlock_irqrestore(&counter->lock, flags);
-	ret = nbytes;
-out_free:
-	kfree(buf);
-out:
-	return ret;
+	return 0;
 }
-- 
cgit v1.2.3


From e885dcde75685e09f23cffae1f6d5169c105b8a0 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Fri, 25 Jul 2008 01:47:06 -0700
Subject: cgroup_clone: use pid of newly created task for new cgroup

cgroup_clone creates a new cgroup with the pid of the task.  This works
correctly for unshare, but for clone cgroup_clone is called from
copy_namespaces inside copy_process, which happens before the new pid is
created.  As a result, the new cgroup was created with current's pid.
This patch:

	1. Moves the call inside copy_process to after the new pid
	   is created
	2. Passes the struct pid into ns_cgroup_clone (as it is not
	   yet attached to the task)
	3. Passes a name from ns_cgroup_clone() into cgroup_clone()
	   so as to keep cgroup_clone() itself simpler
	4. Uses pid_vnr() to get the process id value, so that the
	   pid used to name the new cgroup is always the pid as it
	   would be known to the task which did the cloning or
	   unsharing.  I think that is the most intuitive thing to
	   do.  This way, task t1 does clone(CLONE_NEWPID) to get
	   t2, which does clone(CLONE_NEWPID) to get t3, then the
	   cgroup for t3 will be named for the pid by which t2 knows
	   t3.

(Thanks to Dan Smith for finding the main bug)

Changelog:
	June 11: Incorporate Paul Menage's feedback:  don't pass
	         NULL to ns_cgroup_clone from unshare, and reduce
		 patch size by using 'nodename' in cgroup_clone.
	June 10: Original version

[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Serge Hallyn <serge@us.ibm.com>
Acked-by: Paul Menage <menage@google.com>
Tested-by: Dan Smith <danms@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c    | 7 +++----
 kernel/fork.c      | 6 ++++++
 kernel/ns_cgroup.c | 8 ++++++--
 kernel/nsproxy.c   | 8 +-------
 4 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86b71e714e13..66ec9fd21e0c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2848,16 +2848,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
  * cgroup_clone - clone the cgroup the given subsystem is attached to
  * @tsk: the task to be moved
  * @subsys: the given subsystem
+ * @nodename: the name for the new cgroup
  *
  * Duplicate the current cgroup in the hierarchy that the given
  * subsystem is attached to, and move this task into the new
  * child.
  */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
+int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
+							char *nodename)
 {
 	struct dentry *dentry;
 	int ret = 0;
-	char nodename[MAX_CGROUP_TYPE_NAMELEN];
 	struct cgroup *parent, *child;
 	struct inode *inode;
 	struct css_set *cg;
@@ -2882,8 +2883,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 	cg = tsk->cgroups;
 	parent = task_cgroup(tsk, subsys->subsys_id);
 
-	snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
-
 	/* Pin the hierarchy */
 	atomic_inc(&parent->root->sb->s_active);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5a5d6fef341d..228f80c9155a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1107,6 +1107,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD)
 		p->tgid = current->tgid;
 
+	if (current->nsproxy != p->nsproxy) {
+		retval = ns_cgroup_clone(p, pid);
+		if (retval)
+			goto bad_fork_free_pid;
+	}
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 48d7ed6fc3a4..43c2111cd54d 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
+#include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/nsproxy.h>
 
@@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns(
 			    struct ns_cgroup, css);
 }
 
-int ns_cgroup_clone(struct task_struct *task)
+int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
 {
-	return cgroup_clone(task, &ns_subsys);
+	char name[PROC_NUMBUF];
+
+	snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
+	return cgroup_clone(task, &ns_subsys, name);
 }
 
 /*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index adc785146a1c..21575fc46d05 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 		goto out;
 	}
 
-	err = ns_cgroup_clone(tsk);
-	if (err) {
-		put_nsproxy(new_ns);
-		goto out;
-	}
-
 	tsk->nsproxy = new_ns;
 
 out:
@@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 		goto out;
 	}
 
-	err = ns_cgroup_clone(current);
+	err = ns_cgroup_clone(current, task_pid(current));
 	if (err)
 		put_nsproxy(*new_nsp);
 
-- 
cgit v1.2.3


From 0b2f630a28d53b5a2082a5275bc3334b10373508 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:47:21 -0700
Subject: cpusets: restructure the function update_cpumask() and
 update_nodemask()

Extract two functions from update_cpumask() and update_nodemask().They
will be used later for updating tasks' cpus_allowed and mems_allowed after
CPU/NODE offline/online.

[lizf@cn.fujitsu.com: build fix]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc:  Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 181 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 109 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 276ce7e4f1ab..7326d51eefe1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -763,6 +763,37 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 	set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
 }
 
+/**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_cpumask(struct cpuset *cs)
+{
+	struct cgroup_scanner scan;
+	struct ptr_heap heap;
+	int retval;
+
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+	if (retval)
+		return retval;
+
+	scan.cg = cs->css.cgroup;
+	scan.test_task = cpuset_test_cpumask;
+	scan.process_task = cpuset_change_cpumask;
+	scan.heap = &heap;
+	retval = cgroup_scan_tasks(&scan);
+
+	heap_free(&heap);
+	return retval;
+}
+
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
@@ -771,8 +802,6 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 static int update_cpumask(struct cpuset *cs, const char *buf)
 {
 	struct cpuset trialcs;
-	struct cgroup_scanner scan;
-	struct ptr_heap heap;
 	int retval;
 	int is_load_balanced;
 
@@ -806,10 +835,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
 		return 0;
 
-	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
-	if (retval)
-		return retval;
-
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
@@ -820,12 +845,9 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	 * Scan tasks in the cpuset, and update the cpumasks of any
 	 * that need an update.
 	 */
-	scan.cg = cs->css.cgroup;
-	scan.test_task = cpuset_test_cpumask;
-	scan.process_task = cpuset_change_cpumask;
-	scan.heap = &heap;
-	cgroup_scan_tasks(&scan);
-	heap_free(&heap);
+	retval = update_tasks_cpumask(cs);
+	if (retval < 0)
+		return retval;
 
 	if (is_load_balanced)
 		rebuild_sched_domains();
@@ -881,73 +903,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	mutex_unlock(&callback_mutex);
 }
 
-/*
- * Handle user request to change the 'mems' memory placement
- * of a cpuset.  Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
- *
- * Call with cgroup_mutex held.  May take callback_mutex during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
- * their mempolicies to the cpusets new mems_allowed.
- */
-
 static void *cpuset_being_rebound;
 
-static int update_nodemask(struct cpuset *cs, const char *buf)
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ * @oldmem: old mems_allowed of cpuset cs
+ *
+ * Called with cgroup_mutex held
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-	struct cpuset trialcs;
-	nodemask_t oldmem;
 	struct task_struct *p;
 	struct mm_struct **mmarray;
 	int i, n, ntasks;
 	int migrate;
 	int fudge;
-	int retval;
 	struct cgroup_iter it;
-
-	/*
-	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
-	 * it's read-only
-	 */
-	if (cs == &top_cpuset)
-		return -EACCES;
-
-	trialcs = *cs;
-
-	/*
-	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-	 * Since nodelist_parse() fails on an empty mask, we special case
-	 * that parsing.  The validate_change() call ensures that cpusets
-	 * with tasks have memory.
-	 */
-	if (!*buf) {
-		nodes_clear(trialcs.mems_allowed);
-	} else {
-		retval = nodelist_parse(buf, trialcs.mems_allowed);
-		if (retval < 0)
-			goto done;
-
-		if (!nodes_subset(trialcs.mems_allowed,
-				node_states[N_HIGH_MEMORY]))
-			return -EINVAL;
-	}
-	oldmem = cs->mems_allowed;
-	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
-		retval = 0;		/* Too easy - nothing to do */
-		goto done;
-	}
-	retval = validate_change(cs, &trialcs);
-	if (retval < 0)
-		goto done;
-
-	mutex_lock(&callback_mutex);
-	cs->mems_allowed = trialcs.mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
-	mutex_unlock(&callback_mutex);
+	int retval;
 
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
 
@@ -1014,7 +988,7 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
 
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
-			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
+			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
 		mmput(mm);
 	}
 
@@ -1026,6 +1000,70 @@ done:
 	return retval;
 }
 
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
+ * Call with cgroup_mutex held.  May take callback_mutex during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+	struct cpuset trialcs;
+	nodemask_t oldmem;
+	int retval;
+
+	/*
+	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+	 * it's read-only
+	 */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
+	trialcs = *cs;
+
+	/*
+	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+	 * Since nodelist_parse() fails on an empty mask, we special case
+	 * that parsing.  The validate_change() call ensures that cpusets
+	 * with tasks have memory.
+	 */
+	if (!*buf) {
+		nodes_clear(trialcs.mems_allowed);
+	} else {
+		retval = nodelist_parse(buf, trialcs.mems_allowed);
+		if (retval < 0)
+			goto done;
+
+		if (!nodes_subset(trialcs.mems_allowed,
+				node_states[N_HIGH_MEMORY]))
+			return -EINVAL;
+	}
+	oldmem = cs->mems_allowed;
+	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+		retval = 0;		/* Too easy - nothing to do */
+		goto done;
+	}
+	retval = validate_change(cs, &trialcs);
+	if (retval < 0)
+		goto done;
+
+	mutex_lock(&callback_mutex);
+	cs->mems_allowed = trialcs.mems_allowed;
+	cs->mems_generation = cpuset_mems_generation++;
+	mutex_unlock(&callback_mutex);
+
+	retval = update_tasks_nodemask(cs, &oldmem);
+done:
+	return retval;
+}
+
 int current_cpuset_is_being_rebound(void)
 {
 	return task_cs(current) == cpuset_being_rebound;
@@ -1935,7 +1973,6 @@ void __init cpuset_init_smp(void)
 }
 
 /**
-
  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
  * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
-- 
cgit v1.2.3


From f9b4fb8dabf38fb456c97f01aace07cb6e7c1723 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:47:22 -0700
Subject: cpusets: update task's cpus_allowed and mems_allowed after CPU/NODE
 offline/online

The bug is that a task may run on the cpu/node which is not in its
cpuset.cpus/ cpuset.mems.

It can be reproduced by the following commands:
-----------------------------------
# mkdir /dev/cpuset
# mount -t cpuset xxx /dev/cpuset
# mkdir /dev/cpuset/0
# echo 0-1 > /dev/cpuset/0/cpus
# echo 0 > /dev/cpuset/0/mems
# echo $$ > /dev/cpuset/0/tasks
# echo 0 > /sys/devices/system/cpu/cpu1/online
# echo 1 > /sys/devices/system/cpu/cpu1/online
-----------------------------------

There is only CPU0 in cpuset.cpus, but the task in this cpuset runs on
both CPU0 and CPU1.

It is because the task's cpu_allowed didn't get updated after we did CPU
offline/online manipulation.  Similar for mem_allowed.

This patch fixes this bug expect for root cpuset.  Because there is a
problem about root cpuset, in that whether it is necessary to update all
the tasks in root cpuset or not after cpu/node offline/online.

If updating, some kernel threads which is bound into a specified cpu will
be unbound.

If not updating, there is a bug in root cpuset.  This bug is also caused
by offline/online manipulation.  For example, there is a dual-cpu machine.
 we create a sub cpuset in root cpuset and assign 1 to its cpus.  And then
we attach some tasks into this sub cpuset.  After this, we offline CPU1.
Now, the tasks in this new cpuset are moved into root cpuset automatically
because there is no cpu in sub cpuset.  Then we online CPU1, we find all
the tasks which doesn't belong to root cpuset originally just run on CPU0.

Maybe we need to add a flag in the task_struct to mark which task can't be
unbound?

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7326d51eefe1..6eae6639e851 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1851,6 +1851,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 	struct cpuset *child;	/* scans child cpusets of cp */
 	struct list_head queue;
 	struct cgroup *cont;
+	nodemask_t oldmems;
 
 	INIT_LIST_HEAD(&queue);
 
@@ -1870,6 +1871,8 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
+		oldmems = cp->mems_allowed;
+
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
 		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1881,6 +1884,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 		if (cpus_empty(cp->cpus_allowed) ||
 		     nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
+		else {
+			update_tasks_cpumask(cp);
+			update_tasks_nodemask(cp, &oldmems);
+		}
 	}
 }
 
-- 
cgit v1.2.3


From c372e817afc629fea9ff6321313325ed0b4a855b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:47:23 -0700
Subject: cpuset: avoid unnecessary sched domains rebuilding

When changing 'sched_relax_domain_level', don't rebuild sched domains if
'cpus' is empty or 'sched_load_balance' is not set.

Also make the comments of rebuild_sched_domains() more readable.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6eae6639e851..60d2c4702c6c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -496,11 +496,16 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 /*
  * rebuild_sched_domains()
  *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * This routine will be called to rebuild the scheduler's dynamic
+ * sched domains:
+ * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ *   'cpus' changes,
+ * - or if the 'cpus' allowed changes in any cpuset which has that
+ *   flag enabled,
+ * - or if the 'sched_relax_domain_level' of any cpuset which has
+ *   that flag enabled and with non-empty 'cpus' changes,
+ * - or if any cpuset with non-empty 'cpus' is removed,
+ * - or if a cpu gets offlined.
  *
  * This routine builds a partial partition of the systems CPUs
  * (the set of non-overlappping cpumask_t's in the array 'part'
@@ -1076,7 +1081,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
-		rebuild_sched_domains();
+		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+			rebuild_sched_domains();
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 489a5393a20dcbf91104052120eb2eff8791b61b Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:47:23 -0700
Subject: cpuset: don't pass empty cpumasks to partition_sched_domains()

I create lots of empty cpusets(empty cpumasks) and turn off the
"sched_load_balance" in top cpuset.

I found that all these empty cpumasks are passed to
partition_sched_domains() in rebuild_sched_domains(), it's very
time-consuming for partition_sched_domains() and it's not need.

It also reduce memory consumed and some works in rebuild_sched_domains()
too.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 60d2c4702c6c..531b235e546f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -610,8 +610,13 @@ void rebuild_sched_domains(void)
 	while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
 		struct cgroup *cont;
 		struct cpuset *child;   /* scans child cpusets of cp */
+
+		if (cpus_empty(cp->cpus_allowed))
+			continue;
+
 		if (is_sched_load_balance(cp))
 			csa[csn++] = cp;
+
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
 			__kfifo_put(q, (void *)&child, sizeof(cp));
-- 
cgit v1.2.3


From 02412483777651a26b19a75e49c2a451a174ca9c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:47:24 -0700
Subject: cpuset: code-cleanup for started_after

cgroup(cgroup_scan_tasks) will initialize heap->gt for us.  This patch
removes started_after() and its helper-function.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 37 ++++++-------------------------------
 1 file changed, 6 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 531b235e546f..ebbc9b082e48 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -709,36 +709,6 @@ done:
 	/* Don't kfree(dattr) -- partition_sched_domains() does that. */
 }
 
-static inline int started_after_time(struct task_struct *t1,
-				     struct timespec *time,
-				     struct task_struct *t2)
-{
-	int start_diff = timespec_compare(&t1->start_time, time);
-	if (start_diff > 0) {
-		return 1;
-	} else if (start_diff < 0) {
-		return 0;
-	} else {
-		/*
-		 * Arbitrarily, if two processes started at the same
-		 * time, we'll say that the lower pointer value
-		 * started first. Note that t2 may have exited by now
-		 * so this may not be a valid pointer any longer, but
-		 * that's fine - it still serves to distinguish
-		 * between two tasks started (effectively)
-		 * simultaneously.
-		 */
-		return t1 > t2;
-	}
-}
-
-static inline int started_after(void *p1, void *p2)
-{
-	struct task_struct *t1 = p1;
-	struct task_struct *t2 = p2;
-	return started_after_time(t1, &t2->start_time, t2);
-}
-
 /**
  * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
  * @tsk: task to test
@@ -790,7 +760,12 @@ static int update_tasks_cpumask(struct cpuset *cs)
 	struct ptr_heap heap;
 	int retval;
 
-	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+	/*
+	 * cgroup_scan_tasks() will initialize heap->gt for us.
+	 * heap_init() is still needed here for we should not change
+	 * cs->cpus_allowed when heap_init() fails.
+	 */
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
 	if (retval)
 		return retval;
 
-- 
cgit v1.2.3


From da5ef6bb96158b0fc0d808704237a453af449124 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 25 Jul 2008 01:47:25 -0700
Subject: cpuset: two minor code-cleanups

In cpuset_update_task_memory_state() local variable struct task_struct
*tsk = current;

And local variable tsk is used 14 times and statement task_cs(tsk) is used
twice in this function.  So using task_cs(tsk) instead of task_cs(current)
is better for readability.

And "(struct cgroup_scanner *)&scan" is not good for readability also.
(and "container_of" is used in cpuset_do_move_task(), not
"(cpuset_hotplug_scanner *)scan")

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ebbc9b082e48..91cf85b36dd5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -365,7 +365,7 @@ void cpuset_update_task_memory_state(void)
 		my_cpusets_mem_gen = top_cpuset.mems_generation;
 	} else {
 		rcu_read_lock();
-		my_cpusets_mem_gen = task_cs(current)->mems_generation;
+		my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
 		rcu_read_unlock();
 	}
 
@@ -1777,7 +1777,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 	scan.scan.heap = NULL;
 	scan.to = to->css.cgroup;
 
-	if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+	if (cgroup_scan_tasks(&scan.scan))
 		printk(KERN_ERR "move_member_tasks_to_cpuset: "
 				"cgroup_scan_tasks failed\n");
 }
-- 
cgit v1.2.3


From 4b7a1304267bff68260ae861784b27130e805be3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:26 -0700
Subject: posix timers: timer_delete: remove the bogus "->it_process != NULL"
 check

sys_timer_delete() and itimer_delete() check "timer->it_process != NULL",
this looks completely bogus.  ->it_process == NULL means that this timer
is already under destruction or it is not fully initialized, this must not
happen.

	sys_timer_delete: the timer is locked, and lock_timer() can't succeed
	if ->it_process == NULL.

	itimer_delete: it is called by exit_itimers() when there are no other
	threads which can play with signal_struct->posix_timers.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/posix-timers.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0b..17f53266fb67 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -856,11 +856,10 @@ retry_delete:
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
-	if (timer->it_process) {
-		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-			put_task_struct(timer->it_process);
-		timer->it_process = NULL;
-	}
+	if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+		put_task_struct(timer->it_process);
+	timer->it_process = NULL;
+
 	unlock_timer(timer, flags);
 	release_posix_timer(timer, IT_ID_SET);
 	return 0;
@@ -885,11 +884,10 @@ retry_delete:
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
-	if (timer->it_process) {
-		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-			put_task_struct(timer->it_process);
-		timer->it_process = NULL;
-	}
+	if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+		put_task_struct(timer->it_process);
+	timer->it_process = NULL;
+
 	unlock_timer(timer, flags);
 	release_posix_timer(timer, IT_ID_SET);
 }
-- 
cgit v1.2.3


From 96347e7759e2e433c427defa0fa1adfc8cce6226 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:27 -0700
Subject: posix timers: release_posix_timer: kill the bogus
 put_task_struct(->it_process);

release_posix_timer() can't be called with ->it_process != NULL.  Once
sys_timer_create() sets ->it_process it must not call
release_posix_timer(), otherwise we can race with another thread doing
sys_timer_delete(), this timer is visible to idr_find() and unlocked.

The same is true for two other callers (actually, for any possible
caller), sys_timer_delete() and itimer_delete().  They must clear
->it_process before unlock_timer() + release_posix_timer().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: john stultz <johnstul@us.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/posix-timers.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 17f53266fb67..9a21681aa80f 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -449,9 +449,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 		spin_unlock_irqrestore(&idr_lock, flags);
 	}
 	sigqueue_free(tmr->sigq);
-	if (unlikely(tmr->it_process) &&
-	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
-		put_task_struct(tmr->it_process);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
-- 
cgit v1.2.3


From 6715ca451cfff1c9ce4b33ad9918a1dacf43997c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:27 -0700
Subject: signals: collect_signal: remove the unneeded sigismember() check

collect_signal() checks sigismember(&list->signal, sig), this is not
needed.  This "sig" was just found by next_signal(), so it must be valid.

We have a (completely broken) call to ->notifier in between, but it must
not play with sigpending->signal bits or unlock ->siglock.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 6c0958e52ea7..c5b9aabb1550 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -343,9 +343,6 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 	struct sigqueue *q, *first = NULL;
 	int still_pending = 0;
 
-	if (unlikely(!sigismember(&list->signal, sig)))
-		return 0;
-
 	/*
 	 * Collect the siginfo appropriate to this signal.  Check if
 	 * there is another siginfo for the same signal.
-- 
cgit v1.2.3


From d4434207616980885205c605697868c0f07e4378 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:28 -0700
Subject: signals: collect_signal: simplify the "still_pending" logic

Factor out sigdelset() calls and remove the "still_pending" variable.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index c5b9aabb1550..50ad439377b2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -341,7 +341,6 @@ unblock_all_signals(void)
 static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 {
 	struct sigqueue *q, *first = NULL;
-	int still_pending = 0;
 
 	/*
 	 * Collect the siginfo appropriate to this signal.  Check if
@@ -349,26 +348,24 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 	*/
 	list_for_each_entry(q, &list->list, list) {
 		if (q->info.si_signo == sig) {
-			if (first) {
-				still_pending = 1;
-				break;
-			}
+			if (first)
+				goto still_pending;
 			first = q;
 		}
 	}
+
+	sigdelset(&list->signal, sig);
+
 	if (first) {
+still_pending:
 		list_del_init(&first->list);
 		copy_siginfo(info, &first->info);
 		__sigqueue_free(first);
-		if (!still_pending)
-			sigdelset(&list->signal, sig);
 	} else {
-
 		/* Ok, it wasn't in the queue.  This must be
 		   a fast-pathed signal or we must have been
 		   out of queue space.  So zero out the info.
 		 */
-		sigdelset(&list->signal, sig);
 		info->si_signo = sig;
 		info->si_errno = 0;
 		info->si_code = 0;
-- 
cgit v1.2.3


From 100360f03077663b7bef3af44805b6cf700c3bee Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:29 -0700
Subject: signals: change collect_signal() to return void

With the recent changes collect_signal() always returns true.  Change it
to return void and update the single caller.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 50ad439377b2..fea236fe0b50 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -338,7 +338,7 @@ unblock_all_signals(void)
 	spin_unlock_irqrestore(&current->sighand->siglock, flags);
 }
 
-static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
 {
 	struct sigqueue *q, *first = NULL;
 
@@ -372,7 +372,6 @@ still_pending:
 		info->si_pid = 0;
 		info->si_uid = 0;
 	}
-	return 1;
 }
 
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
@@ -390,8 +389,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 			}
 		}
 
-		if (!collect_signal(sig, pending, info))
-			sig = 0;
+		collect_signal(sig, pending, info);
 	}
 
 	return sig;
-- 
cgit v1.2.3


From 3854a771821c970065e3203a0b40ddc4101538cc Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:29 -0700
Subject: __exit_signal: don't take rcu lock

There is no reason for rcu_read_lock() in __exit_signal().  tsk->sighand
can only be changed if tsk does exec, obviously this is not possible.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 93d2711b9381..a7799d8a6404 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
 	BUG_ON(!sig);
 	BUG_ON(!atomic_read(&sig->count));
 
-	rcu_read_lock();
 	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 
@@ -136,7 +135,6 @@ static void __exit_signal(struct task_struct *tsk)
 	tsk->signal = NULL;
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
-	rcu_read_unlock();
 
 	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
-- 
cgit v1.2.3


From 92413d771e7123304fb4b9efd2a00cccc946e383 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:30 -0700
Subject: signals: dequeue_signal: don't check SIGNAL_GROUP_EXIT when setting
 SIGNAL_STOP_DEQUEUED

dequeue_signal() checks SIGNAL_GROUP_EXIT before setting
SIGNAL_STOP_DEQUEUED.  This was added by
788e05a67c343fa22f2ae1d3ca264e7f15c25eaf a long ago to avoid the
coredump/SIGSTOP race.

Since then the related code was changed, and now this subtle check is both
incomplete and unneeded at the same time.  It is incomplete because
nowadays exec() doesn't set SIGNAL_GROUP_EXIT, so in fact we should check
signal_group_exit() to avoid a similar race.  Fortunately, we doesn't need
the check at all.  The only function which relies on SIGNAL_STOP_DEQUEUED
is do_signal_stop(), and it ignores this flag if signal_group_exit() == T,
this covers the SIGNAL_GROUP_EXIT case.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index fea236fe0b50..15f901a26ec7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -454,8 +454,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		 * is to alert stop-signal processing code when another
 		 * processor has come along and cleared the flag.
 		 */
-		if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
-			tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+		tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
 	}
 	if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
 		/*
-- 
cgit v1.2.3


From 2b201a9eddf509e8e935b45e573648e36f4b623f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:31 -0700
Subject: signals: do_signal_stop: kill the SIGNAL_UNKILLABLE check

fae5fa44f1fd079ffbed8e0add929dd7bbd1347f changed do_signal_stop() to check
SIGNAL_UNKILLABLE, this wasn't needed.  If signal_group_exit() == F, the
signal sent to SIGNAL_UNKILLABLE task must be already filtered out by the
caller, get_signal_to_deliver().  And if signal_group_exit() == T we are
not going to stop.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 15f901a26ec7..0514da573f22 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1649,8 +1649,7 @@ static int do_signal_stop(int signr)
 	} else {
 		struct task_struct *t;
 
-		if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE))
-					 != SIGNAL_STOP_DEQUEUED) ||
+		if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
 		    unlikely(signal_group_exit(sig)))
 			return 0;
 		/*
-- 
cgit v1.2.3


From d8878ba3f05ae5bbfad5a6e72e5121c0ea35f989 Mon Sep 17 00:00:00 2001
From: Michael Kerrisk <mtk.manpages@googlemail.com>
Date: Fri, 25 Jul 2008 01:47:32 -0700
Subject: signals: make siginfo_t si_utime + si_sstime report times in USER_HZ,
 not HZ

In the switch to configurable HZ in 2.6, the treatment of the si_utime and
si_stime fields that are exposed to userland via the siginfo structure
looks to have been botched.  As things stand, these fields report times in
units of HZ, so that userland gets information that varies depending on
the HZ that the kernel was configured with.  This patch changes the
reported values to use USER_HZ units.

Signed-off-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 0514da573f22..ba60eeeb63aa 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1370,10 +1370,9 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 
 	info.si_uid = tsk->uid;
 
-	/* FIXME: find out whether or not this is supposed to be c*time. */
-	info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
+	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
 						       tsk->signal->utime));
-	info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
+	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
 						       tsk->signal->stime));
 
 	info.si_status = tsk->exit_code & 0x7f;
@@ -1441,9 +1440,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
 
 	info.si_uid = tsk->uid;
 
-	/* FIXME: find out whether or not this is supposed to be c*time. */
-	info.si_utime = cputime_to_jiffies(tsk->utime);
-	info.si_stime = cputime_to_jiffies(tsk->stime);
+	info.si_utime = cputime_to_clock_t(tsk->utime);
+	info.si_stime = cputime_to_clock_t(tsk->stime);
 
  	info.si_code = why;
  	switch (why) {
-- 
cgit v1.2.3


From bc64efd220dcd4449aef8dd2564d73127b583b09 Mon Sep 17 00:00:00 2001
From: Gustavo Fernando Padovan <gustavo@las.ic.unicamp.br>
Date: Fri, 25 Jul 2008 01:47:33 -0700
Subject: kernel/signal.c: change vars pid and tgid types to pid_t

Change the type of pid and tgid variables from int to the POSIX type
pid_t.

Signed-off-by: Gustavo F. Padovan <gustavo@las.ic.unicamp.br>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ba60eeeb63aa..fdab7b363fa7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1116,7 +1116,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
  * is probably wrong.  Should make it like BSD or SYSV.
  */
 
-static int kill_something_info(int sig, struct siginfo *info, int pid)
+static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
 {
 	int ret;
 
@@ -2184,7 +2184,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 }
 
 asmlinkage long
-sys_kill(int pid, int sig)
+sys_kill(pid_t pid, int sig)
 {
 	struct siginfo info;
 
@@ -2197,7 +2197,7 @@ sys_kill(int pid, int sig)
 	return kill_something_info(sig, &info, pid);
 }
 
-static int do_tkill(int tgid, int pid, int sig)
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
 {
 	int error;
 	struct siginfo info;
@@ -2243,7 +2243,7 @@ static int do_tkill(int tgid, int pid, int sig)
  *  exists but it's not belonging to the target process anymore. This
  *  method solves the problem of threads exiting and PIDs getting reused.
  */
-asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
 {
 	/* This is only valid for single tasks */
 	if (pid <= 0 || tgid <= 0)
@@ -2256,7 +2256,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
  *  Send a signal to only one task, even if it's a CLONE_THREAD task.
  */
 asmlinkage long
-sys_tkill(int pid, int sig)
+sys_tkill(pid_t pid, int sig)
 {
 	/* This is only valid for single tasks */
 	if (pid <= 0)
@@ -2266,7 +2266,7 @@ sys_tkill(int pid, int sig)
 }
 
 asmlinkage long
-sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
+sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
 {
 	siginfo_t info;
 
-- 
cgit v1.2.3


From 3d749b9e676b26584a47e75c235aa6f69d0697ae Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:37 -0700
Subject: ptrace: simplify ptrace_stop()->sigkill_pending() path

1. SIGKILL can't be blocked, remove this check from sigkill_pending().

2. When ptrace_stop() sees sigkill_pending() == T, it can just return.
   Kill "int killed" and simplify the code. This also is more correct,
   the tracer shouldn't see us in TASK_TRACED if we are not going to
   stop.

I strongly believe this code needs further changes.  We should do the "was
this task killed" check unconditionally, currently it depends on
arch_ptrace_stop_needed().  On the other hand, sigkill_pending() isn't
very clever.  If the task was killed tkill(SIGKILL), the signal can be
already dequeued if the caller is do_exit().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index fdab7b363fa7..39c1706edf03 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1496,9 +1496,8 @@ static inline int may_ptrace_stop(void)
  */
 static int sigkill_pending(struct task_struct *tsk)
 {
-	return ((sigismember(&tsk->pending.signal, SIGKILL) ||
-		 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
-		!unlikely(sigismember(&tsk->blocked, SIGKILL)));
+	return	sigismember(&tsk->pending.signal, SIGKILL) ||
+		sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
 }
 
 /*
@@ -1514,8 +1513,6 @@ static int sigkill_pending(struct task_struct *tsk)
  */
 static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 {
-	int killed = 0;
-
 	if (arch_ptrace_stop_needed(exit_code, info)) {
 		/*
 		 * The arch code has something special to do before a
@@ -1531,7 +1528,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 		spin_unlock_irq(&current->sighand->siglock);
 		arch_ptrace_stop(exit_code, info);
 		spin_lock_irq(&current->sighand->siglock);
-		killed = sigkill_pending(current);
+		if (sigkill_pending(current))
+			return;
 	}
 
 	/*
@@ -1548,7 +1546,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 	__set_current_state(TASK_TRACED);
 	spin_unlock_irq(&current->sighand->siglock);
 	read_lock(&tasklist_lock);
-	if (!unlikely(killed) && may_ptrace_stop()) {
+	if (may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
 		schedule();
-- 
cgit v1.2.3


From 7b34e4283c685f5cc6ba6d30e939906eee0d4bcf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:37 -0700
Subject: introduce PF_KTHREAD flag

Introduce the new PF_KTHREAD flag to mark the kernel threads.  It is set
by INIT_TASK() and copied to the forked childs (we could set it in
kthreadd() along with PF_NOFREEZE instead).

daemonize() was changed as well.  In that case testing of PF_KTHREAD is
racy, but daemonize() is hopeless anyway.

This flag is cleared in do_execve(), before search_binary_handler().
Probably not the best place, we can do this in exec_mmap() or in
start_thread(), or clear it along with PF_FORKNOEXEC.  But I think this
doesn't matter in practice, and if do_execve() fails kthread should die
soon.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index a7799d8a6404..28a44a2612dc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -430,7 +430,7 @@ void daemonize(const char *name, ...)
 	 * We don't want to have TIF_FREEZE set if the system-wide hibernation
 	 * or suspend transition begins right now.
 	 */
-	current->flags |= PF_NOFREEZE;
+	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
 
 	if (current->nsproxy != &init_nsproxy) {
 		get_nsproxy(&init_nsproxy);
-- 
cgit v1.2.3


From 246bb0b1deb29726990620d8b5e55ca29f331362 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:38 -0700
Subject: kill PF_BORROWED_MM in favour of PF_KTHREAD

Kill PF_BORROWED_MM.  Change use_mm/unuse_mm to not play with ->flags, and
do s/PF_BORROWED_MM/PF_KTHREAD/ for a couple of other users.

No functional changes yet.  But this allows us to do further
fixes/cleanups.

oom_kill/ptrace/etc often check "p->mm != NULL" to filter out the
kthreads, this is wrong because of use_mm().  The problem with
PF_BORROWED_MM is that we need task_lock() to avoid races.  With this
patch we can check PF_KTHREAD directly, or use a simple lockless helper:

	/* The result must not be dereferenced !!! */
	struct mm_struct *__get_task_mm(struct task_struct *tsk)
	{
		if (tsk->flags & PF_KTHREAD)
			return NULL;
		return tsk->mm;
	}

Note also ecard_task().  It runs with ->mm != NULL, but it's the kernel
thread without PF_BORROWED_MM.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 228f80c9155a..eeaec6893b0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(mmput);
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
- * Returns %NULL if the task has no mm.  Checks PF_BORROWED_MM (meaning
+ * Returns %NULL if the task has no mm.  Checks PF_KTHREAD (meaning
  * this kernel workthread has transiently adopted a user mm with use_mm,
  * to do its AIO) is not set and if so returns a reference to it, after
  * bumping up the use count.  User must release the mm via mmput()
@@ -487,7 +487,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
 	task_lock(task);
 	mm = task->mm;
 	if (mm) {
-		if (task->flags & PF_BORROWED_MM)
+		if (task->flags & PF_KTHREAD)
 			mm = NULL;
 		else
 			atomic_inc(&mm->mm_users);
-- 
cgit v1.2.3


From 32ecb1f26dd50eeaac4e3f4dea4541c97848e459 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:41 -0700
Subject: coredump: turn mm->core_startup_done into the pointer to struct
 core_state

mm->core_startup_done points to "struct completion startup_done" allocated
on the coredump_wait()'s stack.  Introduce the new structure, core_state,
which holds this "struct completion".  This way we can add more info
visible to the threads participating in coredump without enlarging
mm_struct.

No changes in affected .o files.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 28a44a2612dc..f7fa21dbced4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -680,7 +680,7 @@ static void exit_mm(struct task_struct * tsk)
 		up_read(&mm->mmap_sem);
 		down_write(&mm->mmap_sem);
 		if (!--mm->core_waiters)
-			complete(mm->core_startup_done);
+			complete(&mm->core_state->startup);
 		up_write(&mm->mmap_sem);
 
 		wait_for_completion(&mm->core_done);
-- 
cgit v1.2.3


From 999d9fc1670bc082928b93b11d1f2e0e417d973c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:41 -0700
Subject: coredump: move mm->core_waiters into struct core_state

Move mm->core_waiters into "struct core_state" allocated on stack.  This
shrinks mm_struct a little bit and allows further changes.

This patch mostly does s/core_waiters/core_state.  The only essential
change is that coredump_wait() must clear mm->core_state before return.

The coredump_wait()'s path is uglified and .text grows by 30 bytes, this
is fixed by the next patch.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c   | 8 ++++----
 kernel/fork.c   | 2 +-
 kernel/signal.c | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index f7fa21dbced4..988e232254e9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -670,16 +670,16 @@ static void exit_mm(struct task_struct * tsk)
 		return;
 	/*
 	 * Serialize with any possible pending coredump.
-	 * We must hold mmap_sem around checking core_waiters
+	 * We must hold mmap_sem around checking core_state
 	 * and clearing tsk->mm.  The core-inducing thread
-	 * will increment core_waiters for each thread in the
+	 * will increment ->nr_threads for each thread in the
 	 * group with ->mm != NULL.
 	 */
 	down_read(&mm->mmap_sem);
-	if (mm->core_waiters) {
+	if (mm->core_state) {
 		up_read(&mm->mmap_sem);
 		down_write(&mm->mmap_sem);
-		if (!--mm->core_waiters)
+		if (!--mm->core_state->nr_threads)
 			complete(&mm->core_state->startup);
 		up_write(&mm->mmap_sem);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index eeaec6893b0d..813d5c89b9d5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -400,7 +400,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->flags = (current->mm) ? current->mm->flags
 				  : MMF_DUMP_FILTER_DEFAULT;
-	mm->core_waiters = 0;
+	mm->core_state = NULL;
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
diff --git a/kernel/signal.c b/kernel/signal.c
index 39c1706edf03..5c7b7eaa0dc6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1480,10 +1480,10 @@ static inline int may_ptrace_stop(void)
 	 * is a deadlock situation, and pointless because our tracer
 	 * is dead so don't allow us to stop.
 	 * If SIGKILL was already sent before the caller unlocked
-	 * ->siglock we must see ->core_waiters != 0. Otherwise it
+	 * ->siglock we must see ->core_state != NULL. Otherwise it
 	 * is safe to enter schedule().
 	 */
-	if (unlikely(current->mm->core_waiters) &&
+	if (unlikely(current->mm->core_state) &&
 	    unlikely(current->mm == current->parent->mm))
 		return 0;
 
-- 
cgit v1.2.3


From c5f1cc8c1828486a61ab3e575da6e2c62b34d399 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:42 -0700
Subject: coredump: turn core_state->nr_threads into atomic_t

Turn core_state->nr_threads into atomic_t and kill now unneeded
down_write(&mm->mmap_sem) in exit_mm().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 988e232254e9..63d82957baae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -678,10 +678,9 @@ static void exit_mm(struct task_struct * tsk)
 	down_read(&mm->mmap_sem);
 	if (mm->core_state) {
 		up_read(&mm->mmap_sem);
-		down_write(&mm->mmap_sem);
-		if (!--mm->core_state->nr_threads)
+
+		if (atomic_dec_and_test(&mm->core_state->nr_threads))
 			complete(&mm->core_state->startup);
-		up_write(&mm->mmap_sem);
 
 		wait_for_completion(&mm->core_done);
 		down_read(&mm->mmap_sem);
-- 
cgit v1.2.3


From b564daf806d492dd4f7afe9b6c83b8d35d137669 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:44 -0700
Subject: coredump: construct the list of coredumping threads at startup time

binfmt->core_dump() has to iterate over the all threads in system in order
to find the coredumping threads and construct the list using the
GFP_ATOMIC allocations.

With this patch each thread allocates the list node on exit_mm()'s stack and
adds itself to the list.

This allows us to do further changes:

	- simplify ->core_dump()

	- change exit_mm() to clear ->mm first, then wait for ->core_done.
	  this makes the coredumping process visible to oom_kill

	- kill mm->core_done

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 63d82957baae..b66f0d55c791 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -664,6 +664,7 @@ assign_new_owner:
 static void exit_mm(struct task_struct * tsk)
 {
 	struct mm_struct *mm = tsk->mm;
+	struct core_state *core_state;
 
 	mm_release(tsk, mm);
 	if (!mm)
@@ -676,11 +677,19 @@ static void exit_mm(struct task_struct * tsk)
 	 * group with ->mm != NULL.
 	 */
 	down_read(&mm->mmap_sem);
-	if (mm->core_state) {
+	core_state = mm->core_state;
+	if (core_state) {
+		struct core_thread self;
 		up_read(&mm->mmap_sem);
 
-		if (atomic_dec_and_test(&mm->core_state->nr_threads))
-			complete(&mm->core_state->startup);
+		self.task = tsk;
+		self.next = xchg(&core_state->dumper.next, &self);
+		/*
+		 * Implies mb(), the result of xchg() must be visible
+		 * to core_state->dumper.
+		 */
+		if (atomic_dec_and_test(&core_state->nr_threads))
+			complete(&core_state->startup);
 
 		wait_for_completion(&mm->core_done);
 		down_read(&mm->mmap_sem);
-- 
cgit v1.2.3


From a94e2d408eaedbd85aae259621d46fafc10479a2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:46 -0700
Subject: coredump: kill mm->core_done

Now that we have core_state->dumper list we can use it to wake up the
sub-threads waiting for the coredump completion.

This uglifies the code and .text grows by 47 bytes, but otoh mm_struct
lessens by sizeof(struct completion).  Also, with this change we can
decouple exit_mm() from the coredumping code.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b66f0d55c791..8a4d4d12e294 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -691,7 +691,13 @@ static void exit_mm(struct task_struct * tsk)
 		if (atomic_dec_and_test(&core_state->nr_threads))
 			complete(&core_state->startup);
 
-		wait_for_completion(&mm->core_done);
+		for (;;) {
+			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+			if (!self.task) /* see coredump_finish() */
+				break;
+			schedule();
+		}
+		__set_task_state(tsk, TASK_RUNNING);
 		down_read(&mm->mmap_sem);
 	}
 	atomic_inc(&mm->mm_count);
-- 
cgit v1.2.3


From 1a4d9b0aa0d3c50314e57525a5e5ec2cfc48b4c8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:47 -0700
Subject: workqueues: insert_work: use "list_head *" instead of "int tail"

insert_work() inserts the new work_struct before or after cwq->worklist,
depending on the "int tail" parameter. Change it to accept "list_head *"
instead, this shrinks .text a bit and allows us to insert the barrier
after specific work_struct.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jarek Poplawski <jarkao2@gmail.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6fd158b21026..d9a2d65cc63e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 }
 
 static void insert_work(struct cpu_workqueue_struct *cwq,
-				struct work_struct *work, int tail)
+			struct work_struct *work, struct list_head *head)
 {
 	set_wq_data(work, cwq);
 	/*
@@ -133,10 +133,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 	 * result of list_add() below, see try_to_grab_pending().
 	 */
 	smp_wmb();
-	if (tail)
-		list_add_tail(&work->entry, &cwq->worklist);
-	else
-		list_add(&work->entry, &cwq->worklist);
+	list_add_tail(&work->entry, head);
 	wake_up(&cwq->more_work);
 }
 
@@ -146,7 +143,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
 	unsigned long flags;
 
 	spin_lock_irqsave(&cwq->lock, flags);
-	insert_work(cwq, work, 1);
+	insert_work(cwq, work, &cwq->worklist);
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
@@ -361,14 +358,14 @@ static void wq_barrier_func(struct work_struct *work)
 }
 
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
-					struct wq_barrier *barr, int tail)
+			struct wq_barrier *barr, struct list_head *head)
 {
 	INIT_WORK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
 
 	init_completion(&barr->done);
 
-	insert_work(cwq, &barr->work, tail);
+	insert_work(cwq, &barr->work, head);
 }
 
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -388,7 +385,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 		active = 0;
 		spin_lock_irq(&cwq->lock);
 		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-			insert_wq_barrier(cwq, &barr, 1);
+			insert_wq_barrier(cwq, &barr, &cwq->worklist);
 			active = 1;
 		}
 		spin_unlock_irq(&cwq->lock);
@@ -473,7 +470,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
 
 	spin_lock_irq(&cwq->lock);
 	if (unlikely(cwq->current_work == work)) {
-		insert_wq_barrier(cwq, &barr, 0);
+		insert_wq_barrier(cwq, &barr, cwq->worklist.next);
 		running = 1;
 	}
 	spin_unlock_irq(&cwq->lock);
-- 
cgit v1.2.3


From db700897224b5ebdf852f2d38920ce428940d059 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:49 -0700
Subject: workqueues: implement flush_work()

Most of users of flush_workqueue() can be changed to use cancel_work_sync(),
but sometimes we really need to wait for the completion and cancelling is not
an option. schedule_on_each_cpu() is good example.

Add the new helper, flush_work(work), which waits for the completion of the
specific work_struct. More precisely, it "flushes" the result of of the last
queue_work() which is visible to the caller.

For example, this code

	queue_work(wq, work);
	/* WINDOW */
	queue_work(wq, work);

	flush_work(work);

doesn't necessary work "as expected". What can happen in the WINDOW above is

	- wq starts the execution of work->func()

	- the caller migrates to another CPU

now, after the 2nd queue_work() this work is active on the previous CPU, and
at the same time it is queued on another. In this case flush_work(work) may
return before the first work->func() completes.

It is trivial to add another helper

	int flush_work_sync(struct work_struct *work)
	{
		return flush_work(work) || wait_on_work(work);
	}

which works "more correctly", but it has to iterate over all CPUs and thus
it much slower than flush_work().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Jarek Poplawski <jarkao2@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d9a2d65cc63e..ee41cf857d55 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -423,6 +423,52 @@ void flush_workqueue(struct workqueue_struct *wq)
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
+/**
+ * flush_work - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * It is expected that, prior to calling flush_work(), the caller has
+ * arranged for the work to not be requeued, otherwise it doesn't make
+ * sense to use this function.
+ */
+int flush_work(struct work_struct *work)
+{
+	struct cpu_workqueue_struct *cwq;
+	struct list_head *prev;
+	struct wq_barrier barr;
+
+	might_sleep();
+	cwq = get_wq_data(work);
+	if (!cwq)
+		return 0;
+
+	prev = NULL;
+	spin_lock_irq(&cwq->lock);
+	if (!list_empty(&work->entry)) {
+		/*
+		 * See the comment near try_to_grab_pending()->smp_rmb().
+		 * If it was re-queued under us we are not going to wait.
+		 */
+		smp_rmb();
+		if (unlikely(cwq != get_wq_data(work)))
+			goto out;
+		prev = &work->entry;
+	} else {
+		if (cwq->current_work != work)
+			goto out;
+		prev = &cwq->worklist;
+	}
+	insert_wq_barrier(cwq, &barr, prev->next);
+out:
+	spin_unlock_irq(&cwq->lock);
+	if (!prev)
+		return 0;
+
+	wait_for_completion(&barr.done);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(flush_work);
+
 /*
  * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
  * so this work can't be re-armed in any way.
-- 
cgit v1.2.3


From 8616a89ab761239c963eea3a63be383f127cc7e8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:49 -0700
Subject: workqueues: schedule_on_each_cpu: use flush_work()

Change schedule_on_each_cpu() to use flush_work() instead of
flush_workqueue(), this way we don't wait for other work_struct's which
can be queued meanwhile.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jarek Poplawski <jarkao2@gmail.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee41cf857d55..5fbffd302eb5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -690,7 +690,8 @@ int schedule_on_each_cpu(work_func_t func)
 		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
 	}
-	flush_workqueue(keventd_wq);
+	for_each_online_cpu(cpu)
+		flush_work(per_cpu_ptr(works, cpu));
 	put_online_cpus();
 	free_percpu(works);
 	return 0;
-- 
cgit v1.2.3


From 3da1c84c00c7e5fa8348336bd8c342f9128b0f14 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:50 -0700
Subject: workqueues: make get_online_cpus() useable for work->func()

workqueue_cpu_callback(CPU_DEAD) flushes cwq->thread under
cpu_maps_update_begin().  This means that the multithreaded workqueues
can't use get_online_cpus() due to the possible deadlock, very bad and
very old problem.

Introduce the new state, CPU_POST_DEAD, which is called after
cpu_hotplug_done() but before cpu_maps_update_done().

Change workqueue_cpu_callback() to use CPU_POST_DEAD instead of CPU_DEAD.
This means that create/destroy functions can't rely on get_online_cpus()
any longer and should take cpu_add_remove_lock instead.

[akpm@linux-foundation.org: fix CONFIG_SMP=n]
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Max Krasnyansky <maxk@qualcomm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c       |  5 +++++
 kernel/workqueue.c | 18 +++++++++---------
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2cc409ce0a8f..10ba5f1004a5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -285,6 +285,11 @@ out_allowed:
 	set_cpus_allowed_ptr(current, &old_allowed);
 out_release:
 	cpu_hotplug_done();
+	if (!err) {
+		if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
+					    hcpu) == NOTIFY_BAD)
+			BUG();
+	}
 	return err;
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5fbffd302eb5..828e58230cbc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -828,7 +828,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		err = create_workqueue_thread(cwq, singlethread_cpu);
 		start_workqueue_thread(cwq, -1);
 	} else {
-		get_online_cpus();
+		cpu_maps_update_begin();
 		spin_lock(&workqueue_lock);
 		list_add(&wq->list, &workqueues);
 		spin_unlock(&workqueue_lock);
@@ -840,7 +840,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 			err = create_workqueue_thread(cwq, cpu);
 			start_workqueue_thread(cwq, cpu);
 		}
-		put_online_cpus();
+		cpu_maps_update_done();
 	}
 
 	if (err) {
@@ -854,8 +854,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key);
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
 	/*
-	 * Our caller is either destroy_workqueue() or CPU_DEAD,
-	 * get_online_cpus() protects cwq->thread.
+	 * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
+	 * cpu_add_remove_lock protects cwq->thread.
 	 */
 	if (cwq->thread == NULL)
 		return;
@@ -865,7 +865,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 
 	flush_cpu_workqueue(cwq);
 	/*
-	 * If the caller is CPU_DEAD and cwq->worklist was not empty,
+	 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
 	 * a concurrent flush_workqueue() can insert a barrier after us.
 	 * However, in that case run_workqueue() won't return and check
 	 * kthread_should_stop() until it flushes all work_struct's.
@@ -889,14 +889,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	const cpumask_t *cpu_map = wq_cpu_map(wq);
 	int cpu;
 
-	get_online_cpus();
+	cpu_maps_update_begin();
 	spin_lock(&workqueue_lock);
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
 	for_each_cpu_mask_nr(cpu, *cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
-	put_online_cpus();
+ 	cpu_maps_update_done();
 
 	free_percpu(wq->cpu_wq);
 	kfree(wq);
@@ -935,7 +935,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 		case CPU_UP_CANCELED:
 			start_workqueue_thread(cwq, -1);
-		case CPU_DEAD:
+		case CPU_POST_DEAD:
 			cleanup_workqueue_thread(cwq);
 			break;
 		}
@@ -943,7 +943,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_CANCELED:
-	case CPU_DEAD:
+	case CPU_POST_DEAD:
 		cpu_clear(cpu, cpu_populated_map);
 	}
 
-- 
cgit v1.2.3


From a67da70dc0955580665f5444f318b92e69a3c272 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:52 -0700
Subject: workqueues: lockdep annotations for flush_work()

Add lockdep annotations to flush_work() and update the comment.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jarek Poplawski <jarkao2@o2.pl>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 828e58230cbc..4fcb75b98443 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -427,6 +427,8 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
  * flush_work - block until a work_struct's callback has terminated
  * @work: the work which is to be flushed
  *
+ * Returns false if @work has already terminated.
+ *
  * It is expected that, prior to calling flush_work(), the caller has
  * arranged for the work to not be requeued, otherwise it doesn't make
  * sense to use this function.
@@ -442,6 +444,9 @@ int flush_work(struct work_struct *work)
 	if (!cwq)
 		return 0;
 
+	lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+	lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+
 	prev = NULL;
 	spin_lock_irq(&cwq->lock);
 	if (!list_empty(&work->entry)) {
-- 
cgit v1.2.3


From ef1ca236b8d645349ed6569598ae3f6c1b9511c0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:53 -0700
Subject: workqueues: queue_work() can use queue_work_on()

queue_work() can use queue_work_on() to avoid the code duplication.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4fcb75b98443..fe08a8512ddd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -159,14 +159,11 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
  */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
-	int ret = 0;
+	int ret;
+
+	ret = queue_work_on(get_cpu(), wq, work);
+	put_cpu();
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
-		BUG_ON(!list_empty(&work->entry));
-		__queue_work(wq_per_cpu(wq, get_cpu()), work);
-		put_cpu();
-		ret = 1;
-	}
 	return ret;
 }
 EXPORT_SYMBOL_GPL(queue_work);
-- 
cgit v1.2.3


From 8de6d308bab4f67fcf953562f9f08f9527cad72d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:53 -0700
Subject: workqueues: schedule_on_each_cpu() can use schedule_work_on()

schedule_on_each_cpu() can use schedule_work_on() to avoid the code
duplication.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fe08a8512ddd..7cf430372f89 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -689,8 +689,7 @@ int schedule_on_each_cpu(work_func_t func)
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
-		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
-		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
+		schedule_work_on(cpu, work);
 	}
 	for_each_online_cpu(cpu)
 		flush_work(per_cpu_ptr(works, cpu));
-- 
cgit v1.2.3


From 8448502cfc915f70e3f8923849ade27d472044cb Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 25 Jul 2008 01:47:54 -0700
Subject: workqueues: do CPU_UP_CANCELED if CPU_UP_PREPARE fails

The bug was pointed out by Akinobu Mita <akinobu.mita@gmail.com>, and this
patch is based on his original patch.

workqueue_cpu_callback(CPU_UP_PREPARE) expects that if it returns
NOTIFY_BAD, _cpu_up() will send CPU_UP_CANCELED then.

However, this is not true since

	"cpu hotplug: cpu: deliver CPU_UP_CANCELED only to NOTIFY_OKed callbacks with CPU_UP_PREPARE"
	commit: a0d8cdb652d35af9319a9e0fb7134de2a276c636

The callback which has returned NOTIFY_BAD will not receive
CPU_UP_CANCELED.  Change the code to fulfil the CPU_UP_CANCELED logic if
CPU_UP_PREPARE fails.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Reported-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7cf430372f89..ec7e4f62aaff 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -911,6 +911,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	unsigned int cpu = (unsigned long)hcpu;
 	struct cpu_workqueue_struct *cwq;
 	struct workqueue_struct *wq;
+	int ret = NOTIFY_OK;
 
 	action &= ~CPU_TASKS_FROZEN;
 
@@ -918,7 +919,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	case CPU_UP_PREPARE:
 		cpu_set(cpu, cpu_populated_map);
 	}
-
+undo:
 	list_for_each_entry(wq, &workqueues, list) {
 		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 
@@ -928,7 +929,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 				break;
 			printk(KERN_ERR "workqueue [%s] for %i failed\n",
 				wq->name, cpu);
-			return NOTIFY_BAD;
+			action = CPU_UP_CANCELED;
+			ret = NOTIFY_BAD;
+			goto undo;
 
 		case CPU_ONLINE:
 			start_workqueue_thread(cwq, cpu);
@@ -948,7 +951,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		cpu_clear(cpu, cpu_populated_map);
 	}
 
-	return NOTIFY_OK;
+	return ret;
 }
 
 void __init init_workqueues(void)
-- 
cgit v1.2.3


From 339caf2a224fc9af0f01686bf287dda32c6efca6 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 25 Jul 2008 01:48:31 -0700
Subject: proc: misplaced export of find_get_pid

Move EXPORT_SYMBOL right after the func

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 30bd5d4b2ac7..753fd90d9ec1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -435,6 +435,7 @@ struct pid *find_get_pid(pid_t nr)
 
 	return pid;
 }
+EXPORT_SYMBOL_GPL(find_get_pid);
 
 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 {
@@ -497,7 +498,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 
 	return pid;
 }
-EXPORT_SYMBOL_GPL(find_get_pid);
 
 /*
  * The pid hash table is scaled according to the amount of memory in the
-- 
cgit v1.2.3


From 99541c23cd32bacf1a591ca537a7c0cb9053ad7e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@parallels.com>
Date: Fri, 25 Jul 2008 01:48:31 -0700
Subject: sysctl: check for bogus modes

Catch, e. g., 644/0644 typo.

Signed-off-by: Alexey Dobriyan <adobriyan@parallels.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl_check.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c09350d564f2..c35da23ab8fb 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 			sysctl_check_leaf(namespaces, table, &fail);
 		}
 		sysctl_check_bin_path(table, &fail);
+		if (table->mode > 0777)
+			set_fail(&fail, table, "bogus .mode");
 		if (fail) {
 			set_fail(&fail, table, NULL);
 			error = -EINVAL;
-- 
cgit v1.2.3


From 19b0cfcca41dd772065671ad0584e1cea0f3fd13 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:35 -0700
Subject: pidns: remove now unused kill_proc function

This function operated on a pid_t to kill a task, which is no longer valid
in a containerized system.

It has finally lost all its users and we can safely remove it from the
tree.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 5c7b7eaa0dc6..82c3545596c5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1228,17 +1228,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
 }
 EXPORT_SYMBOL(kill_pid);
 
-int
-kill_proc(pid_t pid, int sig, int priv)
-{
-	int ret;
-
-	rcu_read_lock();
-	ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
-	rcu_read_unlock();
-	return ret;
-}
-
 /*
  * These functions support sending signals using preallocated sigqueue
  * structures.  This is needed "because realtime applications cannot
@@ -1906,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending);
 EXPORT_SYMBOL_GPL(dequeue_signal);
 EXPORT_SYMBOL(flush_signals);
 EXPORT_SYMBOL(force_sig);
-EXPORT_SYMBOL(kill_proc);
 EXPORT_SYMBOL(ptrace_notify);
 EXPORT_SYMBOL(send_sig);
 EXPORT_SYMBOL(send_sig_info);
-- 
cgit v1.2.3


From e49859e71e0318b564de1546bdc30fab738f9deb Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:36 -0700
Subject: pidns: remove now unused find_pid function.

This one had the only users so far - the kill_proc, which is removed, so
drop this (invalid in namespaced world) call too.

And of course - erase all references on it from comments.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid.c b/kernel/pid.c
index 753fd90d9ec1..064e76afa507 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -309,12 +309,6 @@ struct pid *find_vpid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_vpid);
 
-struct pid *find_pid(int nr)
-{
-	return find_pid_ns(nr, &init_pid_ns);
-}
-EXPORT_SYMBOL_GPL(find_pid);
-
 /*
  * attach_pid() must be called with the tasklist_lock write-held.
  */
@@ -483,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
 /*
  * Used by proc to find the first pid that is greater then or equal to nr.
  *
- * If there is a pid at nr this function is exactly the same as find_pid.
+ * If there is a pid at nr this function is exactly the same as find_pid_ns.
  */
 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
-- 
cgit v1.2.3


From 28325df0d9339b7f3aba9c45174d4586223ef46b Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 25 Jul 2008 01:48:38 -0700
Subject: markers: use rcu_barrier_sched() and call_rcu_sched()

rcu_barrier_sched() and call_rcu_sched() were introduced in 2.6.26 for the
Markers.  Change the marker code to use them.

It can be seen as a fix since the marker code was using an ugly,
temporary, #ifdef hack to work around CONFIG_PREEMPT_RCU.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Paul McKenney <paulmck@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/marker.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 1abfb923b761..971da5317903 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -441,7 +441,7 @@ static int remove_marker(const char *name)
 	hlist_del(&e->hlist);
 	/* Make sure the call_rcu has been executed */
 	if (e->rcu_pending)
-		rcu_barrier();
+		rcu_barrier_sched();
 	kfree(e);
 	return 0;
 }
@@ -476,7 +476,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
 	hlist_del(&(*entry)->hlist);
 	/* Make sure the call_rcu has been executed */
 	if ((*entry)->rcu_pending)
-		rcu_barrier();
+		rcu_barrier_sched();
 	kfree(*entry);
 	*entry = e;
 	trace_mark(core_marker_format, "name %s format %s",
@@ -655,7 +655,7 @@ int marker_probe_register(const char *name, const char *format,
 	 * make sure it's executed now.
 	 */
 	if (entry->rcu_pending)
-		rcu_barrier();
+		rcu_barrier_sched();
 	old = marker_entry_add_probe(entry, probe, probe_private);
 	if (IS_ERR(old)) {
 		ret = PTR_ERR(old);
@@ -670,10 +670,7 @@ int marker_probe_register(const char *name, const char *format,
 	entry->rcu_pending = 1;
 	/* write rcu_pending before calling the RCU callback */
 	smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
-	synchronize_sched();	/* Until we have the call_rcu_sched() */
-#endif
-	call_rcu(&entry->rcu, free_old_closure);
+	call_rcu_sched(&entry->rcu, free_old_closure);
 end:
 	mutex_unlock(&markers_mutex);
 	return ret;
@@ -704,7 +701,7 @@ int marker_probe_unregister(const char *name,
 	if (!entry)
 		goto end;
 	if (entry->rcu_pending)
-		rcu_barrier();
+		rcu_barrier_sched();
 	old = marker_entry_remove_probe(entry, probe, probe_private);
 	mutex_unlock(&markers_mutex);
 	marker_update_probes();		/* may update entry */
@@ -716,10 +713,7 @@ int marker_probe_unregister(const char *name,
 	entry->rcu_pending = 1;
 	/* write rcu_pending before calling the RCU callback */
 	smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
-	synchronize_sched();	/* Until we have the call_rcu_sched() */
-#endif
-	call_rcu(&entry->rcu, free_old_closure);
+	call_rcu_sched(&entry->rcu, free_old_closure);
 	remove_marker(name);	/* Ignore busy error message */
 	ret = 0;
 end:
@@ -786,7 +780,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
 		goto end;
 	}
 	if (entry->rcu_pending)
-		rcu_barrier();
+		rcu_barrier_sched();
 	old = marker_entry_remove_probe(entry, NULL, probe_private);
 	mutex_unlock(&markers_mutex);
 	marker_update_probes();		/* may update entry */
@@ -797,10 +791,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
 	entry->rcu_pending = 1;
 	/* write rcu_pending before calling the RCU callback */
 	smp_wmb();
-#ifdef CONFIG_PREEMPT_RCU
-	synchronize_sched();	/* Until we have the call_rcu_sched() */
-#endif
-	call_rcu(&entry->rcu, free_old_closure);
+	call_rcu_sched(&entry->rcu, free_old_closure);
 	remove_marker(entry->name);	/* Ignore busy error message */
 end:
 	mutex_unlock(&markers_mutex);
-- 
cgit v1.2.3


From a89cc1959d0ea5f36bf7421dc97b34f03809637d Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 25 Jul 2008 01:48:39 -0700
Subject: markers: fix sparse integer as NULL pointer warning

kernel/trace/trace_sysprof.c:164:20: warning: Using plain integer as NULL pointer

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/trace_sysprof.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 63528086337c..ce2d723c10e1 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -161,7 +161,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
 		__trace_special(tr, data, 2, regs->ip, 0);
 
 		while (i < sample_max_depth) {
-			frame.next_fp = 0;
+			frame.next_fp = NULL;
 			frame.return_address = 0;
 			if (!copy_stack_frame(fp, &frame))
 				break;
-- 
cgit v1.2.3


From 7394f0f6c0baab650ea9194cb1be847df646fb57 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 01:48:40 -0700
Subject: unexport uts_sem

With the removal of the Solaris binary emulation the export of
uts_sem became unused.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 6c2188046048..0c9d3fa1f5ff 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1343,8 +1343,6 @@ EXPORT_SYMBOL(in_egroup_p);
 
 DECLARE_RWSEM(uts_sem);
 
-EXPORT_SYMBOL(uts_sem);
-
 asmlinkage long sys_newuname(struct new_utsname __user * name)
 {
 	int errno = 0;
-- 
cgit v1.2.3


From 49b5cf34727a6c1be1568ab28e89a2d9a6bf51e0 Mon Sep 17 00:00:00 2001
From: Jonathan Lim <jlim@sgi.com>
Date: Fri, 25 Jul 2008 01:48:40 -0700
Subject: accounting: account for user time when updating memory integrals

Adapt acct_update_integrals() to include user time when calculating the time
difference.  The units of acct_rss_mem1 and acct_vm_mem1 are also changed from
pages-jiffies to pages-usecs to avoid calling jiffies_to_usecs() in
xacct_add_tsk() which might overflow.

Signed-off-by: Jonathan Lim <jlim@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c  |  2 ++
 kernel/tsacct.c | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6acf749d3336..0047bd9b96aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4046,6 +4046,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
+	/* Account for user time used */
+	acct_update_integrals(p);
 }
 
 /*
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..1da6990af8e0 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
 	struct mm_struct *mm;
 
-	/* convert pages-jiffies to Mbyte-usec */
-	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
-	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+	/* convert pages-usec to Mbyte-usec */
+	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
+	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
 	mm = get_task_mm(p);
 	if (mm) {
 		/* adjust to KB unit */
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta = cputime_to_jiffies(
-			cputime_sub(tsk->stime, tsk->acct_stimexpd));
+		cputime_t time, dtime;
+		struct timeval value;
+		u64 delta;
+
+		time = tsk->stime + tsk->utime;
+		dtime = cputime_sub(time, tsk->acct_timexpd);
+		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+		delta = value.tv_sec;
+		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
 			return;
-		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	tsk->acct_stimexpd = 0;
+	tsk->acct_timexpd = 0;
 	tsk->acct_rss_mem1 = 0;
 	tsk->acct_vm_mem1 = 0;
 }
-- 
cgit v1.2.3


From 081e4c8a75692c21f3a119a81ca3270081879d0e Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:42 -0700
Subject: bsdacct: rename acct_gbls to bsd_acct_struct

After I fixed access to task->tgid in kernel/acct.c, Oleg pointed out some
bad side effects with this accounting vs pid namespaces interaction.  I.e.
 when some task in pid namespace sets this accounting up, this blocks all
the others from doing the same.  Restricting this to init namespace only
could help, but didn't look a graceful solution.

So here is the approach to make this accounting work with pid namespaces
properly.

The idea is simple - when a task dies it accounts itself in each namespace
it is visible from and which set the accounting up.

For example here are the commands run and the output of lastcomm from init
and sub namespaces:

init_ns# accton pacct
 sub_ns# accton pacct (this is a different file - sub ns is run in
                       a chroot-ed environment)
init_ns# cat /dev/null
 sub_ns# ls /dev/null
init_ns# accton
 sub_ns# accton

 sub_ns#  lastcomm -f pacct
ls                      0        [136,0]    0.00 secs Thu May 15 10:30
accton                  0        [136,0]    0.00 secs Thu May 15 10:30

init_ns# lastcomm -f pacct
accton                  root     pts/0      0.00 secs Thu May 15 14:30 << got from sub
cat                     root     pts/1      0.00 secs Thu May 15 14:30
ls                      root     pts/0      0.00 secs Thu May 15 14:30 << got from sub
accton                  root     pts/1      0.00 secs Thu May 15 14:30

That was the summary, the details are in patches.

This patch:

It will be visible in pid_namespace.h file, so fix its name to look better
outside the acct.c file.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d2..ee3e605190f9 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -82,7 +82,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *);
  * can be placed in the same cache line as the lock.  This primes
  * the cache line to have the data after getting the lock.
  */
-struct acct_glbs {
+struct bsd_acct_struct {
 	spinlock_t		lock;
 	volatile int		active;
 	volatile int		needcheck;
@@ -91,7 +91,7 @@ struct acct_glbs {
 	struct timer_list	timer;
 };
 
-static struct acct_glbs acct_globals __cacheline_aligned =
+static struct bsd_acct_struct acct_globals __cacheline_aligned =
 	{__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
 
 /*
-- 
cgit v1.2.3


From 84406c153a5bfa5d8b428a0933e9d39db6b59a75 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:42 -0700
Subject: pidns: use kzalloc when allocating new pid_namespace struct

It makes many fields initialization implicit helping in auto-setting
#ifdef-ed fields (bsd-acct related pointer will be such).

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid_namespace.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 98702b4b8851..06331cc1c3f5 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -71,7 +71,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
 	struct pid_namespace *ns;
 	int i;
 
-	ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
+	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
 	if (ns == NULL)
 		goto out;
 
@@ -84,17 +84,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
 		goto out_free_map;
 
 	kref_init(&ns->kref);
-	ns->last_pid = 0;
-	ns->child_reaper = NULL;
 	ns->level = level;
 
 	set_bit(0, ns->pidmap[0].page);
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
 
-	for (i = 1; i < PIDMAP_ENTRIES; i++) {
-		ns->pidmap[i].page = NULL;
+	for (i = 1; i < PIDMAP_ENTRIES; i++)
 		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-	}
 
 	return ns;
 
-- 
cgit v1.2.3


From 1c552858ac2b1732a99d234d46b98098baef41ff Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:44 -0700
Subject: bsdacct: "truthify" a comment near acct_process

The acct_process does not accept any arguments actually.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index ee3e605190f9..d9ee1838b4d4 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -579,7 +579,6 @@ void acct_collect(long exitcode, int group_dead)
 
 /**
  * acct_process - now just a wrapper around do_acct_process
- * @exitcode: task exit code
  *
  * handles process accounting for an exiting task
  */
-- 
cgit v1.2.3


From e59a04a7aa5ce2483470aee4f2eb79ba6b9afe8b Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:44 -0700
Subject: bsdacct: make check timer accept a bsd_acct_struct argument

We're going to have many bsd_acct_struct instances, not just one, so the
timer (currently working with a global one) has to know which one to work
with.

Use a handy setup_timer macro for it (thanks to Oleg for one).

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index d9ee1838b4d4..05f8bc094a4b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -97,9 +97,10 @@ static struct bsd_acct_struct acct_globals __cacheline_aligned =
 /*
  * Called whenever the timer says to check the free space.
  */
-static void acct_timeout(unsigned long unused)
+static void acct_timeout(unsigned long x)
 {
-	acct_globals.needcheck = 1;
+	struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
+	acct->needcheck = 1;
 }
 
 /*
@@ -193,8 +194,8 @@ static void acct_file_reopen(struct file *file)
 		acct_globals.needcheck = 0;
 		acct_globals.active = 1;
 		/* It's been deleted if it was used before so this is safe */
-		init_timer(&acct_globals.timer);
-		acct_globals.timer.function = acct_timeout;
+		setup_timer(&acct_globals.timer, acct_timeout,
+				(unsigned long)&acct_globals);
 		acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
 		add_timer(&acct_globals.timer);
 	}
-- 
cgit v1.2.3


From a75d97976517dcda69150fd81d6be86ae63324a1 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:45 -0700
Subject: bsdacct: turn the acct_lock from on-the-struct to global

Don't use per-bsd-acct-struct lock, but work with a global one.

This lock is taken for short periods, so it doesn't seem it'll become a
bottleneck, but it will allow us to easily avoid many locking difficulties
in the future.

So this is a mostly s/acct_globals.lock/acct_lock/ over the file.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 05f8bc094a4b..fc71c1304977 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -83,7 +83,6 @@ static void do_acct_process(struct pid_namespace *ns, struct file *);
  * the cache line to have the data after getting the lock.
  */
 struct bsd_acct_struct {
-	spinlock_t		lock;
 	volatile int		active;
 	volatile int		needcheck;
 	struct file		*file;
@@ -91,8 +90,9 @@ struct bsd_acct_struct {
 	struct timer_list	timer;
 };
 
-static struct bsd_acct_struct acct_globals __cacheline_aligned =
-	{__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
+static DEFINE_SPINLOCK(acct_lock);
+
+static struct bsd_acct_struct acct_globals __cacheline_aligned;
 
 /*
  * Called whenever the timer says to check the free space.
@@ -114,11 +114,11 @@ static int check_free_space(struct file *file)
 	sector_t resume;
 	sector_t suspend;
 
-	spin_lock(&acct_globals.lock);
+	spin_lock(&acct_lock);
 	res = acct_globals.active;
 	if (!file || !acct_globals.needcheck)
 		goto out;
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 
 	/* May block */
 	if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -140,7 +140,7 @@ static int check_free_space(struct file *file)
 	 * If some joker switched acct_globals.file under us we'ld better be
 	 * silent and _not_ touch anything.
 	 */
-	spin_lock(&acct_globals.lock);
+	spin_lock(&acct_lock);
 	if (file != acct_globals.file) {
 		if (act)
 			res = act>0;
@@ -165,7 +165,7 @@ static int check_free_space(struct file *file)
 	add_timer(&acct_globals.timer);
 	res = acct_globals.active;
 out:
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 	return res;
 }
 
@@ -173,7 +173,7 @@ out:
  * Close the old accounting file (if currently open) and then replace
  * it with file (if non-NULL).
  *
- * NOTE: acct_globals.lock MUST be held on entry and exit.
+ * NOTE: acct_lock MUST be held on entry and exit.
  */
 static void acct_file_reopen(struct file *file)
 {
@@ -201,11 +201,11 @@ static void acct_file_reopen(struct file *file)
 	}
 	if (old_acct) {
 		mnt_unpin(old_acct->f_path.mnt);
-		spin_unlock(&acct_globals.lock);
+		spin_unlock(&acct_lock);
 		do_acct_process(old_ns, old_acct);
 		filp_close(old_acct, NULL);
 		put_pid_ns(old_ns);
-		spin_lock(&acct_globals.lock);
+		spin_lock(&acct_lock);
 	}
 }
 
@@ -235,10 +235,10 @@ static int acct_on(char *name)
 		return error;
 	}
 
-	spin_lock(&acct_globals.lock);
+	spin_lock(&acct_lock);
 	mnt_pin(file->f_path.mnt);
 	acct_file_reopen(file);
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 
 	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
 
@@ -272,9 +272,9 @@ asmlinkage long sys_acct(const char __user *name)
 	} else {
 		error = security_acct(NULL);
 		if (!error) {
-			spin_lock(&acct_globals.lock);
+			spin_lock(&acct_lock);
 			acct_file_reopen(NULL);
-			spin_unlock(&acct_globals.lock);
+			spin_unlock(&acct_lock);
 		}
 	}
 	return error;
@@ -289,10 +289,10 @@ asmlinkage long sys_acct(const char __user *name)
  */
 void acct_auto_close_mnt(struct vfsmount *m)
 {
-	spin_lock(&acct_globals.lock);
+	spin_lock(&acct_lock);
 	if (acct_globals.file && acct_globals.file->f_path.mnt == m)
 		acct_file_reopen(NULL);
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 }
 
 /**
@@ -304,12 +304,12 @@ void acct_auto_close_mnt(struct vfsmount *m)
  */
 void acct_auto_close(struct super_block *sb)
 {
-	spin_lock(&acct_globals.lock);
+	spin_lock(&acct_lock);
 	if (acct_globals.file &&
 	    acct_globals.file->f_path.mnt->mnt_sb == sb) {
 		acct_file_reopen(NULL);
 	}
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 }
 
 /*
@@ -594,15 +594,15 @@ void acct_process(void)
 	if (!acct_globals.file)
 		return;
 
-	spin_lock(&acct_globals.lock);
+	spin_lock(&acct_lock);
 	file = acct_globals.file;
 	if (unlikely(!file)) {
-		spin_unlock(&acct_globals.lock);
+		spin_unlock(&acct_lock);
 		return;
 	}
 	get_file(file);
 	ns = get_pid_ns(acct_globals.ns);
-	spin_unlock(&acct_globals.lock);
+	spin_unlock(&acct_lock);
 
 	do_acct_process(ns, file);
 	fput(file);
-- 
cgit v1.2.3


From 6248b1b342005a428b1247b4e89249da1528d88d Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:46 -0700
Subject: bsdacct: make internal code work with passed bsd_acct_struct, not
 global

This adds the appropriate pointer to all the internal (i.e.  static)
functions that work with global acct instance.  API calls pass a global
instance to them (while we still have such).

Mostly this is a s/acct_globals./acct->/ over the file.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 77 ++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 39 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index fc71c1304977..72d4760c8da8 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,7 +75,8 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(struct pid_namespace *ns, struct file *);
+static void do_acct_process(struct bsd_acct_struct *acct,
+		struct pid_namespace *ns, struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
@@ -106,7 +107,7 @@ static void acct_timeout(unsigned long x)
 /*
  * Check the amount of free space and suspend/resume accordingly.
  */
-static int check_free_space(struct file *file)
+static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
 {
 	struct kstatfs sbuf;
 	int res;
@@ -115,8 +116,8 @@ static int check_free_space(struct file *file)
 	sector_t suspend;
 
 	spin_lock(&acct_lock);
-	res = acct_globals.active;
-	if (!file || !acct_globals.needcheck)
+	res = acct->active;
+	if (!file || !acct->needcheck)
 		goto out;
 	spin_unlock(&acct_lock);
 
@@ -137,33 +138,33 @@ static int check_free_space(struct file *file)
 		act = 0;
 
 	/*
-	 * If some joker switched acct_globals.file under us we'ld better be
+	 * If some joker switched acct->file under us we'ld better be
 	 * silent and _not_ touch anything.
 	 */
 	spin_lock(&acct_lock);
-	if (file != acct_globals.file) {
+	if (file != acct->file) {
 		if (act)
 			res = act>0;
 		goto out;
 	}
 
-	if (acct_globals.active) {
+	if (acct->active) {
 		if (act < 0) {
-			acct_globals.active = 0;
+			acct->active = 0;
 			printk(KERN_INFO "Process accounting paused\n");
 		}
 	} else {
 		if (act > 0) {
-			acct_globals.active = 1;
+			acct->active = 1;
 			printk(KERN_INFO "Process accounting resumed\n");
 		}
 	}
 
-	del_timer(&acct_globals.timer);
-	acct_globals.needcheck = 0;
-	acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-	add_timer(&acct_globals.timer);
-	res = acct_globals.active;
+	del_timer(&acct->timer);
+	acct->needcheck = 0;
+	acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+	add_timer(&acct->timer);
+	res = acct->active;
 out:
 	spin_unlock(&acct_lock);
 	return res;
@@ -175,34 +176,33 @@ out:
  *
  * NOTE: acct_lock MUST be held on entry and exit.
  */
-static void acct_file_reopen(struct file *file)
+static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file)
 {
 	struct file *old_acct = NULL;
 	struct pid_namespace *old_ns = NULL;
 
-	if (acct_globals.file) {
-		old_acct = acct_globals.file;
-		old_ns = acct_globals.ns;
-		del_timer(&acct_globals.timer);
-		acct_globals.active = 0;
-		acct_globals.needcheck = 0;
-		acct_globals.file = NULL;
+	if (acct->file) {
+		old_acct = acct->file;
+		old_ns = acct->ns;
+		del_timer(&acct->timer);
+		acct->active = 0;
+		acct->needcheck = 0;
+		acct->file = NULL;
 	}
 	if (file) {
-		acct_globals.file = file;
-		acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
-		acct_globals.needcheck = 0;
-		acct_globals.active = 1;
+		acct->file = file;
+		acct->ns = get_pid_ns(task_active_pid_ns(current));
+		acct->needcheck = 0;
+		acct->active = 1;
 		/* It's been deleted if it was used before so this is safe */
-		setup_timer(&acct_globals.timer, acct_timeout,
-				(unsigned long)&acct_globals);
-		acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ;
-		add_timer(&acct_globals.timer);
+		setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
+		acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+		add_timer(&acct->timer);
 	}
 	if (old_acct) {
 		mnt_unpin(old_acct->f_path.mnt);
 		spin_unlock(&acct_lock);
-		do_acct_process(old_ns, old_acct);
+		do_acct_process(acct, old_ns, old_acct);
 		filp_close(old_acct, NULL);
 		put_pid_ns(old_ns);
 		spin_lock(&acct_lock);
@@ -237,7 +237,7 @@ static int acct_on(char *name)
 
 	spin_lock(&acct_lock);
 	mnt_pin(file->f_path.mnt);
-	acct_file_reopen(file);
+	acct_file_reopen(&acct_globals, file);
 	spin_unlock(&acct_lock);
 
 	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
@@ -273,7 +273,7 @@ asmlinkage long sys_acct(const char __user *name)
 		error = security_acct(NULL);
 		if (!error) {
 			spin_lock(&acct_lock);
-			acct_file_reopen(NULL);
+			acct_file_reopen(&acct_globals, NULL);
 			spin_unlock(&acct_lock);
 		}
 	}
@@ -291,7 +291,7 @@ void acct_auto_close_mnt(struct vfsmount *m)
 {
 	spin_lock(&acct_lock);
 	if (acct_globals.file && acct_globals.file->f_path.mnt == m)
-		acct_file_reopen(NULL);
+		acct_file_reopen(&acct_globals, NULL);
 	spin_unlock(&acct_lock);
 }
 
@@ -307,7 +307,7 @@ void acct_auto_close(struct super_block *sb)
 	spin_lock(&acct_lock);
 	if (acct_globals.file &&
 	    acct_globals.file->f_path.mnt->mnt_sb == sb) {
-		acct_file_reopen(NULL);
+		acct_file_reopen(&acct_globals, NULL);
 	}
 	spin_unlock(&acct_lock);
 }
@@ -426,7 +426,8 @@ static u32 encode_float(u64 value)
 /*
  *  do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(struct pid_namespace *ns, struct file *file)
+static void do_acct_process(struct bsd_acct_struct *acct,
+		struct pid_namespace *ns, struct file *file)
 {
 	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
@@ -441,7 +442,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
 	 * First check to see if there is enough free_space to continue
 	 * the process accounting system.
 	 */
-	if (!check_free_space(file))
+	if (!check_free_space(acct, file))
 		return;
 
 	/*
@@ -604,7 +605,7 @@ void acct_process(void)
 	ns = get_pid_ns(acct_globals.ns);
 	spin_unlock(&acct_lock);
 
-	do_acct_process(ns, file);
+	do_acct_process(&acct_globals, ns, file);
 	fput(file);
 	put_pid_ns(ns);
 }
-- 
cgit v1.2.3


From 0b6b030fc30d169bb406b34b4fc60d99dde4a9c6 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:47 -0700
Subject: bsdacct: switch from global bsd_acct_struct instance to per-pidns one

Allocate the structure on the first call to sys_acct().  After this each
namespace, that ordered the accounting, will live with this structure till
its own death.

Two notes
- routines, that close the accounting on fs umount time use
  the init_pid_ns's acct by now;
- accounting routine accounts to dying task's namespace
  (also by now).

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c          | 84 +++++++++++++++++++++++++++++++++++++++-----------
 kernel/pid_namespace.c |  2 ++
 2 files changed, 68 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 72d4760c8da8..febbbc67157e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -93,8 +93,6 @@ struct bsd_acct_struct {
 
 static DEFINE_SPINLOCK(acct_lock);
 
-static struct bsd_acct_struct acct_globals __cacheline_aligned;
-
 /*
  * Called whenever the timer says to check the free space.
  */
@@ -176,7 +174,8 @@ out:
  *
  * NOTE: acct_lock MUST be held on entry and exit.
  */
-static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file)
+static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
+		struct pid_namespace *ns)
 {
 	struct file *old_acct = NULL;
 	struct pid_namespace *old_ns = NULL;
@@ -188,10 +187,11 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file)
 		acct->active = 0;
 		acct->needcheck = 0;
 		acct->file = NULL;
+		acct->ns = NULL;
 	}
 	if (file) {
 		acct->file = file;
-		acct->ns = get_pid_ns(task_active_pid_ns(current));
+		acct->ns = ns;
 		acct->needcheck = 0;
 		acct->active = 1;
 		/* It's been deleted if it was used before so this is safe */
@@ -204,7 +204,6 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file)
 		spin_unlock(&acct_lock);
 		do_acct_process(acct, old_ns, old_acct);
 		filp_close(old_acct, NULL);
-		put_pid_ns(old_ns);
 		spin_lock(&acct_lock);
 	}
 }
@@ -213,6 +212,8 @@ static int acct_on(char *name)
 {
 	struct file *file;
 	int error;
+	struct pid_namespace *ns;
+	struct bsd_acct_struct *acct = NULL;
 
 	/* Difference from BSD - they don't do O_APPEND */
 	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -229,18 +230,34 @@ static int acct_on(char *name)
 		return -EIO;
 	}
 
+	ns = task_active_pid_ns(current);
+	if (ns->bacct == NULL) {
+		acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+		if (acct == NULL) {
+			filp_close(file, NULL);
+			return -ENOMEM;
+		}
+	}
+
 	error = security_acct(file);
 	if (error) {
+		kfree(acct);
 		filp_close(file, NULL);
 		return error;
 	}
 
 	spin_lock(&acct_lock);
+	if (ns->bacct == NULL) {
+		ns->bacct = acct;
+		acct = NULL;
+	}
+
 	mnt_pin(file->f_path.mnt);
-	acct_file_reopen(&acct_globals, file);
+	acct_file_reopen(ns->bacct, file, ns);
 	spin_unlock(&acct_lock);
 
 	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
+	kfree(acct);
 
 	return 0;
 }
@@ -270,10 +287,16 @@ asmlinkage long sys_acct(const char __user *name)
 		error = acct_on(tmp);
 		putname(tmp);
 	} else {
+		struct bsd_acct_struct *acct;
+
+		acct = task_active_pid_ns(current)->bacct;
+		if (acct == NULL)
+			return 0;
+
 		error = security_acct(NULL);
 		if (!error) {
 			spin_lock(&acct_lock);
-			acct_file_reopen(&acct_globals, NULL);
+			acct_file_reopen(acct, NULL, NULL);
 			spin_unlock(&acct_lock);
 		}
 	}
@@ -289,9 +312,15 @@ asmlinkage long sys_acct(const char __user *name)
  */
 void acct_auto_close_mnt(struct vfsmount *m)
 {
+	struct bsd_acct_struct *acct;
+
+	acct = init_pid_ns.bacct;
+	if (acct == NULL)
+		return;
+
 	spin_lock(&acct_lock);
-	if (acct_globals.file && acct_globals.file->f_path.mnt == m)
-		acct_file_reopen(&acct_globals, NULL);
+	if (acct->file && acct->file->f_path.mnt == m)
+		acct_file_reopen(acct, NULL, NULL);
 	spin_unlock(&acct_lock);
 }
 
@@ -304,10 +333,29 @@ void acct_auto_close_mnt(struct vfsmount *m)
  */
 void acct_auto_close(struct super_block *sb)
 {
+	struct bsd_acct_struct *acct;
+
+	acct = init_pid_ns.bacct;
+	if (acct == NULL)
+		return;
+
 	spin_lock(&acct_lock);
-	if (acct_globals.file &&
-	    acct_globals.file->f_path.mnt->mnt_sb == sb) {
-		acct_file_reopen(&acct_globals, NULL);
+	if (acct->file && acct->file->f_path.mnt->mnt_sb == sb)
+		acct_file_reopen(acct, NULL, NULL);
+	spin_unlock(&acct_lock);
+}
+
+void acct_exit_ns(struct pid_namespace *ns)
+{
+	struct bsd_acct_struct *acct;
+
+	spin_lock(&acct_lock);
+	acct = ns->bacct;
+	if (acct != NULL) {
+		if (acct->file != NULL)
+			acct_file_reopen(acct, NULL, NULL);
+
+		kfree(acct);
 	}
 	spin_unlock(&acct_lock);
 }
@@ -587,25 +635,25 @@ void acct_collect(long exitcode, int group_dead)
 void acct_process(void)
 {
 	struct file *file = NULL;
-	struct pid_namespace *ns;
+	struct pid_namespace *ns = task_active_pid_ns(current);
+	struct bsd_acct_struct *acct;
 
+	acct = ns->bacct;
 	/*
 	 * accelerate the common fastpath:
 	 */
-	if (!acct_globals.file)
+	if (!acct || !acct->file)
 		return;
 
 	spin_lock(&acct_lock);
-	file = acct_globals.file;
+	file = acct->file;
 	if (unlikely(!file)) {
 		spin_unlock(&acct_lock);
 		return;
 	}
 	get_file(file);
-	ns = get_pid_ns(acct_globals.ns);
 	spin_unlock(&acct_lock);
 
-	do_acct_process(&acct_globals, ns, file);
+	do_acct_process(acct, ns, file);
 	fput(file);
-	put_pid_ns(ns);
 }
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 06331cc1c3f5..ea567b78d1aa 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/syscalls.h>
 #include <linux/err.h>
+#include <linux/acct.h>
 
 #define BITS_PER_PAGE		(PAGE_SIZE*8)
 
@@ -181,6 +182,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 
 	/* Child reaper for the pid namespace is going away */
 	pid_ns->child_reaper = NULL;
+	acct_exit_ns(pid_ns);
 	return;
 }
 
-- 
cgit v1.2.3


From b5a7174875ea570cc675f2c503e800db8efdd6a7 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:47 -0700
Subject: bsdacct: turn acct off for all pidns-s on umount time

All the bsd_acct_strcts with opened accounting are linked into a global
list.  So, the acct_auto_close(_mnt) walks one and drops the accounting
for each.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index febbbc67157e..7fc9f9dd1e9e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,9 +89,11 @@ struct bsd_acct_struct {
 	struct file		*file;
 	struct pid_namespace	*ns;
 	struct timer_list	timer;
+	struct list_head	list;
 };
 
 static DEFINE_SPINLOCK(acct_lock);
+static LIST_HEAD(acct_list);
 
 /*
  * Called whenever the timer says to check the free space.
@@ -188,12 +190,14 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 		acct->needcheck = 0;
 		acct->file = NULL;
 		acct->ns = NULL;
+		list_del(&acct->list);
 	}
 	if (file) {
 		acct->file = file;
 		acct->ns = ns;
 		acct->needcheck = 0;
 		acct->active = 1;
+		list_add(&acct->list, &acct_list);
 		/* It's been deleted if it was used before so this is safe */
 		setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
 		acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
@@ -314,13 +318,13 @@ void acct_auto_close_mnt(struct vfsmount *m)
 {
 	struct bsd_acct_struct *acct;
 
-	acct = init_pid_ns.bacct;
-	if (acct == NULL)
-		return;
-
 	spin_lock(&acct_lock);
-	if (acct->file && acct->file->f_path.mnt == m)
-		acct_file_reopen(acct, NULL, NULL);
+restart:
+	list_for_each_entry(acct, &acct_list, list)
+		if (acct->file && acct->file->f_path.mnt == m) {
+			acct_file_reopen(acct, NULL, NULL);
+			goto restart;
+		}
 	spin_unlock(&acct_lock);
 }
 
@@ -335,13 +339,13 @@ void acct_auto_close(struct super_block *sb)
 {
 	struct bsd_acct_struct *acct;
 
-	acct = init_pid_ns.bacct;
-	if (acct == NULL)
-		return;
-
 	spin_lock(&acct_lock);
-	if (acct->file && acct->file->f_path.mnt->mnt_sb == sb)
-		acct_file_reopen(acct, NULL, NULL);
+restart:
+	list_for_each_entry(acct, &acct_list, list)
+		if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+			acct_file_reopen(acct, NULL, NULL);
+			goto restart;
+		}
 	spin_unlock(&acct_lock);
 }
 
-- 
cgit v1.2.3


From 7d1e13505be8c2bd2207894f4e0f069e1f9b51c9 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:48 -0700
Subject: bsdacct: account dying tasks in all relevant namespaces

This just makes the acct_proces walk the pid namespaces from current up to
the top and account a task in each with the accounting turned on.

ns->parent access if safe lockless, since current it still alive and holds
its namespace, which in turn holds its parent.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 7fc9f9dd1e9e..0feba97e114e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -631,15 +631,9 @@ void acct_collect(long exitcode, int group_dead)
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
-/**
- * acct_process - now just a wrapper around do_acct_process
- *
- * handles process accounting for an exiting task
- */
-void acct_process(void)
+static void acct_process_in_ns(struct pid_namespace *ns)
 {
 	struct file *file = NULL;
-	struct pid_namespace *ns = task_active_pid_ns(current);
 	struct bsd_acct_struct *acct;
 
 	acct = ns->bacct;
@@ -661,3 +655,16 @@ void acct_process(void)
 	do_acct_process(acct, ns, file);
 	fput(file);
 }
+
+/**
+ * acct_process - now just a wrapper around do_acct_process
+ *
+ * handles process accounting for an exiting task
+ */
+void acct_process(void)
+{
+	struct pid_namespace *ns;
+
+	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
+		acct_process_in_ns(ns);
+}
-- 
cgit v1.2.3


From 0c18d7a5df82524e634637c3aec24d4cba096442 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 25 Jul 2008 01:48:49 -0700
Subject: bsdacct: fix and add comments around acct_process()

Fix the one describing what this function is and add one more - about
locking absence around pid namespaces loop.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 0feba97e114e..dd68b9059418 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -657,7 +657,8 @@ static void acct_process_in_ns(struct pid_namespace *ns)
 }
 
 /**
- * acct_process - now just a wrapper around do_acct_process
+ * acct_process - now just a wrapper around acct_process_in_ns,
+ * which in turn is a wrapper around do_acct_process.
  *
  * handles process accounting for an exiting task
  */
@@ -665,6 +666,11 @@ void acct_process(void)
 {
 	struct pid_namespace *ns;
 
+	/*
+	 * This loop is safe lockless, since current is still
+	 * alive and holds its namespace, which in turn holds
+	 * its parent.
+	 */
 	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
 		acct_process_in_ns(ns);
 }
-- 
cgit v1.2.3


From 297c5d92634c809cef23d73e7b2556f2528ff7e2 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Fri, 25 Jul 2008 01:48:49 -0700
Subject: task IO accounting: provide distinct tgid/tid I/O statistics

Report per-thread I/O statistics in /proc/pid/task/tid/io and aggregate
parent I/O statistics in /proc/pid/io.  This approach follows the same
model used to account per-process and per-thread CPU times.

As a practial application, this allows for example to quickly find the top
I/O consumer when a process spawns many child threads that perform the
actual I/O work, because the aggregated I/O statistics can always be found
in /proc/pid/io.

[ Oleg Nesterov points out that we should check that the task is still
  alive before we iterate over the threads, but also says that we can do
  that fixup on top of this later.  - Linus ]

Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Cc: Matt Heaton <matt@hostmonster.com>
Cc: Shailabh Nagar <nagar@watson.ibm.com>
Acked-by-with-comments: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 27 +++++++++++++++++++++++++++
 kernel/fork.c |  6 ++++++
 2 files changed, 33 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 8a4d4d12e294..ad933bb29ec7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -120,6 +120,18 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->nivcsw += tsk->nivcsw;
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
+#ifdef CONFIG_TASK_XACCT
+		sig->rchar += tsk->rchar;
+		sig->wchar += tsk->wchar;
+		sig->syscr += tsk->syscr;
+		sig->syscw += tsk->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+		sig->ioac.read_bytes += tsk->ioac.read_bytes;
+		sig->ioac.write_bytes += tsk->ioac.write_bytes;
+		sig->ioac.cancelled_write_bytes +=
+					tsk->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
@@ -1366,6 +1378,21 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		psig->coublock +=
 			task_io_get_oublock(p) +
 			sig->oublock + sig->coublock;
+#ifdef CONFIG_TASK_XACCT
+		psig->rchar += p->rchar + sig->rchar;
+		psig->wchar += p->wchar + sig->wchar;
+		psig->syscr += p->syscr + sig->syscr;
+		psig->syscw += p->syscw + sig->syscw;
+#endif /* CONFIG_TASK_XACCT */
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+		psig->ioac.read_bytes +=
+			p->ioac.read_bytes + sig->ioac.read_bytes;
+		psig->ioac.write_bytes +=
+			p->ioac.write_bytes + sig->ioac.write_bytes;
+		psig->ioac.cancelled_write_bytes +=
+				p->ioac.cancelled_write_bytes +
+				sig->ioac.cancelled_write_bytes;
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
 		spin_unlock_irq(&p->parent->sighand->siglock);
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 813d5c89b9d5..b99d73e971a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -812,6 +812,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+#ifdef CONFIG_TASK_XACCT
+	sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
+#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+	memset(&sig->ioac, 0, sizeof(sig->ioac));
+#endif
 	sig->sum_sched_runtime = 0;
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
-- 
cgit v1.2.3


From 3e85ba034deec351f02cb55ff225bbd616463841 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 25 Jul 2008 01:48:50 -0700
Subject: tsacct: fix bacct_add_tsk()'s use of do_div()

Fix bacct_add_tsk()'s use of do_div() on an s64 by making ac_etime a u64
instead and dividing that.

Possibly this should be guarded lest the interval calculation turn up
negative, but the possible negativity of the result of the division is
cast away, and it shouldn't end up negative anyway.

This was introduced by patch f3cef7a99469afc159fec3a61b42dc7ca5b6824f.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/tsacct.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 1da6990af8e0..3da47ccdc5e5 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -28,14 +28,14 @@
 void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 {
 	struct timespec uptime, ts;
-	s64 ac_etime;
+	u64 ac_etime;
 
 	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
 
 	/* calculate task elapsed time in timespec */
 	do_posix_clock_monotonic_gettime(&uptime);
 	ts = timespec_sub(uptime, tsk->start_time);
-	/* rebase elapsed time to usec */
+	/* rebase elapsed time to usec (should never be negative) */
 	ac_etime = timespec_to_ns(&ts);
 	do_div(ac_etime, NSEC_PER_USEC);
 	stats->ac_etime = ac_etime;
-- 
cgit v1.2.3


From 873b47717732c2f33a4b14de02571a4295a02f0c Mon Sep 17 00:00:00 2001
From: Keika Kobayashi <kobayashi.kk@ncos.nec.co.jp>
Date: Fri, 25 Jul 2008 01:48:52 -0700
Subject: per-task-delay-accounting: add memory reclaim delay

Sometimes, application responses become bad under heavy memory load.
Applications take a bit time to reclaim memory.  The statistics, how long
memory reclaim takes, will be useful to measure memory usage.

This patch adds accounting memory reclaim to per-task-delay-accounting for
accounting the time of do_try_to_free_pages().

<i.e>

- When System is under low memory load,
  memory reclaim may not occur.

$ free
             total       used       free     shared    buffers     cached
Mem:       8197800    1577300    6620500          0       4808    1516724
-/+ buffers/cache:      55768    8142032
Swap:     16386292          0   16386292

$ vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 0  0      0 5069748  10612 3014060    0    0     0     0    3   26  0  0 100  0
 0  0      0 5069748  10612 3014060    0    0     0     0    4   22  0  0 100  0
 0  0      0 5069748  10612 3014060    0    0     0     0    3   18  0  0 100  0

Measure the time of tar command.

$ ls -s test.dat
1501472 test.dat

$ time tar cvf test.tar test.dat
real    0m13.388s
user    0m0.116s
sys     0m5.304s

$ ./delayget -d -p <pid>
CPU             count     real total  virtual total    delay total
                  428     5528345500     5477116080       62749891
IO              count    delay total
                  338     8078977189
SWAP            count    delay total
                    0              0
RECLAIM         count    delay total
                    0              0

- When system is under heavy memory load
  memory reclaim may occur.

$ vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 0  0 7159032  49724   1812   3012    0    0     0     0    3   24  0  0 100  0
 0  0 7159032  49724   1812   3012    0    0     0     0    4   24  0  0 100  0
 0  0 7159032  49848   1812   3012    0    0     0     0    3   22  0  0 100  0

In this case, one process uses more 8G memory
by execution of malloc() and memset().

$ time tar cvf test.tar test.dat
real    1m38.563s        <-  increased by 85 sec
user    0m0.140s
sys     0m7.060s

$ ./delayget -d -p <pid>
CPU             count     real total  virtual total    delay total
                 9021     7140446250     7315277975      923201824
IO              count    delay total
                 8965    90466349669
SWAP            count    delay total
                    3       21036367
RECLAIM         count    delay total
                  740    61011951153

In the later case, the value of RECLAIM is increasing.
So, taskstats can show how much memory reclaim influences TAT.

Signed-off-by: Keika Kobayashi <kobayashi.kk@ncos.nec.co.jp>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujistu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/delayacct.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b721..84b6782a2ce4 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -165,3 +165,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
 	return ret;
 }
 
+void __delayacct_freepages_start(void)
+{
+	delayacct_start(&current->delays->freepages_start);
+}
+
+void __delayacct_freepages_end(void)
+{
+	delayacct_end(&current->delays->freepages_start,
+			&current->delays->freepages_end,
+			&current->delays->freepages_delay,
+			&current->delays->freepages_count);
+}
+
-- 
cgit v1.2.3


From 016ae219b920c4e606088761d3d6070cdf8ba706 Mon Sep 17 00:00:00 2001
From: Keika Kobayashi <kobayashi.kk@ncos.nec.co.jp>
Date: Fri, 25 Jul 2008 01:48:53 -0700
Subject: per-task-delay-accounting: update taskstats for memory reclaim delay

Add members for memory reclaim delay to taskstats, and accumulate them in
__delayacct_add_tsk() .

Signed-off-by: Keika Kobayashi <kobayashi.kk@ncos.nec.co.jp>
Cc: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/delayacct.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 84b6782a2ce4..b3179dad71be 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
 	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
 	d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+	tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
+	d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
+	d->freepages_count += tsk->delays->freepages_count;
 	spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 done:
-- 
cgit v1.2.3


From b81f3ea92ba1fa676775677679889dc2a7f03c8b Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 25 Jul 2008 01:48:55 -0700
Subject: taskstats: remove initialization of static per-cpu variable

Cc: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/taskstats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 06b17547f4e7..bd6be76303cf 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -35,7 +35,7 @@
  */
 #define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)
 
-static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static DEFINE_PER_CPU(__u32, taskstats_seqnum);
 static int family_registered;
 struct kmem_cache *taskstats_cache;
 
-- 
cgit v1.2.3


From 9b81361631bbb1d85c99ddec677d42afe516737b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 25 Jul 2008 13:02:37 +0200
Subject: signalfd: fix undefined reference to `compat_sys_signalfd4' when
 !CONFIG_SIGNALFD

fix:

arch/x86/ia32/built-in.o: In function `ia32_sys_call_table':
(.rodata+0xa38): undefined reference to `compat_sys_signalfd4'

on !CONFIG_SIGNALFD.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys_ni.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 55eca1594da9..08d6e1bb99ac 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -160,6 +160,7 @@ cond_syscall(sys_ioprio_get);
 cond_syscall(sys_signalfd);
 cond_syscall(sys_signalfd4);
 cond_syscall(compat_sys_signalfd);
+cond_syscall(compat_sys_signalfd4);
 cond_syscall(sys_timerfd_create);
 cond_syscall(sys_timerfd_settime);
 cond_syscall(sys_timerfd_gettime);
-- 
cgit v1.2.3


From 1fe371044b21b226b96a9dd959e971b50b28c78e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 26 Jul 2008 15:09:47 +0200
Subject: ftrace: fix modular build

fix:

 ERROR: "start_critical_timings" [drivers/acpi/processor.ko] undefined!
 ERROR: "stop_critical_timings" [drivers/acpi/processor.ko] undefined!

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_irqsoff.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b1e4a89b08eb..ece6cfb649fa 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -253,12 +253,14 @@ void start_critical_timings(void)
 	if (preempt_trace() || irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
+EXPORT_SYMBOL_GPL(start_critical_timings);
 
 void stop_critical_timings(void)
 {
 	if (preempt_trace() || irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
+EXPORT_SYMBOL_GPL(stop_critical_timings);
 
 #ifdef CONFIG_IRQSOFF_TRACER
 #ifdef CONFIG_PROVE_LOCKING
-- 
cgit v1.2.3


From b8d317d10cca76cabe6b03ebfeb23cc99118b731 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Thu, 24 Jul 2008 18:21:29 -0700
Subject: cpumask: make cpumask_of_cpu_map generic

If an arch doesn't define cpumask_of_cpu_map, create a generic
statically-initialized one for them.  This allows removal of the buggy
cpumask_of_cpu() macro (&cpumask_of_cpu() gives address of
out-of-scope var).

An arch with NR_CPUS of 4096 probably wants to allocate this itself
based on the actual number of CPUs, since otherwise they're using 2MB
of rodata (1024 cpus means 128k).  That's what
CONFIG_HAVE_CPUMASK_OF_CPU_MAP is for (only x86/64 does so at the
moment).

In future as we support more CPUs, we'll need to resort to a
get_cpu_map()/put_cpu_map() allocation scheme.

Signed-off-by: Mike Travis <travis@sgi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 10ba5f1004a5..fe31ff3d3809 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -461,3 +461,112 @@ out:
 #endif /* CONFIG_PM_SLEEP_SMP */
 
 #endif /* CONFIG_SMP */
+
+#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+/* 64 bits of zeros, for initializers. */
+#if BITS_PER_LONG == 32
+#define Z64 0, 0
+#else
+#define Z64 0
+#endif
+
+/* Initializer macros. */
+#define CMI0(n) { .bits = { 1UL << (n) } }
+#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } }
+
+#define CMI8(n, ...)						\
+	CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__),		\
+	CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__),	\
+	CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__),	\
+	CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__)
+
+#if BITS_PER_LONG == 32
+#define CMI64(n, ...)							\
+	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
+	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
+	CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__),	\
+	CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__)
+#else
+#define CMI64(n, ...)							\
+	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
+	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
+	CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__),	\
+	CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__)
+#endif
+
+#define CMI256(n, ...)							\
+	CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__),	\
+	CMI64((n)+128, Z64, Z64, __VA_ARGS__),				\
+	CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__)
+#define Z256 Z64, Z64, Z64, Z64
+
+#define CMI1024(n, ...)					\
+	CMI256((n), __VA_ARGS__),			\
+	CMI256((n)+256, Z256, __VA_ARGS__),		\
+	CMI256((n)+512, Z256, Z256, __VA_ARGS__),	\
+	CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__)
+#define Z1024 Z256, Z256, Z256, Z256
+
+/* We want this statically initialized, just to be safe.  We try not
+ * to waste too much space, either. */
+static const cpumask_t cpumask_map[] = {
+	CMI0(0), CMI0(1), CMI0(2), CMI0(3),
+#if NR_CPUS > 4
+	CMI0(4), CMI0(5), CMI0(6), CMI0(7),
+#endif
+#if NR_CPUS > 8
+	CMI0(8), CMI0(9), CMI0(10), CMI0(11),
+	CMI0(12), CMI0(13), CMI0(14), CMI0(15),
+#endif
+#if NR_CPUS > 16
+	CMI0(16), CMI0(17), CMI0(18), CMI0(19),
+	CMI0(20), CMI0(21), CMI0(22), CMI0(23),
+	CMI0(24), CMI0(25), CMI0(26), CMI0(27),
+	CMI0(28), CMI0(29), CMI0(30), CMI0(31),
+#endif
+#if NR_CPUS > 32
+#if BITS_PER_LONG == 32
+	CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0),
+	CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0),
+	CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0),
+	CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0),
+	CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0),
+	CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0),
+	CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0),
+	CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0),
+#else
+	CMI0(32), CMI0(33), CMI0(34), CMI0(35),
+	CMI0(36), CMI0(37), CMI0(38), CMI0(39),
+	CMI0(40), CMI0(41), CMI0(42), CMI0(43),
+	CMI0(44), CMI0(45), CMI0(46), CMI0(47),
+	CMI0(48), CMI0(49), CMI0(50), CMI0(51),
+	CMI0(52), CMI0(53), CMI0(54), CMI0(55),
+	CMI0(56), CMI0(57), CMI0(58), CMI0(59),
+	CMI0(60), CMI0(61), CMI0(62), CMI0(63),
+#endif /* BITS_PER_LONG == 64 */
+#endif
+#if NR_CPUS > 64
+	CMI64(64, Z64),
+#endif
+#if NR_CPUS > 128
+	CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64),
+#endif
+#if NR_CPUS > 256
+	CMI256(256, Z256),
+#endif
+#if NR_CPUS > 512
+	CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256),
+#endif
+#if NR_CPUS > 1024
+	CMI1024(1024, Z1024),
+#endif
+#if NR_CPUS > 2048
+	CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024),
+#endif
+#if NR_CPUS > 4096
+#error NR_CPUS too big.  Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+#endif
+};
+
+const cpumask_t *cpumask_of_cpu_map = cpumask_map;
+#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */
-- 
cgit v1.2.3


From 6524d938b3360504b43a1278b5a8403e85383d1a Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Thu, 24 Jul 2008 18:21:30 -0700
Subject: cpumask: put cpumask_of_cpu_map in the initdata section

  * Create the cpumask_of_cpu_map statically in the init data section
    using NR_CPUS but replace it during boot up with one sized by
    nr_cpu_ids (num possible cpus).

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe31ff3d3809..9d4e1c28c053 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,7 +462,6 @@ out:
 
 #endif /* CONFIG_SMP */
 
-#ifndef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
 /* 64 bits of zeros, for initializers. */
 #if BITS_PER_LONG == 32
 #define Z64 0, 0
@@ -509,7 +508,11 @@ out:
 
 /* We want this statically initialized, just to be safe.  We try not
  * to waste too much space, either. */
-static const cpumask_t cpumask_map[] = {
+static const cpumask_t cpumask_map[]
+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+__initdata
+#endif
+= {
 	CMI0(0), CMI0(1), CMI0(2), CMI0(3),
 #if NR_CPUS > 4
 	CMI0(4), CMI0(5), CMI0(6), CMI0(7),
@@ -569,4 +572,3 @@ static const cpumask_t cpumask_map[] = {
 };
 
 const cpumask_t *cpumask_of_cpu_map = cpumask_map;
-#endif /* !CONFIG_HAVE_CPUMASK_OF_CPU_MAP */
-- 
cgit v1.2.3


From 0bc3cc03fa6e1c20aecb5a33356bcaae410640b9 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Thu, 24 Jul 2008 18:21:31 -0700
Subject: cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu

  * Replace previous instances of the cpumask_of_cpu_ptr* macros
    with a the new (lvalue capable) generic cpumask_of_cpu().

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/stop_machine.c        | 3 +--
 kernel/time/tick-common.c    | 8 +++-----
 kernel/trace/trace_sysprof.c | 4 +---
 3 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 738b411ff2d3..ba9b2054ecbd 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -33,9 +33,8 @@ static int stopmachine(void *cpu)
 {
 	int irqs_disabled = 0;
 	int prepared = 0;
-	cpumask_of_cpu_ptr(cpumask, (int)(long)cpu);
 
-	set_cpus_allowed_ptr(current, cpumask);
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
 
 	/* Ack: we are alive */
 	smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bf43284d6855..80c4336f4188 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -196,12 +196,10 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	struct tick_device *td;
 	int cpu, ret = NOTIFY_OK;
 	unsigned long flags;
-	cpumask_of_cpu_ptr_declare(cpumask);
 
 	spin_lock_irqsave(&tick_device_lock, flags);
 
 	cpu = smp_processor_id();
-	cpumask_of_cpu_ptr_next(cpumask, cpu);
 	if (!cpu_isset(cpu, newdev->cpumask))
 		goto out_bc;
 
@@ -209,7 +207,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	curdev = td->evtdev;
 
 	/* cpu local device ? */
-	if (!cpus_equal(newdev->cpumask, *cpumask)) {
+	if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
 
 		/*
 		 * If the cpu affinity of the device interrupt can not
@@ -222,7 +220,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		 * If we have a cpu local device already, do not replace it
 		 * by a non cpu local device
 		 */
-		if (curdev && cpus_equal(curdev->cpumask, *cpumask))
+		if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
 			goto out_bc;
 	}
 
@@ -254,7 +252,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 		curdev = NULL;
 	}
 	clockevents_exchange_device(curdev, newdev);
-	tick_setup_device(td, newdev, cpu, cpumask);
+	tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
 	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
 		tick_oneshot_notify();
 
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index ce2d723c10e1..bb948e52ce20 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -213,9 +213,7 @@ static void start_stack_timers(void)
 	int cpu;
 
 	for_each_online_cpu(cpu) {
-		cpumask_of_cpu_ptr(new_mask, cpu);
-
-		set_cpus_allowed_ptr(current, new_mask);
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
 		start_stack_timer(cpu);
 	}
 	set_cpus_allowed_ptr(current, &saved_mask);
-- 
cgit v1.2.3


From 5a7a201c51c324876d00a54e7208af6af12d1ca4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 26 Jul 2008 16:50:47 +0200
Subject: cpumask: export cpumask_of_cpu_map

fix:

 ERROR: "cpumask_of_cpu_map" [drivers/acpi/processor.ko] undefined!
 ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/microcode.ko] undefined!
 ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/speedstep-ich.ko] undefined!
 ERROR: "cpumask_of_cpu_map" [arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.ko] undefined!

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9d4e1c28c053..a35d8995dc8c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -572,3 +572,5 @@ __initdata
 };
 
 const cpumask_t *cpumask_of_cpu_map = cpumask_map;
+
+EXPORT_SYMBOL_GPL(cpumask_of_cpu_map);
-- 
cgit v1.2.3


From a2e2e3577c3ef2b5dbb866e97e612aae4adfa32f Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Fri, 25 Jul 2008 19:44:38 -0700
Subject: pm selftest: rtc paranoia

Cope with a quirk of some RTCs (notably ACPI ones) which aren't guaranteed
to implement oneshot behavior when they woke the system from sleeep:
forcibly disable the alarm, just in case.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/main.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 95bff23ecdaa..0b7476f5d2a6 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -635,6 +635,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
 	}
 	if (status < 0)
 		printk(err_suspend, status);
+
+	/* Some platforms can't detect that the alarm triggered the
+	 * wakeup, or (accordingly) disable it after it afterwards.
+	 * It's supposed to give oneshot behavior; cope.
+	 */
+	alm.enabled = false;
+	rtc_set_alarm(rtc, &alm);
 }
 
 static int __init has_wakealarm(struct device *dev, void *name_ptr)
-- 
cgit v1.2.3


From 7fccf0326536c1b245b98740d489abb9aab69a12 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Fri, 25 Jul 2008 19:45:02 -0700
Subject: kernel/kexec.c: make 'kimage_terminate' void

Since kimage_terminate() always returns 0, make it void.

Signed-off-by: WANG Cong <wangcong@zeuux.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1c5fcacbcf33..6db42ff8d520 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -589,14 +589,12 @@ static void kimage_free_extra_pages(struct kimage *image)
 	kimage_free_page_list(&image->unuseable_pages);
 
 }
-static int kimage_terminate(struct kimage *image)
+static void kimage_terminate(struct kimage *image)
 {
 	if (*image->entry != 0)
 		image->entry++;
 
 	*image->entry = IND_DONE;
-
-	return 0;
 }
 
 #define for_each_kimage_entry(image, ptr, entry) \
@@ -997,9 +995,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 			if (result)
 				goto out;
 		}
-		result = kimage_terminate(image);
-		if (result)
-			goto out;
+		kimage_terminate(image);
 	}
 	/* Install the new kernel, and  Uninstall the old */
 	image = xchg(dest_image, image);
-- 
cgit v1.2.3


From 3ab83521378268044a448113c6aa9a9e245f4d2f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 25 Jul 2008 19:45:07 -0700
Subject: kexec jump

This patch provides an enhancement to kexec/kdump.  It implements the
following features:

- Backup/restore memory used by the original kernel before/after
  kexec.

- Save/restore CPU state before/after kexec.

The features of this patch can be used as a general method to call program in
physical mode (paging turning off).  This can be used to call BIOS code under
Linux.

kexec-tools needs to be patched to support kexec jump. The patches and
the precompiled kexec can be download from the following URL:

       source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2
       patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2
       binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10

Usage example of calling some physical mode code and return:

1. Compile and install patched kernel with following options selected:

CONFIG_X86_32=y
CONFIG_KEXEC=y
CONFIG_PM=y
CONFIG_KEXEC_JUMP=y

2. Build patched kexec-tool or download the pre-built one.

3. Build some physical mode executable named such as "phy_mode"

4. Boot kernel compiled in step 1.

5. Load physical mode executable with /sbin/kexec. The shell command
   line can be as follow:

   /sbin/kexec --load-preserve-context --args-none phy_mode

6. Call physical mode executable with following shell command line:

   /sbin/kexec -e

Implementation point:

To support jumping without reserving memory.  One shadow backup page (source
page) is allocated for each page used by kexeced code image (destination
page).  When do kexec_load, the image of kexeced code is loaded into source
pages, and before executing, the destination pages and the source pages are
swapped, so the contents of destination pages are backupped.  Before jumping
to the kexeced code image and after jumping back to the original kernel, the
destination pages and the source pages are swapped too.

C ABI (calling convention) is used as communication protocol between
kernel and called code.

A flag named KEXEC_PRESERVE_CONTEXT for sys_kexec_load is added to
indicate that the loaded kernel image is used for jumping back.

Now, only the i386 architecture is supported.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c   | 31 ++++++++-----------------------
 2 files changed, 65 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6db42ff8d520..a0d920915b38 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -24,6 +24,8 @@
 #include <linux/utsrelease.h>
 #include <linux/utsname.h>
 #include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -242,6 +244,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 		goto out;
 	}
 
+	image->swap_page = kimage_alloc_control_pages(image, 0);
+	if (!image->swap_page) {
+		printk(KERN_ERR "Could not allocate swap buffer\n");
+		goto out;
+	}
+
 	result = 0;
  out:
 	if (result == 0)
@@ -986,6 +994,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 		if (result)
 			goto out;
 
+		if (flags & KEXEC_PRESERVE_CONTEXT)
+			image->preserve_context = 1;
 		result = machine_kexec_prepare(image);
 		if (result)
 			goto out;
@@ -1411,3 +1421,50 @@ static int __init crash_save_vmcoreinfo_init(void)
 }
 
 module_init(crash_save_vmcoreinfo_init)
+
+/**
+ *	kernel_kexec - reboot the system
+ *
+ *	Move into place and start executing a preloaded standalone
+ *	executable.  If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+	int error = 0;
+
+	if (xchg(&kexec_lock, 1))
+		return -EBUSY;
+	if (!kexec_image) {
+		error = -EINVAL;
+		goto Unlock;
+	}
+
+	if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+		local_irq_disable();
+		save_processor_state();
+#endif
+	} else {
+		blocking_notifier_call_chain(&reboot_notifier_list,
+					     SYS_RESTART, NULL);
+		system_state = SYSTEM_RESTART;
+		device_shutdown();
+		sysdev_shutdown();
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_shutdown();
+	}
+
+	machine_kexec(kexec_image);
+
+	if (kexec_image->preserve_context) {
+#ifdef CONFIG_KEXEC_JUMP
+		restore_processor_state();
+		local_irq_enable();
+#endif
+	}
+
+ Unlock:
+	xchg(&kexec_lock, 0);
+
+	return error;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 0c9d3fa1f5ff..c01858090a98 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
 
-/**
- *	kernel_kexec - reboot the system
- *
- *	Move into place and start executing a preloaded standalone
- *	executable.  If nothing was preloaded return an error.
- */
-static void kernel_kexec(void)
-{
-#ifdef CONFIG_KEXEC
-	struct kimage *image;
-	image = xchg(&kexec_image, NULL);
-	if (!image)
-		return;
-	kernel_restart_prepare(NULL);
-	printk(KERN_EMERG "Starting new kernel\n");
-	machine_shutdown();
-	machine_kexec(image);
-#endif
-}
-
 static void kernel_shutdown_prepare(enum system_states state)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		kernel_restart(buffer);
 		break;
 
+#ifdef CONFIG_KEXEC
 	case LINUX_REBOOT_CMD_KEXEC:
-		kernel_kexec();
-		unlock_kernel();
-		return -EINVAL;
+		{
+			int ret;
+			ret = kernel_kexec();
+			unlock_kernel();
+			return ret;
+		}
+#endif
 
 #ifdef CONFIG_HIBERNATION
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
-- 
cgit v1.2.3


From 89081d17f7bb81d89fa1aa9b70f821c5cf4d39e9 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 25 Jul 2008 19:45:10 -0700
Subject: kexec jump: save/restore device state

This patch implements devices state save/restore before after kexec.

This patch together with features in kexec_jump patch can be used for
following:

- A simple hibernation implementation without ACPI support.  You can kexec a
  hibernating kernel, save the memory image of original system and shutdown
  the system.  When resuming, you restore the memory image of original system
  via ordinary kexec load then jump back.

- Kernel/system debug through making system snapshot.  You can make system
  snapshot, jump back, do some thing and make another system snapshot.

- Cooperative multi-kernel/system.  With kexec jump, you can switch between
  several kernels/systems quickly without boot process except the first time.
  This appears like swap a whole kernel/system out/in.

- A general method to call program in physical mode (paging turning
  off). This can be used to invoke BIOS code under Linux.

The following user-space tools can be used with kexec jump:

- kexec-tools needs to be patched to support kexec jump. The patches
  and the precompiled kexec can be download from the following URL:
       source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2
       patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2
       binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10

- makedumpfile with patches are used as memory image saving tool, it
  can exclude free pages from original kernel memory image file. The
  patches and the precompiled makedumpfile can be download from the
  following URL:
       source: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-src_cvs_kh10.tar.bz2
       patches: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-patches_cvs_kh10.tar.bz2
       binary: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile_cvs_kh10

- An initramfs image can be used as the root file system of kexeced
  kernel. An initramfs image built with "BuildRoot" can be downloaded
  from the following URL:
       initramfs image: http://khibernation.sourceforge.net/download/release_v10/initramfs/rootfs_cvs_kh10.gz
  All user space tools above are included in the initramfs image.

Usage example of simple hibernation:

1. Compile and install patched kernel with following options selected:

CONFIG_X86_32=y
CONFIG_RELOCATABLE=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
CONFIG_PM=y
CONFIG_HIBERNATION=y
CONFIG_KEXEC_JUMP=y

2. Build an initramfs image contains kexec-tool and makedumpfile, or
   download the pre-built initramfs image, called rootfs.gz in
   following text.

3. Prepare a partition to save memory image of original kernel, called
   hibernating partition in following text.

4. Boot kernel compiled in step 1 (kernel A).

5. In the kernel A, load kernel compiled in step 1 (kernel B) with
   /sbin/kexec. The shell command line can be as follow:

   /sbin/kexec --load-preserve-context /boot/bzImage --mem-min=0x100000
     --mem-max=0xffffff --initrd=rootfs.gz

6. Boot the kernel B with following shell command line:

   /sbin/kexec -e

7. The kernel B will boot as normal kexec. In kernel B the memory
   image of kernel A can be saved into hibernating partition as
   follow:

   jump_back_entry=`cat /proc/cmdline | tr ' ' '\n' | grep kexec_jump_back_entry | cut -d '='`
   echo $jump_back_entry > kexec_jump_back_entry
   cp /proc/vmcore dump.elf

   Then you can shutdown the machine as normal.

8. Boot kernel compiled in step 1 (kernel C). Use the rootfs.gz as
   root file system.

9. In kernel C, load the memory image of kernel A as follow:

   /sbin/kexec -l --args-none --entry=`cat kexec_jump_back_entry` dump.elf

10. Jump back to the kernel A as follow:

   /sbin/kexec -e

   Then, kernel A is resumed.

Implementation point:

To support jumping between two kernels, before jumping to (executing)
the new kernel and jumping back to the original kernel, the devices
are put into quiescent state, and the state of devices and CPU is
saved. After jumping back from kexeced kernel and jumping to the new
kernel, the state of devices and CPU are restored accordingly. The
devices/CPU state save/restore code of software suspend is called to
implement corresponding function.

Known issues:

- Because the segment number supported by sys_kexec_load is limited,
  hibernation image with many segments may not be load. This is
  planned to be eliminated by adding a new flag to sys_kexec_load to
  make a image can be loaded with multiple sys_kexec_load invoking.

Now, only the i386 architecture is supported.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Nigel Cunningham <nigel@nigel.suspend2.net>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c       | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/power/power.h |  2 --
 2 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index a0d920915b38..c8a4370e2a34 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -26,6 +26,10 @@
 #include <linux/numa.h>
 #include <linux/suspend.h>
 #include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1441,7 +1445,31 @@ int kernel_kexec(void)
 
 	if (kexec_image->preserve_context) {
 #ifdef CONFIG_KEXEC_JUMP
+		mutex_lock(&pm_mutex);
+		pm_prepare_console();
+		error = freeze_processes();
+		if (error) {
+			error = -EBUSY;
+			goto Restore_console;
+		}
+		suspend_console();
+		error = device_suspend(PMSG_FREEZE);
+		if (error)
+			goto Resume_console;
+		error = disable_nonboot_cpus();
+		if (error)
+			goto Resume_devices;
 		local_irq_disable();
+		/* At this point, device_suspend() has been called,
+		 * but *not* device_power_down(). We *must*
+		 * device_power_down() now.  Otherwise, drivers for
+		 * some devices (e.g. interrupt controllers) become
+		 * desynchronized with the actual state of the
+		 * hardware at resume time, and evil weirdness ensues.
+		 */
+		error = device_power_down(PMSG_FREEZE);
+		if (error)
+			goto Enable_irqs;
 		save_processor_state();
 #endif
 	} else {
@@ -1459,7 +1487,18 @@ int kernel_kexec(void)
 	if (kexec_image->preserve_context) {
 #ifdef CONFIG_KEXEC_JUMP
 		restore_processor_state();
+		device_power_up(PMSG_RESTORE);
+ Enable_irqs:
 		local_irq_enable();
+		enable_nonboot_cpus();
+ Resume_devices:
+		device_resume(PMSG_RESTORE);
+ Resume_console:
+		resume_console();
+		thaw_processes();
+ Restore_console:
+		pm_restore_console();
+		mutex_unlock(&pm_mutex);
 #endif
 	}
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 700f44ec8406..acc0c101dbd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void);
 
 extern int pfn_is_nosave(unsigned long);
 
-extern struct mutex pm_mutex;
-
 #define power_attr(_name) \
 static struct kobj_attribute _name##_attr = {	\
 	.attr	= {				\
-- 
cgit v1.2.3


From 7babe8db99d305340cf4828ce1f5a1481d5622ef Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Fri, 25 Jul 2008 19:45:11 -0700
Subject: Full conversion to early_initcall() interface, remove old interface

A previous patch added the early_initcall(), to allow a cleaner hooking of
pre-SMP initcalls.  Now we remove the older interface, converting all
existing users to the new one.

[akpm@linux-foundation.org: cleanups]
[akpm@linux-foundation.org: build fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
[kosaki.motohiro@jp.fujitsu.com: warning fix]
Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c      |  5 ++++-
 kernel/smp.c        |  4 +++-
 kernel/softirq.c    |  3 ++-
 kernel/softlockup.c | 25 ++++++++++++++++++++++---
 4 files changed, 31 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0047bd9b96aa..fde1a1026359 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6389,7 +6389,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = 10
 };
 
-void __init migration_init(void)
+static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
@@ -6399,7 +6399,10 @@ void __init migration_init(void)
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
+
+	return err;
 }
+early_initcall(migration_init);
 #endif
 
 #ifdef CONFIG_SMP
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1ee..96fc7c0edc59 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
 	spinlock_t lock;
 };
 
-void __cpuinit init_call_single_data(void)
+static int __cpuinit init_call_single_data(void)
 {
 	int i;
 
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
 		spin_lock_init(&q->lock);
 		INIT_LIST_HEAD(&q->list);
 	}
+	return 0;
 }
+early_initcall(init_call_single_data);
 
 static void csd_flag_wait(struct call_single_data *data)
 {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f6b03d56c2bf..c506f266a6b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init int spawn_ksoftirqd(void)
+static __init int spawn_ksoftirqd(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+early_initcall(spawn_ksoftirqd);
 
 #ifdef CONFIG_SMP
 /*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7bd8d1aadd5d..b75b492fbfcf 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
-__init void spawn_softlockup_task(void)
+static int __initdata nosoftlockup;
+
+static int __init nosoftlockup_setup(char *str)
+{
+	nosoftlockup = 1;
+	return 1;
+}
+__setup("nosoftlockup", nosoftlockup_setup);
+
+static int __init spawn_softlockup_task(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
-	int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	int err;
 
-	BUG_ON(err == NOTIFY_BAD);
+	if (nosoftlockup)
+		return 0;
+
+	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	if (err == NOTIFY_BAD) {
+		BUG();
+		return 1;
+	}
 	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
 	register_cpu_notifier(&cpu_nfb);
 
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+	return 0;
 }
+early_initcall(spawn_softlockup_task);
-- 
cgit v1.2.3


From 20d8b67c06fa5e74f44e80b0a0fd68c8327f7c6a Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Fri, 25 Jul 2008 19:45:12 -0700
Subject: relay: add buffer-only channels; useful for early logging

Allows one to create and use a channel with no associated files.  Files
can be initialized later.  This is useful in scenarios such as logging in
early code, before VFS is up.  Therefore, such channels can be created and
used as soon as kmem_cache_init() completed.

This is needed by kmemtrace to do tracing in early kernel code.

[kosaki.motohiro@jp.fujitsu.com: build fix]
Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 141 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec43..04006ef970b8 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan)
 }
 EXPORT_SYMBOL_GPL(relay_reset);
 
+static inline void relay_set_buf_dentry(struct rchan_buf *buf,
+					struct dentry *dentry)
+{
+	buf->dentry = dentry;
+	buf->dentry->d_inode->i_size = buf->early_bytes;
+}
+
+static struct dentry *relay_create_buf_file(struct rchan *chan,
+					    struct rchan_buf *buf,
+					    unsigned int cpu)
+{
+	struct dentry *dentry;
+	char *tmpname;
+
+	tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!tmpname)
+		return NULL;
+	snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
+
+	/* Create file in fs */
+	dentry = chan->cb->create_buf_file(tmpname, chan->parent,
+					   S_IRUSR, buf,
+					   &chan->is_global);
+
+	kfree(tmpname);
+
+	return dentry;
+}
+
 /*
  *	relay_open_buf - create a new relay channel buffer
  *
@@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
 {
  	struct rchan_buf *buf = NULL;
 	struct dentry *dentry;
- 	char *tmpname;
 
  	if (chan->is_global)
 		return chan->buf[0];
 
-	tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
- 	if (!tmpname)
- 		goto end;
- 	snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
-
 	buf = relay_create_buf(chan);
 	if (!buf)
- 		goto free_name;
+		return NULL;
+
+	if (chan->has_base_filename) {
+		dentry = relay_create_buf_file(chan, buf, cpu);
+		if (!dentry)
+			goto free_buf;
+		relay_set_buf_dentry(buf, dentry);
+	}
 
  	buf->cpu = cpu;
  	__relay_reset(buf, 1);
 
-	/* Create file in fs */
- 	dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
- 					   buf, &chan->is_global);
- 	if (!dentry)
- 		goto free_buf;
-
-	buf->dentry = dentry;
-
  	if(chan->is_global) {
  		chan->buf[0] = buf;
  		buf->cpu = 0;
   	}
 
- 	goto free_name;
+	return buf;
 
 free_buf:
  	relay_destroy_buf(buf);
- 	buf = NULL;
-free_name:
- 	kfree(tmpname);
-end:
-	return buf;
+	return NULL;
 }
 
 /**
@@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
 
 /**
  *	relay_open - create a new relay channel
- *	@base_filename: base name of files to create
- *	@parent: dentry of parent directory, %NULL for root directory
+ *	@base_filename: base name of files to create, %NULL for buffering only
+ *	@parent: dentry of parent directory, %NULL for root directory or buffer
  *	@subbuf_size: size of sub-buffers
  *	@n_subbufs: number of sub-buffers
  *	@cb: client callback functions
@@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename,
 {
 	unsigned int i;
 	struct rchan *chan;
-	if (!base_filename)
-		return NULL;
 
 	if (!(subbuf_size && n_subbufs))
 		return NULL;
@@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename,
 	chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
 	chan->parent = parent;
 	chan->private_data = private_data;
-	strlcpy(chan->base_filename, base_filename, NAME_MAX);
+	if (base_filename) {
+		chan->has_base_filename = 1;
+		strlcpy(chan->base_filename, base_filename, NAME_MAX);
+	}
 	setup_callbacks(chan, cb);
 	kref_init(&chan->kref);
 
@@ -604,6 +623,94 @@ free_bufs:
 }
 EXPORT_SYMBOL_GPL(relay_open);
 
+struct rchan_percpu_buf_dispatcher {
+	struct rchan_buf *buf;
+	struct dentry *dentry;
+};
+
+/* Called in atomic context. */
+static void __relay_set_buf_dentry(void *info)
+{
+	struct rchan_percpu_buf_dispatcher *p = info;
+
+	relay_set_buf_dentry(p->buf, p->dentry);
+}
+
+/**
+ *	relay_late_setup_files - triggers file creation
+ *	@chan: channel to operate on
+ *	@base_filename: base name of files to create
+ *	@parent: dentry of parent directory, %NULL for root directory
+ *
+ *	Returns 0 if successful, non-zero otherwise.
+ *
+ *	Use to setup files for a previously buffer-only channel.
+ *	Useful to do early tracing in kernel, before VFS is up, for example.
+ */
+int relay_late_setup_files(struct rchan *chan,
+			   const char *base_filename,
+			   struct dentry *parent)
+{
+	int err = 0;
+	unsigned int i, curr_cpu;
+	unsigned long flags;
+	struct dentry *dentry;
+	struct rchan_percpu_buf_dispatcher disp;
+
+	if (!chan || !base_filename)
+		return -EINVAL;
+
+	strlcpy(chan->base_filename, base_filename, NAME_MAX);
+
+	mutex_lock(&relay_channels_mutex);
+	/* Is chan already set up? */
+	if (unlikely(chan->has_base_filename))
+		return -EEXIST;
+	chan->has_base_filename = 1;
+	chan->parent = parent;
+	curr_cpu = get_cpu();
+	/*
+	 * The CPU hotplug notifier ran before us and created buffers with
+	 * no files associated. So it's safe to call relay_setup_buf_file()
+	 * on all currently online CPUs.
+	 */
+	for_each_online_cpu(i) {
+		if (unlikely(!chan->buf[i])) {
+			printk(KERN_ERR "relay_late_setup_files: CPU %u "
+					"has no buffer, it must have!\n", i);
+			BUG();
+			err = -EINVAL;
+			break;
+		}
+
+		dentry = relay_create_buf_file(chan, chan->buf[i], i);
+		if (unlikely(!dentry)) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (curr_cpu == i) {
+			local_irq_save(flags);
+			relay_set_buf_dentry(chan->buf[i], dentry);
+			local_irq_restore(flags);
+		} else {
+			disp.buf = chan->buf[i];
+			disp.dentry = dentry;
+			smp_mb();
+			/* relay_channels_mutex must be held, so wait. */
+			err = smp_call_function_single(i,
+						       __relay_set_buf_dentry,
+						       &disp, 1);
+		}
+		if (unlikely(err))
+			break;
+	}
+	put_cpu();
+	mutex_unlock(&relay_channels_mutex);
+
+	return err;
+}
+
 /**
  *	relay_switch_subbuf - switch to a new sub-buffer
  *	@buf: channel buffer
@@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
 		buf->padding[old_subbuf] = buf->prev_padding;
 		buf->subbufs_produced++;
-		buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
-			buf->padding[old_subbuf];
+		if (buf->dentry)
+			buf->dentry->d_inode->i_size +=
+				buf->chan->subbuf_size -
+				buf->padding[old_subbuf];
+		else
+			buf->early_bytes += buf->chan->subbuf_size -
+					    buf->padding[old_subbuf];
 		smp_mb();
 		if (waitqueue_active(&buf->read_wait))
 			/*
@@ -1237,4 +1349,4 @@ static __init int relay_init(void)
 	return 0;
 }
 
-module_init(relay_init);
+early_initcall(relay_init);
-- 
cgit v1.2.3


From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 25 Jul 2008 19:45:34 -0700
Subject: SL*B: drop kmem cache argument from constructor

Kmem cache passed to constructor is only needed for constructors that are
themselves multiplexeres.  Nobody uses this "feature", nor does anybody uses
passed kmem cache in non-trivial way, so pass only pointer to object.

Non-trivial places are:
	arch/powerpc/mm/init_64.c
	arch/powerpc/mm/hugetlbpage.c

This is flag day, yes.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jon Tollefson <kniht@linux.vnet.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Matt Mackall <mpm@selenic.com>
[akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c]
[akpm@linux-foundation.org: fix mm/slab.c]
[akpm@linux-foundation.org: fix ubifs]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index b99d73e971a4..80e83e459b17 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1442,7 +1442,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN 0
 #endif
 
-static void sighand_ctor(struct kmem_cache *cachep, void *data)
+static void sighand_ctor(void *data)
 {
 	struct sighand_struct *sighand = data;
 
-- 
cgit v1.2.3


From b8c512f6190e313df69060bae4a161c5c044e272 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jul 2008 19:45:36 -0700
Subject: Use WARN() in kernel/irq/manage.c

Replace a printk+WARN_ON() by a WARN(); this increases the chance of the
string making it into the bugreport (ie: it goes inside the
---[ cut here ]--- section)

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/manage.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f8914b92b664..152abfd3589f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -177,8 +177,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
 {
 	switch (desc->depth) {
 	case 0:
-		printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
-		WARN_ON(1);
+		WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
 		break;
 	case 1: {
 		unsigned int status = desc->status & ~IRQ_DISABLED;
-- 
cgit v1.2.3


From 261c40c1191ad8d7a2e49fa2bb5f6a84e3d44b10 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 25 Jul 2008 19:45:37 -0700
Subject: use WARN() in kernel/irq/chip.c

Use WARN() instead of a printk+WARN_ON() pair; this way the message
becomes part of the warning section for better reporting/collection.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/chip.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 964964baefa2..3cd441ebf5d2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq)
 	unsigned long flags;
 
 	if (irq >= NR_IRQS) {
-		printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
-		WARN_ON(1);
+		WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
 		return;
 	}
 
@@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq)
 	unsigned long flags;
 
 	if (irq >= NR_IRQS) {
-		printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
-		WARN_ON(1);
+		WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
 		return;
 	}
 
@@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq)
 	spin_lock_irqsave(&desc->lock, flags);
 	if (desc->action) {
 		spin_unlock_irqrestore(&desc->lock, flags);
-		printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n",
+		WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
 			irq);
-		WARN_ON(1);
 		return;
 	}
 	desc->msi_desc = NULL;
@@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 	unsigned long flags;
 
 	if (irq >= NR_IRQS) {
-		printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
-		WARN_ON(1);
+		WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From ff1188646c6870f336e910fb894eeed74f50471f Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:45 -0700
Subject: tracehook: unexport ptrace_notify

The ptrace_notify() function should not be called by any modules.  It was
only ever exported to be called by binfmt exec functions.  But that is no
longer necessary since fs/exec.c deals with that generically now.  There
should be no calls to ptrace_notify() from outside the core kernel.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 82c3545596c5..8715c18b27b9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1895,7 +1895,6 @@ EXPORT_SYMBOL(recalc_sigpending);
 EXPORT_SYMBOL_GPL(dequeue_signal);
 EXPORT_SYMBOL(flush_signals);
 EXPORT_SYMBOL(force_sig);
-EXPORT_SYMBOL(ptrace_notify);
 EXPORT_SYMBOL(send_sig);
 EXPORT_SYMBOL(send_sig_info);
 EXPORT_SYMBOL(sigprocmask);
-- 
cgit v1.2.3


From 30199f5a46aee204bf437a4f5b0740f3efe448b7 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:46 -0700
Subject: tracehook: exit

This moves the PTRACE_EVENT_EXIT tracing into a tracehook.h inline,
tracehook_report_exec().  The change has no effect, just clean-up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ad933bb29ec7..c3691cbc220a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
 #include <linux/resource.h>
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/tracehook.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1029,10 +1030,7 @@ NORET_TYPE void do_exit(long code)
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
 
-	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
-		current->ptrace_message = code;
-		ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
-	}
+	tracehook_report_exit(&code);
 
 	/*
 	 * We're taking recursive faults here in do_exit. Safest is to just
-- 
cgit v1.2.3


From 09a05394fe2448a4139b014936330af23fa7ec83 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:47 -0700
Subject: tracehook: clone

This moves all the ptrace initialization and tracing logic for task
creation into tracehook.h and ptrace.h inlines.  It reorganizes the code
slightly, but should not change any behavior.

There are four tracehook entry points, at each important stage of task
creation.  This keeps the interface from the core fork.c code fairly
clean, while supporting the complex setup required for ptrace or something
like it.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 69 ++++++++++++++++++++++++-----------------------------------
 1 file changed, 28 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 80e83e459b17..b42f8ed23611 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -37,6 +37,7 @@
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
+#include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
@@ -865,8 +866,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 
 	new_flags &= ~PF_SUPERPRIV;
 	new_flags |= PF_FORKNOEXEC;
-	if (!(clone_flags & CLONE_PTRACE))
-		p->ptrace = 0;
+	new_flags |= PF_STARTING;
 	p->flags = new_flags;
 	clear_freeze_flag(p);
 }
@@ -907,7 +907,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 					struct pt_regs *regs,
 					unsigned long stack_size,
 					int __user *child_tidptr,
-					struct pid *pid)
+					struct pid *pid,
+					int trace)
 {
 	int retval;
 	struct task_struct *p;
@@ -1163,8 +1164,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
-	INIT_LIST_HEAD(&p->ptrace_entry);
-	INIT_LIST_HEAD(&p->ptraced);
 
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
@@ -1195,7 +1194,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		p->real_parent = current->real_parent;
 	else
 		p->real_parent = current;
-	p->parent = p->real_parent;
 
 	spin_lock(&current->sighand->siglock);
 
@@ -1237,8 +1235,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	if (likely(p->pid)) {
 		list_add_tail(&p->sibling, &p->real_parent->children);
-		if (unlikely(p->ptrace & PT_PTRACED))
-			__ptrace_link(p, current->parent);
+		tracehook_finish_clone(p, clone_flags, trace);
 
 		if (thread_group_leader(p)) {
 			if (clone_flags & CLONE_NEWPID)
@@ -1323,29 +1320,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
 	struct pt_regs regs;
 
 	task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
-				&init_struct_pid);
+			    &init_struct_pid, 0);
 	if (!IS_ERR(task))
 		init_idle(task, cpu);
 
 	return task;
 }
 
-static int fork_traceflag(unsigned clone_flags)
-{
-	if (clone_flags & CLONE_UNTRACED)
-		return 0;
-	else if (clone_flags & CLONE_VFORK) {
-		if (current->ptrace & PT_TRACE_VFORK)
-			return PTRACE_EVENT_VFORK;
-	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
-		if (current->ptrace & PT_TRACE_CLONE)
-			return PTRACE_EVENT_CLONE;
-	} else if (current->ptrace & PT_TRACE_FORK)
-		return PTRACE_EVENT_FORK;
-
-	return 0;
-}
-
 /*
  *  Ok, this is the main fork-routine.
  *
@@ -1380,14 +1361,14 @@ long do_fork(unsigned long clone_flags,
 		}
 	}
 
-	if (unlikely(current->ptrace)) {
-		trace = fork_traceflag (clone_flags);
-		if (trace)
-			clone_flags |= CLONE_PTRACE;
-	}
+	/*
+	 * When called from kernel_thread, don't do user tracing stuff.
+	 */
+	if (likely(user_mode(regs)))
+		trace = tracehook_prepare_clone(clone_flags);
 
 	p = copy_process(clone_flags, stack_start, regs, stack_size,
-			child_tidptr, NULL);
+			 child_tidptr, NULL, trace);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1405,24 +1386,30 @@ long do_fork(unsigned long clone_flags,
 			init_completion(&vfork);
 		}
 
-		if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
+		tracehook_report_clone(trace, regs, clone_flags, nr, p);
+
+		/*
+		 * We set PF_STARTING at creation in case tracing wants to
+		 * use this to distinguish a fully live task from one that
+		 * hasn't gotten to tracehook_report_clone() yet.  Now we
+		 * clear it and set the child going.
+		 */
+		p->flags &= ~PF_STARTING;
+
+		if (unlikely(clone_flags & CLONE_STOPPED)) {
 			/*
 			 * We'll start up with an immediate SIGSTOP.
 			 */
 			sigaddset(&p->pending.signal, SIGSTOP);
 			set_tsk_thread_flag(p, TIF_SIGPENDING);
-		}
-
-		if (!(clone_flags & CLONE_STOPPED))
-			wake_up_new_task(p, clone_flags);
-		else
 			__set_task_state(p, TASK_STOPPED);
-
-		if (unlikely (trace)) {
-			current->ptrace_message = nr;
-			ptrace_notify ((trace << 8) | SIGTRAP);
+		} else {
+			wake_up_new_task(p, clone_flags);
 		}
 
+		tracehook_report_clone_complete(trace, regs,
+						clone_flags, nr, p);
+
 		if (clone_flags & CLONE_VFORK) {
 			freezer_do_not_count();
 			wait_for_completion(&vfork);
-- 
cgit v1.2.3


From daded34be96b1975ff8539ff62ad8b158ce7d842 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:47 -0700
Subject: tracehook: vfork-done

This moves the PTRACE_EVENT_VFORK_DONE tracing into a tracehook.h inline,
tracehook_report_vfork_done().  The change has no effect, just clean-up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index b42f8ed23611..abb3ed6298f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1414,10 +1414,7 @@ long do_fork(unsigned long clone_flags,
 			freezer_do_not_count();
 			wait_for_completion(&vfork);
 			freezer_count();
-			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
-				current->ptrace_message = nr;
-				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
-			}
+			tracehook_report_vfork_done(p, nr);
 		}
 	} else {
 		nr = PTR_ERR(p);
-- 
cgit v1.2.3


From dae33574dcf5211e1f43c7e45fa29f73ba3e00cb Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:48 -0700
Subject: tracehook: release_task

This moves the ptrace-related logic from release_task into tracehook.h and
ptrace.h inlines.  It provides clean hooks both before and after locking
tasklist_lock, for future tracing logic to do more cleanup without the
lock.

This also changes release_task() itself in the rare "zap_leader" case to
set the leader to EXIT_DEAD before iterating.  This maintains the
invariant that release_task() only ever handles a task in EXIT_DEAD.  This
is a common-sense invariant that is already always true except in this one
arcane case of zombie leader whose parent ignores SIGCHLD.

This change is harmless and only costs one store in this one rare case.
It keeps the expected state more consisently sane, which is nicer when
debugging weirdness in release_task().  It also lets some future code in
the tracehook entry points rely on this invariant for bookkeeping.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index c3691cbc220a..da28745f7c38 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -163,27 +163,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
 
-/*
- * Do final ptrace-related cleanup of a zombie being reaped.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-static void ptrace_release_task(struct task_struct *p)
-{
-	BUG_ON(!list_empty(&p->ptraced));
-	ptrace_unlink(p);
-	BUG_ON(!list_empty(&p->ptrace_entry));
-}
 
 void release_task(struct task_struct * p)
 {
 	struct task_struct *leader;
 	int zap_leader;
 repeat:
+	tracehook_prepare_release_task(p);
 	atomic_dec(&p->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
-	ptrace_release_task(p);
+	tracehook_finish_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -205,6 +195,13 @@ repeat:
 		 * that case.
 		 */
 		zap_leader = task_detached(leader);
+
+		/*
+		 * This maintains the invariant that release_task()
+		 * only runs on a task in EXIT_DEAD, just for sanity.
+		 */
+		if (zap_leader)
+			leader->exit_state = EXIT_DEAD;
 	}
 
 	write_unlock_irq(&tasklist_lock);
-- 
cgit v1.2.3


From 35de254dc60f91004b3b5ebb1fc7b2c3093d6032 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:51 -0700
Subject: tracehook: tracehook_consider_ignored_signal

This defines tracehook_consider_ignored_signal() has a fine-grained hook
for deciding to prevent the normal short-circuit of sending an ignored
signal, as ptrace does.  There is no change, only cleanup.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8715c18b27b9..9efd1cee6d0b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/signalfd.h>
+#include <linux/tracehook.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
@@ -39,24 +40,21 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
-static int __sig_ignored(struct task_struct *t, int sig)
+static void __user *sig_handler(struct task_struct *t, int sig)
 {
-	void __user *handler;
+	return t->sighand->action[sig - 1].sa.sa_handler;
+}
 
+static int sig_handler_ignored(void __user *handler, int sig)
+{
 	/* Is it explicitly or implicitly ignored? */
-
-	handler = t->sighand->action[sig - 1].sa.sa_handler;
 	return handler == SIG_IGN ||
 		(handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
 static int sig_ignored(struct task_struct *t, int sig)
 {
-	/*
-	 * Tracers always want to know about signals..
-	 */
-	if (t->ptrace & PT_PTRACED)
-		return 0;
+	void __user *handler;
 
 	/*
 	 * Blocked signals are never ignored, since the
@@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	return __sig_ignored(t, sig);
+	handler = sig_handler(t, sig);
+	if (!sig_handler_ignored(handler, sig))
+		return 0;
+
+	/*
+	 * Tracers may want to know about even ignored signals.
+	 */
+	return !tracehook_consider_ignored_signal(t, sig, handler);
 }
 
 /*
@@ -2298,7 +2303,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 		 *   (for example, SIGCHLD), shall cause the pending signal to
 		 *   be discarded, whether or not it is blocked"
 		 */
-		if (__sig_ignored(t, sig)) {
+		if (sig_handler_ignored(sig_handler(t, sig), sig)) {
 			sigemptyset(&mask);
 			sigaddset(&mask, sig);
 			rm_from_queue_full(&mask, &t->signal->shared_pending);
-- 
cgit v1.2.3


From 445a91d2fe3667fb8fc251433645f686933cf56a Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:52 -0700
Subject: tracehook: tracehook_consider_fatal_signal

This defines tracehook_consider_fatal_signal() has a fine-grained hook for
deciding to skip the special cases for a fatal signal, as ptrace does.
There is no change, only cleanup.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 9efd1cee6d0b..1a942ce32ba0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -300,12 +300,12 @@ flush_signal_handlers(struct task_struct *t, int force_default)
 
 int unhandled_signal(struct task_struct *tsk, int sig)
 {
+	void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
 	if (is_global_init(tsk))
 		return 1;
-	if (tsk->ptrace & PT_PTRACED)
+	if (handler != SIG_IGN && handler != SIG_DFL)
 		return 0;
-	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
-		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
+	return !tracehook_consider_fatal_signal(tsk, sig, handler);
 }
 
 
@@ -761,7 +761,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	if (sig_fatal(p, sig) &&
 	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
 	    !sigismember(&t->real_blocked, sig) &&
-	    (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) {
+	    (sig == SIGKILL ||
+	     !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
-- 
cgit v1.2.3


From 7bcf6a2ca5f639b038c48711ebe6c4eca2036641 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:53 -0700
Subject: tracehook: get_signal_to_deliver

This defines the tracehook_get_signal() hook to allow tracing code to slip
in before normal signal dequeuing.  This lays the groundwork for new
tracing features that can inject synthetic signals outside the normal
queue or control the disposition of delivered signals.  The calling
convention lets tracehook_get_signal() decide both exactly what will
happen and what signal number to report in the handler/exit.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 1a942ce32ba0..10b31ecdd9c8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1754,17 +1754,33 @@ relock:
 		    do_signal_stop(0))
 			goto relock;
 
-		signr = dequeue_signal(current, &current->blocked, info);
-		if (!signr)
-			break; /* will return 0 */
+		/*
+		 * Tracing can induce an artifical signal and choose sigaction.
+		 * The return value in @signr determines the default action,
+		 * but @info->si_signo is the signal number we will report.
+		 */
+		signr = tracehook_get_signal(current, regs, info, return_ka);
+		if (unlikely(signr < 0))
+			goto relock;
+		if (unlikely(signr != 0))
+			ka = return_ka;
+		else {
+			signr = dequeue_signal(current, &current->blocked,
+					       info);
 
-		if (signr != SIGKILL) {
-			signr = ptrace_signal(signr, info, regs, cookie);
 			if (!signr)
-				continue;
+				break; /* will return 0 */
+
+			if (signr != SIGKILL) {
+				signr = ptrace_signal(signr, info,
+						      regs, cookie);
+				if (!signr)
+					continue;
+			}
+
+			ka = &sighand->action[signr-1];
 		}
 
-		ka = &sighand->action[signr-1];
 		if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
 			continue;
 		if (ka->sa.sa_handler != SIG_DFL) {
@@ -1812,7 +1828,7 @@ relock:
 				spin_lock_irq(&sighand->siglock);
 			}
 
-			if (likely(do_signal_stop(signr))) {
+			if (likely(do_signal_stop(info->si_signo))) {
 				/* It released the siglock.  */
 				goto relock;
 			}
@@ -1833,7 +1849,7 @@ relock:
 
 		if (sig_kernel_coredump(signr)) {
 			if (print_fatal_signals)
-				print_fatal_signal(regs, signr);
+				print_fatal_signal(regs, info->si_signo);
 			/*
 			 * If it was able to dump core, this kills all
 			 * other threads in the group and synchronizes with
@@ -1842,13 +1858,13 @@ relock:
 			 * first and our do_group_exit call below will use
 			 * that value and ignore the one we pass it.
 			 */
-			do_coredump((long)signr, signr, regs);
+			do_coredump(info->si_signo, info->si_signo, regs);
 		}
 
 		/*
 		 * Death signals, no core dump.
 		 */
-		do_group_exit(signr);
+		do_group_exit(info->si_signo);
 		/* NOTREACHED */
 	}
 	spin_unlock_irq(&sighand->siglock);
-- 
cgit v1.2.3


From fa00b80b3c41a845b3d56f866fb40a2e98754c51 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:54 -0700
Subject: tracehook: job control

This defines the tracehook_notify_jctl() hook to formalize the ptrace
effects on the job control notifications.  There is no change, only
cleanup.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 10b31ecdd9c8..e9e699f4b1bd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -596,9 +596,6 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	return security_task_kill(t, info, sig, 0);
 }
 
-/* forward decl */
-static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
-
 /*
  * Handle magic process-wide effects of stop/continue signals. Unlike
  * the signal actions, these happen immediately at signal-generation
@@ -1605,7 +1602,7 @@ finish_stop(int stop_count)
 	 * a group stop in progress and we are the last to stop,
 	 * report to the parent.  When ptraced, every thread reports itself.
 	 */
-	if (stop_count == 0 || (current->ptrace & PT_PTRACED)) {
+	if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
 		read_lock(&tasklist_lock);
 		do_notify_parent_cldstop(current, CLD_STOPPED);
 		read_unlock(&tasklist_lock);
@@ -1741,6 +1738,9 @@ relock:
 		signal->flags &= ~SIGNAL_CLD_MASK;
 		spin_unlock_irq(&sighand->siglock);
 
+		if (unlikely(!tracehook_notify_jctl(1, why)))
+			goto relock;
+
 		read_lock(&tasklist_lock);
 		do_notify_parent_cldstop(current->group_leader, why);
 		read_unlock(&tasklist_lock);
@@ -1906,7 +1906,7 @@ void exit_signals(struct task_struct *tsk)
 out:
 	spin_unlock_irq(&tsk->sighand->siglock);
 
-	if (unlikely(group_stop)) {
+	if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
 		read_lock(&tasklist_lock);
 		do_notify_parent_cldstop(tsk, CLD_STOPPED);
 		read_unlock(&tasklist_lock);
-- 
cgit v1.2.3


From 2b2a1ff64afbadac842bbc58c5166962cf4f7664 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:54 -0700
Subject: tracehook: death

This moves the ptrace logic in task death (exit_notify) into tracehook.h
inlines.  Some code is rearranged slightly to make things nicer.  There is
no change, only cleanup.

There is one hook called with the tasklist_lock write-locked, as ptrace
needs.  There is also a new hook called after exit_state changes and
without locks.  This is a better place for tracing work to be in the
future, since it doesn't delay the whole system with locking.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c   | 26 +++++++++-----------------
 kernel/signal.c | 10 +++++++---
 2 files changed, 16 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index da28745f7c38..6cdf60712bd2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -885,7 +885,8 @@ static void forget_original_parent(struct task_struct *father)
  */
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
-	int state;
+	int signal;
+	void *cookie;
 
 	/*
 	 * This does two things:
@@ -922,22 +923,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	    !capable(CAP_KILL))
 		tsk->exit_signal = SIGCHLD;
 
-	/* If something other than our normal parent is ptracing us, then
-	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
-	 * only has special meaning to our real parent.
-	 */
-	if (!task_detached(tsk) && thread_group_empty(tsk)) {
-		int signal = ptrace_reparented(tsk) ?
-				SIGCHLD : tsk->exit_signal;
-		do_notify_parent(tsk, signal);
-	} else if (tsk->ptrace) {
-		do_notify_parent(tsk, SIGCHLD);
-	}
+	signal = tracehook_notify_death(tsk, &cookie, group_dead);
+	if (signal > 0)
+		signal = do_notify_parent(tsk, signal);
 
-	state = EXIT_ZOMBIE;
-	if (task_detached(tsk) && likely(!tsk->ptrace))
-		state = EXIT_DEAD;
-	tsk->exit_state = state;
+	tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE;
 
 	/* mt-exec, de_thread() is waiting for us */
 	if (thread_group_leader(tsk) &&
@@ -947,8 +937,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 
 	write_unlock_irq(&tasklist_lock);
 
+	tracehook_report_death(tsk, signal, cookie, group_dead);
+
 	/* If the process is dead, release it - nobody will wait for it */
-	if (state == EXIT_DEAD)
+	if (signal < 0)
 		release_task(tsk);
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index e9e699f4b1bd..0e862d3130ff 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1326,9 +1326,11 @@ static inline void __wake_up_parent(struct task_struct *p,
 /*
  * Let a parent know about the death of a child.
  * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
  */
-
-void do_notify_parent(struct task_struct *tsk, int sig)
+int do_notify_parent(struct task_struct *tsk, int sig)
 {
 	struct siginfo info;
 	unsigned long flags;
@@ -1399,12 +1401,14 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 		 */
 		tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
-			sig = 0;
+			sig = -1;
 	}
 	if (valid_signal(sig) && sig > 0)
 		__group_send_sig_info(sig, &info, tsk->parent);
 	__wake_up_parent(tsk, tsk->parent);
 	spin_unlock_irqrestore(&psig->siglock, flags);
+
+	return sig;
 }
 
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
-- 
cgit v1.2.3


From b787f7ba677840da16a2228c16571ce8a1fcb799 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:55 -0700
Subject: tracehook: force signal_pending()

This defines a new hook tracehook_force_sigpending() that lets tracing
code decide to force TIF_SIGPENDING on in recalc_sigpending().

This is not used yet, so it compiles away to nothing for now.  It lays the
groundwork for new tracing code that can interrupt a task synthetically
without actually sending a signal.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 0e862d3130ff..954f77d7e3bc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -134,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
 
 void recalc_sigpending(void)
 {
-	if (!recalc_sigpending_tsk(current) && !freezing(current))
+	if (unlikely(tracehook_force_sigpending()))
+		set_thread_flag(TIF_SIGPENDING);
+	else if (!recalc_sigpending_tsk(current) && !freezing(current))
 		clear_thread_flag(TIF_SIGPENDING);
 
 }
-- 
cgit v1.2.3


From 85ba2d862e521375a8ee01526c5c46b1f24bb4af Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 25 Jul 2008 19:45:58 -0700
Subject: tracehook: wait_task_inactive

This extends wait_task_inactive() with a new argument so it can be used in
a "soft" mode where it will check for the task changing state unexpectedly
and back off.  There is no change to existing callers.  This lays the
groundwork to allow robust, noninvasive tracing that can try to sample a
blocked thread but back off safely if it wakes up.

Signed-off-by: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kthread.c |  2 +-
 kernel/ptrace.c  |  2 +-
 kernel/sched.c   | 29 +++++++++++++++++++++++++++--
 3 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6111c27491b1..96cff2f8710b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 		return;
 	}
 	/* Must have done schedule() in kthread() before we set_task_cpu */
-	wait_task_inactive(k);
+	wait_task_inactive(k, 0);
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
 	k->rt.nr_cpus_allowed = 1;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da6450..082b3fcb32a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	read_unlock(&tasklist_lock);
 
 	if (!ret && !kill)
-		wait_task_inactive(child);
+		ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
 
 	/* All systems go.. */
 	return ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index fde1a1026359..0236958addcb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
  * The caller must ensure that the task *will* unschedule sometime soon,
  * else this function might spin for a *long* time. This function can't
  * be called with interrupts off, or it may introduce deadlock with
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-void wait_task_inactive(struct task_struct *p)
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
 	int running, on_rq;
+	unsigned long ncsw;
 	struct rq *rq;
 
 	for (;;) {
@@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p)
 		 * return false if the runqueue has changed and p
 		 * is actually now running somewhere else!
 		 */
-		while (task_running(rq, p))
+		while (task_running(rq, p)) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
 			cpu_relax();
+		}
 
 		/*
 		 * Ok, time to look more closely! We need the rq
@@ -1910,8 +1921,20 @@ void wait_task_inactive(struct task_struct *p)
 		rq = task_rq_lock(p, &flags);
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
+		ncsw = 0;
+		if (!match_state || p->state == match_state) {
+			ncsw = p->nivcsw + p->nvcsw;
+			if (unlikely(!ncsw))
+				ncsw = 1;
+		}
 		task_rq_unlock(rq, &flags);
 
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
 		/*
 		 * Was it really running after all now that we
 		 * checked with the proper locks actually held?
@@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p)
 		 */
 		break;
 	}
+
+	return ncsw;
 }
 
 /***
-- 
cgit v1.2.3


From 96930a6365c99c160138a395566e360b27348b8f Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Fri, 25 Jul 2008 19:46:21 -0700
Subject: make cgroup_seqfile_release() static

cgroup_seqfile_release() can become static.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66ec9fd21e0c..89bd6fb7894f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1529,7 +1529,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 	return cft->read_seq_string(state->cgroup, cft, m);
 }
 
-int cgroup_seqfile_release(struct inode *inode, struct file *file)
+static int cgroup_seqfile_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq = file->private_data;
 	kfree(seq->private);
-- 
cgit v1.2.3


From 734550921e9b7ab924a43aa3d0bd4239dac4fbf1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Jul 2008 21:22:20 -0400
Subject: [PATCH] beginning of sysctl cleanup - ctl_table_set

New object: set of sysctls [currently - root and per-net-ns].
Contains: pointer to parent set, list of tables and "should I see this set?"
method (->is_seen(set)).
Current lists of tables are subsumed by that; net-ns contains such a beast.
->lookup() for ctl_table_root returns pointer to ctl_table_set instead of
that to ->list of that ctl_table_set.

[folded compile fixes by rdd for configs without sysctl]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/sysctl.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35a50db9b6ce..8ee4a0619fbb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -160,12 +160,13 @@ static struct ctl_table root_table[];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
 	.ctl_table = root_table,
-	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
+	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
 	.root = &sysctl_table_root,
+	.set = &sysctl_table_root.default_set,
 };
 static struct ctl_table_root sysctl_table_root = {
 	.root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
-	.header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
+	.default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
 };
 
 static struct ctl_table kern_table[];
@@ -1403,14 +1404,20 @@ void sysctl_head_finish(struct ctl_table_header *head)
 	spin_unlock(&sysctl_lock);
 }
 
+static struct ctl_table_set *
+lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+{
+	struct ctl_table_set *set = &root->default_set;
+	if (root->lookup)
+		set = root->lookup(root, namespaces);
+	return set;
+}
+
 static struct list_head *
 lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
 {
-	struct list_head *header_list;
-	header_list = &root->header_list;
-	if (root->lookup)
-		header_list = root->lookup(root, namespaces);
-	return header_list;
+	struct ctl_table_set *set = lookup_header_set(root, namespaces);
+	return &set->list;
 }
 
 struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
@@ -1720,7 +1727,6 @@ struct ctl_table_header *__register_sysctl_paths(
 	struct nsproxy *namespaces,
 	const struct ctl_path *path, struct ctl_table *table)
 {
-	struct list_head *header_list;
 	struct ctl_table_header *header;
 	struct ctl_table *new, **prevp;
 	unsigned int n, npath;
@@ -1772,8 +1778,8 @@ struct ctl_table_header *__register_sysctl_paths(
 	}
 #endif
 	spin_lock(&sysctl_lock);
-	header_list = lookup_header_list(root, namespaces);
-	list_add_tail(&header->ctl_entry, header_list);
+	header->set = lookup_header_set(root, namespaces);
+	list_add_tail(&header->ctl_entry, &header->set->list);
 	spin_unlock(&sysctl_lock);
 
 	return header;
@@ -1832,6 +1838,15 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 	kfree(header);
 }
 
+void setup_sysctl_set(struct ctl_table_set *p,
+	struct ctl_table_set *parent,
+	int (*is_seen)(struct ctl_table_set *))
+{
+	INIT_LIST_HEAD(&p->list);
+	p->parent = parent ? parent : &sysctl_table_root.default_set;
+	p->is_seen = is_seen;
+}
+
 #else /* !CONFIG_SYSCTL */
 struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
 {
@@ -1848,6 +1863,12 @@ void unregister_sysctl_table(struct ctl_table_header * table)
 {
 }
 
+void setup_sysctl_set(struct ctl_table_set *p,
+	struct ctl_table_set *parent,
+	int (*is_seen)(struct ctl_table_set *))
+{
+}
+
 #endif /* CONFIG_SYSCTL */
 
 /*
-- 
cgit v1.2.3


From f7e6ced4061da509f737541ca4dbd44d83a6e82f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Jul 2008 01:44:23 -0400
Subject: [PATCH] allow delayed freeing of ctl_table_header

Refcount the sucker; instead of freeing it by the end of unregistration
just drop the refcount and free only when it hits zero.  Make sure that
we _always_ make ->unregistering non-NULL in start_unregistering().

That allows anybody to get a reference to such puppy, preventing its
freeing and reuse.  It does *not* block unregistration.  Anybody who
holds such a reference can
	* try to grab a "use" reference (ctl_head_grab()); that will
succeeds if and only if it hadn't entered unregistration yet.  If it
succeeds, we can use it in all normal ways until we release the "use"
reference (with ctl_head_finish()).  Note that this relies on having
->unregistering become non-NULL in all cases when one starts to unregister
the sucker.
	* keep pointers to ctl_table entries; they *can* be freed if
the entire thing is unregistered.  However, if ctl_head_grab() succeeds,
we know that unregistration had not happened (and will not happen until
ctl_head_finish()) and such pointers can be used safely.

IOW, now we can have inodes under /proc/sys keep references to ctl_table
entries, protecting them with references to ctl_table_header and
grabbing the latter for the duration of operations that require access
to ctl_table.  That won't cause deadlocks, since unregistration will not
be stopped by mere keeping a reference to ctl_table_header.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/sysctl.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8ee4a0619fbb..60d9357e7172 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1387,6 +1387,9 @@ static void start_unregistering(struct ctl_table_header *p)
 		spin_unlock(&sysctl_lock);
 		wait_for_completion(&wait);
 		spin_lock(&sysctl_lock);
+	} else {
+		/* anything non-NULL; we'll never dereference it */
+		p->unregistering = ERR_PTR(-EINVAL);
 	}
 	/*
 	 * do not remove from the list until nobody holds it; walking the
@@ -1395,6 +1398,32 @@ static void start_unregistering(struct ctl_table_header *p)
 	list_del_init(&p->ctl_entry);
 }
 
+void sysctl_head_get(struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	head->count++;
+	spin_unlock(&sysctl_lock);
+}
+
+void sysctl_head_put(struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	if (!--head->count)
+		kfree(head);
+	spin_unlock(&sysctl_lock);
+}
+
+struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
+{
+	if (!head)
+		BUG();
+	spin_lock(&sysctl_lock);
+	if (!use_table(head))
+		head = ERR_PTR(-ENOENT);
+	spin_unlock(&sysctl_lock);
+	return head;
+}
+
 void sysctl_head_finish(struct ctl_table_header *head)
 {
 	if (!head)
@@ -1771,6 +1800,7 @@ struct ctl_table_header *__register_sysctl_paths(
 	header->unregistering = NULL;
 	header->root = root;
 	sysctl_set_parent(NULL, header->ctl_table);
+	header->count = 1;
 #ifdef CONFIG_SYSCTL_SYSCALL_CHECK
 	if (sysctl_check_table(namespaces, header->ctl_table)) {
 		kfree(header);
@@ -1834,8 +1864,9 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 
 	spin_lock(&sysctl_lock);
 	start_unregistering(header);
+	if (!--header->count)
+		kfree(header);
 	spin_unlock(&sysctl_lock);
-	kfree(header);
 }
 
 void setup_sysctl_set(struct ctl_table_set *p,
@@ -1869,6 +1900,10 @@ void setup_sysctl_set(struct ctl_table_set *p,
 {
 }
 
+void sysctl_head_put(struct ctl_table_header *head)
+{
+}
+
 #endif /* CONFIG_SYSCTL */
 
 /*
-- 
cgit v1.2.3


From ae7edecc9b8810770a8e5cb9a466ea4bdcfa8401 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Jul 2008 06:33:31 -0400
Subject: [PATCH] sysctl: keep track of tree relationships

In a sense, that's the heart of the series.  It's based on the following
property of the trees we are actually asked to add: they can be split into
stem that is already covered by registered trees and crown that is entirely
new.  IOW, if a/b and a/c/d are introduced by our tree, then a/c is also
introduced by it.

That allows to associate tree and table entry with each node in the union;
while directory nodes might be covered by many trees, only one will cover
the node by its crown.  And that will allow much saner logics for /proc/sys
in the next patches.  This patch introduces the data structures needed to
keep track of that.

When adding a sysctl table, we find a "parent" one.  Which is to say,
find the deepest node on its stem that already is present in one of the
tables from our table set or its ancestor sets.  That table will be our
parent and that node in it - attachment point.  Add our table to list
anchored in parent, have it refer the parent and contents of attachment
point.  Also remember where its crown lives.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/sysctl.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 60d9357e7172..c9a0af887033 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1680,6 +1680,52 @@ static __init int sysctl_init(void)
 
 core_initcall(sysctl_init);
 
+static int is_branch_in(struct ctl_table *branch, struct ctl_table *table)
+{
+	struct ctl_table *p;
+	const char *s = branch->procname;
+
+	/* branch should have named subdirectory as its first element */
+	if (!s || !branch->child)
+		return 0;
+
+	/* ... and nothing else */
+	if (branch[1].procname || branch[1].ctl_name)
+		return 0;
+
+	/* table should contain subdirectory with the same name */
+	for (p = table; p->procname || p->ctl_name; p++) {
+		if (!p->child)
+			continue;
+		if (p->procname && strcmp(p->procname, s) == 0)
+			return 1;
+	}
+	return 0;
+}
+
+/* see if attaching q to p would be an improvement */
+static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
+{
+	struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
+	int is_better = 0;
+	int not_in_parent = !p->attached_by;
+
+	while (is_branch_in(by, to)) {
+		if (by == q->attached_by)
+			is_better = 1;
+		if (to == p->attached_by)
+			not_in_parent = 1;
+		by = by->child;
+		to = to->child;
+	}
+
+	if (is_better && not_in_parent) {
+		q->attached_by = by;
+		q->attached_to = to;
+		q->parent = p;
+	}
+}
+
 /**
  * __register_sysctl_paths - register a sysctl hierarchy
  * @root: List of sysctl headers to register on
@@ -1759,6 +1805,7 @@ struct ctl_table_header *__register_sysctl_paths(
 	struct ctl_table_header *header;
 	struct ctl_table *new, **prevp;
 	unsigned int n, npath;
+	struct ctl_table_set *set;
 
 	/* Count the path components */
 	for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
@@ -1809,6 +1856,18 @@ struct ctl_table_header *__register_sysctl_paths(
 #endif
 	spin_lock(&sysctl_lock);
 	header->set = lookup_header_set(root, namespaces);
+	header->attached_by = header->ctl_table;
+	header->attached_to = root_table;
+	header->parent = &root_table_header;
+	for (set = header->set; set; set = set->parent) {
+		struct ctl_table_header *p;
+		list_for_each_entry(p, &set->list, ctl_entry) {
+			if (p->unregistering)
+				continue;
+			try_attach(p, header);
+		}
+	}
+	header->parent->count++;
 	list_add_tail(&header->ctl_entry, &header->set->list);
 	spin_unlock(&sysctl_lock);
 
@@ -1864,6 +1923,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 
 	spin_lock(&sysctl_lock);
 	start_unregistering(header);
+	if (!--header->parent->count) {
+		WARN_ON(1);
+		kfree(header->parent);
+	}
 	if (!--header->count)
 		kfree(header);
 	spin_unlock(&sysctl_lock);
-- 
cgit v1.2.3


From 9043476f726802f4b00c96d0c4f418dde48d1304 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Jul 2008 08:54:06 -0400
Subject: [PATCH] sanitize proc_sysctl

* keep references to ctl_table_head and ctl_table in /proc/sys inodes
* grab the former during operations, use the latter for access to
  entry if that succeeds
* have ->d_compare() check if table should be seen for one who does lookup;
  that allows us to avoid flipping inodes - if we have the same name resolve
  to different things, we'll just keep several dentries and ->d_compare()
  will reject the wrong ones.
* have ->lookup() and ->readdir() scan the table of our inode first, then
  walk all ctl_table_header and scan ->attached_by for those that are
  attached to our directory.
* implement ->getattr().
* get rid of insane amounts of tree-walking
* get rid of the need to know dentry in ->permission() and of the contortions
  induced by that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/sysctl.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c9a0af887033..ff5abcca5ddf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1932,6 +1932,21 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 	spin_unlock(&sysctl_lock);
 }
 
+int sysctl_is_seen(struct ctl_table_header *p)
+{
+	struct ctl_table_set *set = p->set;
+	int res;
+	spin_lock(&sysctl_lock);
+	if (p->unregistering)
+		res = 0;
+	else if (!set->is_seen)
+		res = 1;
+	else
+		res = set->is_seen(set);
+	spin_unlock(&sysctl_lock);
+	return res;
+}
+
 void setup_sysctl_set(struct ctl_table_set *p,
 	struct ctl_table_set *parent,
 	int (*is_seen)(struct ctl_table_set *))
-- 
cgit v1.2.3


From e6305c43eda10ebfd2ad9e35d6e172ccc7bb3695 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Jul 2008 21:03:57 -0400
Subject: [PATCH] sanitize ->permission() prototype

* kill nameidata * argument; map the 3 bits in ->flags anybody cares
  about to new MAY_... ones and pass with the mask.
* kill redundant gfs2_iop_permission()
* sanitize ecryptfs_permission()
* fix remaining places where ->permission() instances might barf on new
  MAY_... found in mask.

The obvious next target in that direction is permission(9)

folded fix for nfs_permission() breakage from Miklos Szeredi <mszeredi@suse.cz>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/sysctl.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ff5abcca5ddf..911d846f0503 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1516,9 +1516,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
 	int op = 0, rc;
 
 	if (oldval)
-		op |= 004;
+		op |= MAY_READ;
 	if (newval)
-		op |= 002;
+		op |= MAY_WRITE;
 	if (sysctl_perm(root, table, op))
 		return -EPERM;
 
@@ -1560,7 +1560,7 @@ repeat:
 		if (n == table->ctl_name) {
 			int error;
 			if (table->child) {
-				if (sysctl_perm(root, table, 001))
+				if (sysctl_perm(root, table, MAY_EXEC))
 					return -EPERM;
 				name++;
 				nlen--;
@@ -1635,7 +1635,7 @@ static int test_perm(int mode, int op)
 		mode >>= 6;
 	else if (in_egroup_p(0))
 		mode >>= 3;
-	if ((mode & op & 0007) == op)
+	if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
 		return 0;
 	return -EACCES;
 }
@@ -1645,7 +1645,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
 	int error;
 	int mode;
 
-	error = security_sysctl(table, op);
+	error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
 	if (error)
 		return error;
 
-- 
cgit v1.2.3


From 7f2da1e7d0330395e5e9e350b879b98a1ea495df Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 10 May 2008 20:44:54 -0400
Subject: [PATCH] kill altroot

long overdue...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exec_domain.c | 1 -
 kernel/exit.c        | 2 --
 kernel/fork.c        | 7 -------
 3 files changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c1ef192aa655..0d407e886735 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -168,7 +168,6 @@ __set_personality(u_long personality)
 	current->personality = personality;
 	oep = current_thread_info()->exec_domain;
 	current_thread_info()->exec_domain = ep;
-	set_fs_altroot();
 
 	module_put(oep->module);
 	return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6cdf60712bd2..0caf590548a0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -565,8 +565,6 @@ void put_fs_struct(struct fs_struct *fs)
 	if (atomic_dec_and_test(&fs->count)) {
 		path_put(&fs->root);
 		path_put(&fs->pwd);
-		if (fs->altroot.dentry)
-			path_put(&fs->altroot);
 		kmem_cache_free(fs_cachep, fs);
 	}
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index abb3ed6298f6..5e050c1317c4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -657,13 +657,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
 		path_get(&old->root);
 		fs->pwd = old->pwd;
 		path_get(&old->pwd);
-		if (old->altroot.dentry) {
-			fs->altroot = old->altroot;
-			path_get(&old->altroot);
-		} else {
-			fs->altroot.mnt = NULL;
-			fs->altroot.dentry = NULL;
-		}
 		read_unlock(&old->lock);
 	}
 	return fs;
-- 
cgit v1.2.3


From 3f8206d496e9e9495afb1d4e70d29712b4d403c9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 26 Jul 2008 03:46:43 -0400
Subject: [PATCH] get rid of indirect users of namei.h

fs.h needs path.h, not namei.h; nfs_fs.h doesn't need it at all.
Several places in the tree needed direct include.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 89bd6fb7894f..657f8f8d93a5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,6 +45,7 @@
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
+#include <linux/namei.h>
 
 #include <asm/atomic.h>
 
-- 
cgit v1.2.3


From bfbcf034798b2ca45338cee5049b5694b7ddc865 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 27 Jul 2008 06:31:22 +0100
Subject: lost sysctl fix

try_attach() should walk into the matching subdirectory, not the first one...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Valdis.Kletnieks@vt.edu
Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 911d846f0503..fe4713347275 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1680,43 +1680,45 @@ static __init int sysctl_init(void)
 
 core_initcall(sysctl_init);
 
-static int is_branch_in(struct ctl_table *branch, struct ctl_table *table)
+static struct ctl_table *is_branch_in(struct ctl_table *branch,
+				      struct ctl_table *table)
 {
 	struct ctl_table *p;
 	const char *s = branch->procname;
 
 	/* branch should have named subdirectory as its first element */
 	if (!s || !branch->child)
-		return 0;
+		return NULL;
 
 	/* ... and nothing else */
 	if (branch[1].procname || branch[1].ctl_name)
-		return 0;
+		return NULL;
 
 	/* table should contain subdirectory with the same name */
 	for (p = table; p->procname || p->ctl_name; p++) {
 		if (!p->child)
 			continue;
 		if (p->procname && strcmp(p->procname, s) == 0)
-			return 1;
+			return p;
 	}
-	return 0;
+	return NULL;
 }
 
 /* see if attaching q to p would be an improvement */
 static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
 {
 	struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
+	struct ctl_table *next;
 	int is_better = 0;
 	int not_in_parent = !p->attached_by;
 
-	while (is_branch_in(by, to)) {
+	while ((next = is_branch_in(by, to)) != NULL) {
 		if (by == q->attached_by)
 			is_better = 1;
 		if (to == p->attached_by)
 			not_in_parent = 1;
 		by = by->child;
-		to = to->child;
+		to = next->child;
 	}
 
 	if (is_better && not_in_parent) {
-- 
cgit v1.2.3


From 605ccb73f6a1c891a16268b3a2923208fc637958 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Sun, 27 Jul 2008 13:39:03 +0200
Subject: tracing: remove unused variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the following warning with CONFIG_TRACING=y:

	kernel/trace/trace.c: In function ‘s_next’:
	kernel/trace/trace.c:1186: warning: unused variable ‘last_ent’

Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/trace/trace.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fc20e09a6cb1..8f3fb3db61c3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1183,7 +1183,6 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
 static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct trace_iterator *iter = m->private;
-	void *last_ent = iter->ent;
 	int i = (int)*pos;
 	void *ent;
 
-- 
cgit v1.2.3


From 5995477ab7f3522c497c9c4a1c55373e9d655574 Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Sun, 27 Jul 2008 17:29:15 +0200
Subject: task IO accounting: improve code readability

Put all i/o statistics in struct proc_io_accounting and use inline functions to
initialize and increment statistics, removing a lot of single variable
assignments.

This also reduces the kernel size as following (with CONFIG_TASK_XACCT=y and
CONFIG_TASK_IO_ACCOUNTING=y).

    text    data     bss     dec     hex filename
   11651       0       0   11651    2d83 kernel/exit.o.before
   11619       0       0   11619    2d63 kernel/exit.o.after
   10886     132     136   11154    2b92 kernel/fork.o.before
   10758     132     136   11026    2b12 kernel/fork.o.after

 3082029  807968 4818600 8708597  84e1f5 vmlinux.o.before
 3081869  807968 4818600 8708437  84e155 vmlinux.o.after

Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Acked-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c   | 30 +++---------------------------
 kernel/fork.c   | 15 ++-------------
 kernel/tsacct.c | 14 +++++++-------
 3 files changed, 12 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 0caf590548a0..eb4d6470d1d0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -121,18 +121,7 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->nivcsw += tsk->nivcsw;
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
-#ifdef CONFIG_TASK_XACCT
-		sig->rchar += tsk->rchar;
-		sig->wchar += tsk->wchar;
-		sig->syscr += tsk->syscr;
-		sig->syscw += tsk->syscw;
-#endif /* CONFIG_TASK_XACCT */
-#ifdef CONFIG_TASK_IO_ACCOUNTING
-		sig->ioac.read_bytes += tsk->ioac.read_bytes;
-		sig->ioac.write_bytes += tsk->ioac.write_bytes;
-		sig->ioac.cancelled_write_bytes +=
-					tsk->ioac.cancelled_write_bytes;
-#endif /* CONFIG_TASK_IO_ACCOUNTING */
+		task_io_accounting_add(&sig->ioac, &tsk->ioac);
 		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
@@ -1363,21 +1352,8 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		psig->coublock +=
 			task_io_get_oublock(p) +
 			sig->oublock + sig->coublock;
-#ifdef CONFIG_TASK_XACCT
-		psig->rchar += p->rchar + sig->rchar;
-		psig->wchar += p->wchar + sig->wchar;
-		psig->syscr += p->syscr + sig->syscr;
-		psig->syscw += p->syscw + sig->syscw;
-#endif /* CONFIG_TASK_XACCT */
-#ifdef CONFIG_TASK_IO_ACCOUNTING
-		psig->ioac.read_bytes +=
-			p->ioac.read_bytes + sig->ioac.read_bytes;
-		psig->ioac.write_bytes +=
-			p->ioac.write_bytes + sig->ioac.write_bytes;
-		psig->ioac.cancelled_write_bytes +=
-				p->ioac.cancelled_write_bytes +
-				sig->ioac.cancelled_write_bytes;
-#endif /* CONFIG_TASK_IO_ACCOUNTING */
+		task_io_accounting_add(&psig->ioac, &p->ioac);
+		task_io_accounting_add(&psig->ioac, &sig->ioac);
 		spin_unlock_irq(&p->parent->sighand->siglock);
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e050c1317c4..8214ba7c8bb1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -806,12 +806,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
-#ifdef CONFIG_TASK_XACCT
-	sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0;
-#endif
-#ifdef CONFIG_TASK_IO_ACCOUNTING
-	memset(&sig->ioac, 0, sizeof(sig->ioac));
-#endif
+	task_io_accounting_init(&sig->ioac);
 	sig->sum_sched_runtime = 0;
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -994,13 +989,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->last_switch_timestamp = 0;
 #endif
 
-#ifdef CONFIG_TASK_XACCT
-	p->rchar = 0;		/* I/O counter: bytes read */
-	p->wchar = 0;		/* I/O counter: bytes written */
-	p->syscr = 0;		/* I/O counter: read syscalls */
-	p->syscw = 0;		/* I/O counter: write syscalls */
-#endif
-	task_io_accounting_init(p);
+	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
 
 	p->it_virt_expires = cputime_zero;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 3da47ccdc5e5..f9cd2561689c 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 		stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
 		mmput(mm);
 	}
-	stats->read_char	= p->rchar;
-	stats->write_char	= p->wchar;
-	stats->read_syscalls	= p->syscr;
-	stats->write_syscalls	= p->syscw;
+	stats->read_char	= p->ioac.chr.rchar;
+	stats->write_char	= p->ioac.chr.wchar;
+	stats->read_syscalls	= p->ioac.chr.syscr;
+	stats->write_syscalls	= p->ioac.chr.syscw;
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	stats->read_bytes	= p->ioac.read_bytes;
-	stats->write_bytes	= p->ioac.write_bytes;
-	stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
+	stats->read_bytes	= p->ioac.blk.read_bytes;
+	stats->write_bytes	= p->ioac.blk.write_bytes;
+	stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes;
 #else
 	stats->read_bytes	= 0;
 	stats->write_bytes	= 0;
-- 
cgit v1.2.3


From 940389b8afad6495211614c13eb91ef7001773ec Mon Sep 17 00:00:00 2001
From: Andrea Righi <righi.andrea@gmail.com>
Date: Mon, 28 Jul 2008 00:48:12 +0200
Subject: task IO accounting: move all IO statistics in struct
 task_io_accounting

Simplify the code of include/linux/task_io_accounting.h.

It is also more reasonable to have all the task i/o-related statistics in a
single struct (task_io_accounting).

Signed-off-by: Andrea Righi <righi.andrea@gmail.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/tsacct.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f9cd2561689c..8ebcd8532dfb 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -94,14 +94,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 		stats->hiwater_vm    = mm->hiwater_vm * PAGE_SIZE / KB;
 		mmput(mm);
 	}
-	stats->read_char	= p->ioac.chr.rchar;
-	stats->write_char	= p->ioac.chr.wchar;
-	stats->read_syscalls	= p->ioac.chr.syscr;
-	stats->write_syscalls	= p->ioac.chr.syscw;
+	stats->read_char	= p->ioac.rchar;
+	stats->write_char	= p->ioac.wchar;
+	stats->read_syscalls	= p->ioac.syscr;
+	stats->write_syscalls	= p->ioac.syscw;
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	stats->read_bytes	= p->ioac.blk.read_bytes;
-	stats->write_bytes	= p->ioac.blk.write_bytes;
-	stats->cancelled_write_bytes = p->ioac.blk.cancelled_write_bytes;
+	stats->read_bytes	= p->ioac.read_bytes;
+	stats->write_bytes	= p->ioac.write_bytes;
+	stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
 #else
 	stats->read_bytes	= 0;
 	stats->write_bytes	= 0;
-- 
cgit v1.2.3


From 15bba37d62351749c3915add81f673b256952ee1 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Thu, 24 Jul 2008 15:41:48 +0100
Subject: module: fix build warning with !CONFIG_KALLSYMS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixed the warning:

  CC      kernel/module.o
  /home/wangcong/Projects/linux-2.6/kernel/module.c:332: warning:
‘lookup_symbol’ defined but not used

Signed-off-by: WANG Cong <wangcong@zeuux.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index d8b5605132a0..d861bd5b8c10 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -325,18 +325,6 @@ static unsigned long find_symbol(const char *name,
 	return -ENOENT;
 }
 
-/* lookup symbol in given range of kernel_symbols */
-static const struct kernel_symbol *lookup_symbol(const char *name,
-	const struct kernel_symbol *start,
-	const struct kernel_symbol *stop)
-{
-	const struct kernel_symbol *ks = start;
-	for (; ks < stop; ks++)
-		if (strcmp(ks->name, name) == 0)
-			return ks;
-	return NULL;
-}
-
 /* Search for module by name: must hold module_mutex. */
 static struct module *find_module(const char *name)
 {
@@ -1703,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
 }
 
 #ifdef CONFIG_KALLSYMS
+
+/* lookup symbol in given range of kernel_symbols */
+static const struct kernel_symbol *lookup_symbol(const char *name,
+	const struct kernel_symbol *start,
+	const struct kernel_symbol *stop)
+{
+	const struct kernel_symbol *ks = start;
+	for (; ks < stop; ks++)
+		if (strcmp(ks->name, name) == 0)
+			return ks;
+	return NULL;
+}
+
 static int is_exported(const char *name, const struct module *mod)
 {
 	if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
-- 
cgit v1.2.3


From 5c2aed622571ac7c3c6ec182d6d3c318e4b45c8b Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 28 Feb 2008 11:33:03 -0500
Subject: stop_machine: add ALL_CPUS option

-allow stop_mahcine_run() to call a function on all cpus. Calling
 stop_machine_run() with a 'ALL_CPUS' invokes this new behavior.
 stop_machine_run() proceeds as normal until the calling cpu has
 invoked 'fn'. Then, we tell all the other cpus to call 'fn'.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
CC: Adrian Bunk <bunk@stusta.de>
CC: Andi Kleen <andi@firstfloor.org>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Christoph Hellwig <hch@infradead.org>
CC: mingo@elte.hu
CC: akpm@osdl.org
---
 kernel/stop_machine.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 738b411ff2d3..a473bd0cb71b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -22,9 +22,17 @@ enum stopmachine_state {
 	STOPMACHINE_WAIT,
 	STOPMACHINE_PREPARE,
 	STOPMACHINE_DISABLE_IRQ,
+	STOPMACHINE_RUN,
 	STOPMACHINE_EXIT,
 };
 
+struct stop_machine_data {
+	int (*fn)(void *);
+	void *data;
+	struct completion done;
+	int run_all;
+} smdata;
+
 static enum stopmachine_state stopmachine_state;
 static unsigned int stopmachine_num_threads;
 static atomic_t stopmachine_thread_ack;
@@ -33,6 +41,7 @@ static int stopmachine(void *cpu)
 {
 	int irqs_disabled = 0;
 	int prepared = 0;
+	int ran = 0;
 	cpumask_of_cpu_ptr(cpumask, (int)(long)cpu);
 
 	set_cpus_allowed_ptr(current, cpumask);
@@ -58,6 +67,11 @@ static int stopmachine(void *cpu)
 			prepared = 1;
 			smp_mb(); /* Must read state first. */
 			atomic_inc(&stopmachine_thread_ack);
+		} else if (stopmachine_state == STOPMACHINE_RUN && !ran) {
+			smdata.fn(smdata.data);
+			ran = 1;
+			smp_mb(); /* Must read state first. */
+			atomic_inc(&stopmachine_thread_ack);
 		}
 		/* Yield in first stage: migration threads need to
 		 * help our sisters onto their CPUs. */
@@ -136,11 +150,10 @@ static void restart_machine(void)
 	preempt_enable_no_resched();
 }
 
-struct stop_machine_data {
-	int (*fn)(void *);
-	void *data;
-	struct completion done;
-};
+static void run_other_cpus(void)
+{
+	stopmachine_set_state(STOPMACHINE_RUN);
+}
 
 static int do_stop(void *_smdata)
 {
@@ -150,6 +163,8 @@ static int do_stop(void *_smdata)
 	ret = stop_machine();
 	if (ret == 0) {
 		ret = smdata->fn(smdata->data);
+		if (smdata->run_all)
+			run_other_cpus();
 		restart_machine();
 	}
 
@@ -173,14 +188,17 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
 	struct stop_machine_data smdata;
 	struct task_struct *p;
 
+	mutex_lock(&stopmachine_mutex);
+
 	smdata.fn = fn;
 	smdata.data = data;
+	smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0;
 	init_completion(&smdata.done);
 
-	mutex_lock(&stopmachine_mutex);
+	smp_wmb(); /* make sure other cpus see smdata updates */
 
 	/* If they don't care which CPU fn runs on, bind to any online one. */
-	if (cpu == NR_CPUS)
+	if (cpu == NR_CPUS || cpu == ALL_CPUS)
 		cpu = raw_smp_processor_id();
 
 	p = kthread_create(do_stop, &smdata, "kstopmachine");
-- 
cgit v1.2.3


From ffdb5976c47609c862917d4c186ecbb5706d2dda Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 28 Jul 2008 12:16:28 -0500
Subject: Simplify stop_machine

stop_machine creates a kthread which creates kernel threads.  We can
create those threads directly and simplify things a little.  Some care
must be taken with CPU hotunplug, which has special needs, but that code
seems more robust than it was in the past.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 kernel/cpu.c          |  13 +--
 kernel/stop_machine.c | 293 +++++++++++++++++++++-----------------------------
 2 files changed, 128 insertions(+), 178 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 10ba5f1004a5..cf79bb911371 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -216,7 +216,6 @@ static int __ref take_cpu_down(void *_param)
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
-	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
@@ -250,19 +249,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	cpu_clear(cpu, tmp);
 	set_cpus_allowed_ptr(current, &tmp);
 
-	p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
+	err = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
 
-	if (IS_ERR(p) || cpu_online(cpu)) {
+	if (err || cpu_online(cpu)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 
-		if (IS_ERR(p)) {
-			err = PTR_ERR(p);
-			goto out_allowed;
-		}
-		goto out_thread;
+		goto out_allowed;
 	}
 
 	/* Wait for it to sleep (leaving idle task). */
@@ -279,8 +274,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	check_for_tasks(cpu);
 
-out_thread:
-	err = kthread_stop(p);
 out_allowed:
 	set_cpus_allowed_ptr(current, &old_allowed);
 out_release:
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a473bd0cb71b..35882dccc943 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,4 +1,4 @@
-/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
+/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
  * GPL v2 and any later version.
  */
 #include <linux/cpu.h>
@@ -13,220 +13,177 @@
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 
-/* Since we effect priority and affinity (both of which are visible
- * to, and settable by outside processes) we do indirection via a
- * kthread. */
-
-/* Thread to stop each CPU in user context. */
+/* This controls the threads on each CPU. */
 enum stopmachine_state {
-	STOPMACHINE_WAIT,
+	/* Dummy starting state for thread. */
+	STOPMACHINE_NONE,
+	/* Awaiting everyone to be scheduled. */
 	STOPMACHINE_PREPARE,
+	/* Disable interrupts. */
 	STOPMACHINE_DISABLE_IRQ,
+	/* Run the function */
 	STOPMACHINE_RUN,
+	/* Exit */
 	STOPMACHINE_EXIT,
 };
+static enum stopmachine_state state;
 
 struct stop_machine_data {
 	int (*fn)(void *);
 	void *data;
-	struct completion done;
-	int run_all;
-} smdata;
+	int fnret;
+};
 
-static enum stopmachine_state stopmachine_state;
-static unsigned int stopmachine_num_threads;
-static atomic_t stopmachine_thread_ack;
+/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+static unsigned int num_threads;
+static atomic_t thread_ack;
+static struct completion finished;
+static DEFINE_MUTEX(lock);
 
-static int stopmachine(void *cpu)
+static void set_state(enum stopmachine_state newstate)
 {
-	int irqs_disabled = 0;
-	int prepared = 0;
-	int ran = 0;
-	cpumask_of_cpu_ptr(cpumask, (int)(long)cpu);
-
-	set_cpus_allowed_ptr(current, cpumask);
-
-	/* Ack: we are alive */
-	smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
-	atomic_inc(&stopmachine_thread_ack);
-
-	/* Simple state machine */
-	while (stopmachine_state != STOPMACHINE_EXIT) {
-		if (stopmachine_state == STOPMACHINE_DISABLE_IRQ 
-		    && !irqs_disabled) {
-			local_irq_disable();
-			hard_irq_disable();
-			irqs_disabled = 1;
-			/* Ack: irqs disabled. */
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
-		} else if (stopmachine_state == STOPMACHINE_PREPARE
-			   && !prepared) {
-			/* Everyone is in place, hold CPU. */
-			preempt_disable();
-			prepared = 1;
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
-		} else if (stopmachine_state == STOPMACHINE_RUN && !ran) {
-			smdata.fn(smdata.data);
-			ran = 1;
-			smp_mb(); /* Must read state first. */
-			atomic_inc(&stopmachine_thread_ack);
-		}
-		/* Yield in first stage: migration threads need to
-		 * help our sisters onto their CPUs. */
-		if (!prepared && !irqs_disabled)
-			yield();
-		cpu_relax();
-	}
-
-	/* Ack: we are exiting. */
-	smp_mb(); /* Must read state first. */
-	atomic_inc(&stopmachine_thread_ack);
-
-	if (irqs_disabled)
-		local_irq_enable();
-	if (prepared)
-		preempt_enable();
-
-	return 0;
+	/* Reset ack counter. */
+	atomic_set(&thread_ack, num_threads);
+	smp_wmb();
+	state = newstate;
 }
 
-/* Change the thread state */
-static void stopmachine_set_state(enum stopmachine_state state)
+/* Last one to ack a state moves to the next state. */
+static void ack_state(void)
 {
-	atomic_set(&stopmachine_thread_ack, 0);
-	smp_wmb();
-	stopmachine_state = state;
-	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-		cpu_relax();
+	if (atomic_dec_and_test(&thread_ack)) {
+		/* If we're the last one to ack the EXIT, we're finished. */
+		if (state == STOPMACHINE_EXIT)
+			complete(&finished);
+		else
+			set_state(state + 1);
+	}
 }
 
-static int stop_machine(void)
+/* This is the actual thread which stops the CPU.  It exits by itself rather
+ * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
+static int stop_cpu(struct stop_machine_data *smdata)
 {
-	int i, ret = 0;
-
-	atomic_set(&stopmachine_thread_ack, 0);
-	stopmachine_num_threads = 0;
-	stopmachine_state = STOPMACHINE_WAIT;
+	enum stopmachine_state curstate = STOPMACHINE_NONE;
+	int uninitialized_var(ret);
 
-	for_each_online_cpu(i) {
-		if (i == raw_smp_processor_id())
-			continue;
-		ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
-		if (ret < 0)
-			break;
-		stopmachine_num_threads++;
-	}
-
-	/* Wait for them all to come to life. */
-	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
-		yield();
+	/* Simple state machine */
+	do {
+		/* Chill out and ensure we re-read stopmachine_state. */
 		cpu_relax();
-	}
-
-	/* If some failed, kill them all. */
-	if (ret < 0) {
-		stopmachine_set_state(STOPMACHINE_EXIT);
-		return ret;
-	}
-
-	/* Now they are all started, make them hold the CPUs, ready. */
-	preempt_disable();
-	stopmachine_set_state(STOPMACHINE_PREPARE);
-
-	/* Make them disable irqs. */
-	local_irq_disable();
-	hard_irq_disable();
-	stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
-
-	return 0;
-}
+		if (state != curstate) {
+			curstate = state;
+			switch (curstate) {
+			case STOPMACHINE_DISABLE_IRQ:
+				local_irq_disable();
+				hard_irq_disable();
+				break;
+			case STOPMACHINE_RUN:
+				/* |= allows error detection if functions on
+				 * multiple CPUs. */
+				smdata->fnret |= smdata->fn(smdata->data);
+				break;
+			default:
+				break;
+			}
+			ack_state();
+		}
+	} while (curstate != STOPMACHINE_EXIT);
 
-static void restart_machine(void)
-{
-	stopmachine_set_state(STOPMACHINE_EXIT);
 	local_irq_enable();
-	preempt_enable_no_resched();
+	do_exit(0);
 }
 
-static void run_other_cpus(void)
+/* Callback for CPUs which aren't supposed to do anything. */
+static int chill(void *unused)
 {
-	stopmachine_set_state(STOPMACHINE_RUN);
+	return 0;
 }
 
-static int do_stop(void *_smdata)
+int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 {
-	struct stop_machine_data *smdata = _smdata;
-	int ret;
+	int i, err;
+	struct stop_machine_data active, idle;
+	struct task_struct **threads;
+
+	active.fn = fn;
+	active.data = data;
+	active.fnret = 0;
+	idle.fn = chill;
+	idle.data = NULL;
+
+	/* If they don't care which cpu fn runs on, just pick one. */
+	if (cpu == NR_CPUS)
+		cpu = any_online_cpu(cpu_online_map);
+
+	/* This could be too big for stack on large machines. */
+	threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
+	if (!threads)
+		return -ENOMEM;
+
+	/* Set up initial state. */
+	mutex_lock(&lock);
+	init_completion(&finished);
+	num_threads = num_online_cpus();
+	set_state(STOPMACHINE_PREPARE);
 
-	ret = stop_machine();
-	if (ret == 0) {
-		ret = smdata->fn(smdata->data);
-		if (smdata->run_all)
-			run_other_cpus();
-		restart_machine();
-	}
+	for_each_online_cpu(i) {
+		struct stop_machine_data *smdata;
+		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
-	/* We're done: you can kthread_stop us now */
-	complete(&smdata->done);
+		if (cpu == ALL_CPUS || i == cpu)
+			smdata = &active;
+		else
+			smdata = &idle;
+
+		threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
+					    i);
+		if (IS_ERR(threads[i])) {
+			err = PTR_ERR(threads[i]);
+			threads[i] = NULL;
+			goto kill_threads;
+		}
 
-	/* Wait for kthread_stop */
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-	__set_current_state(TASK_RUNNING);
-	return ret;
-}
+		/* Place it onto correct cpu. */
+		kthread_bind(threads[i], i);
 
-struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
-				       unsigned int cpu)
-{
-	static DEFINE_MUTEX(stopmachine_mutex);
-	struct stop_machine_data smdata;
-	struct task_struct *p;
+		/* Make it highest prio. */
+		if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
+			BUG();
+	}
 
-	mutex_lock(&stopmachine_mutex);
+	/* We've created all the threads.  Wake them all: hold this CPU so one
+	 * doesn't hit this CPU until we're ready. */
+	cpu = get_cpu();
+	for_each_online_cpu(i)
+		wake_up_process(threads[i]);
 
-	smdata.fn = fn;
-	smdata.data = data;
-	smdata.run_all = (cpu == ALL_CPUS) ? 1 : 0;
-	init_completion(&smdata.done);
+	/* This will release the thread on our CPU. */
+	put_cpu();
+	wait_for_completion(&finished);
+	mutex_unlock(&lock);
 
-	smp_wmb(); /* make sure other cpus see smdata updates */
+	kfree(threads);
 
-	/* If they don't care which CPU fn runs on, bind to any online one. */
-	if (cpu == NR_CPUS || cpu == ALL_CPUS)
-		cpu = raw_smp_processor_id();
+	return active.fnret;
 
-	p = kthread_create(do_stop, &smdata, "kstopmachine");
-	if (!IS_ERR(p)) {
-		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+kill_threads:
+	for_each_online_cpu(i)
+		if (threads[i])
+			kthread_stop(threads[i]);
+	mutex_unlock(&lock);
 
-		/* One high-prio thread per cpu.  We'll do this one. */
-		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
-		kthread_bind(p, cpu);
-		wake_up_process(p);
-		wait_for_completion(&smdata.done);
-	}
-	mutex_unlock(&stopmachine_mutex);
-	return p;
+	kfree(threads);
+	return err;
 }
 
 int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 {
-	struct task_struct *p;
 	int ret;
 
 	/* No CPUs can come up or down during this. */
 	get_online_cpus();
-	p = __stop_machine_run(fn, data, cpu);
-	if (!IS_ERR(p))
-		ret = kthread_stop(p);
-	else
-		ret = PTR_ERR(p);
+	ret = __stop_machine_run(fn, data, cpu);
 	put_online_cpus();
 
 	return ret;
-- 
cgit v1.2.3


From 04321587584272f4e8b9818f319f40caf8eeee13 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 28 Jul 2008 12:16:29 -0500
Subject: Hotplug CPU: don't check cpu_online after take_cpu_down

Akinobu points out that if take_cpu_down() succeeds, the cpu must be offline.
Remove the cpu_online() check, and put a BUG_ON().

Quoting Akinobu Mita:
   Actually the cpu_online() check was necessary before appling this
   stop_machine: simplify patch.

   With old __stop_machine_run(), __stop_machine_run() could succeed
   (return !IS_ERR(p) value) even if take_cpu_down() returned non-zero value.
   The return value of take_cpu_down() was obtained through kthread_stop()..

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Akinobu Mita" <akinobu.mita@gmail.com>
---
 kernel/cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index cf79bb911371..53cf508f975a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -250,8 +250,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	set_cpus_allowed_ptr(current, &tmp);
 
 	err = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
-
-	if (err || cpu_online(cpu)) {
+	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					    hcpu) == NOTIFY_BAD)
@@ -259,6 +258,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 		goto out_allowed;
 	}
+	BUG_ON(cpu_online(cpu));
 
 	/* Wait for it to sleep (leaving idle task). */
 	while (!idle_cpu(cpu))
-- 
cgit v1.2.3


From eeec4fad963490821348a331cca6102ae1c4a7a3 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 28 Jul 2008 12:16:30 -0500
Subject: stop_machine(): stop_machine_run() changed to use cpu mask

Instead of a "cpu" arg with magic values NR_CPUS (any cpu) and ~0 (all
cpus), pass a cpumask_t.  Allow NULL for the common case (where we
don't care which CPU the function is run on): temporary cpumask_t's
are usually considered bad for stack space.

This deprecates stop_machine_run, to be removed soon when all the
callers are dead.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/cpu.c          |  3 ++-
 kernel/stop_machine.c | 27 +++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53cf508f975a..29510d68338a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -248,8 +248,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	cpus_setall(tmp);
 	cpu_clear(cpu, tmp);
 	set_cpus_allowed_ptr(current, &tmp);
+	tmp = cpumask_of_cpu(cpu);
 
-	err = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
+	err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 35882dccc943..e446c7c7d6a9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -100,7 +100,7 @@ static int chill(void *unused)
 	return 0;
 }
 
-int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
+int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
 {
 	int i, err;
 	struct stop_machine_data active, idle;
@@ -112,10 +112,6 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 	idle.fn = chill;
 	idle.data = NULL;
 
-	/* If they don't care which cpu fn runs on, just pick one. */
-	if (cpu == NR_CPUS)
-		cpu = any_online_cpu(cpu_online_map);
-
 	/* This could be too big for stack on large machines. */
 	threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
 	if (!threads)
@@ -128,13 +124,16 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 	set_state(STOPMACHINE_PREPARE);
 
 	for_each_online_cpu(i) {
-		struct stop_machine_data *smdata;
+		struct stop_machine_data *smdata = &idle;
 		struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 
-		if (cpu == ALL_CPUS || i == cpu)
-			smdata = &active;
-		else
-			smdata = &idle;
+		if (!cpus) {
+			if (i == first_cpu(cpu_online_map))
+				smdata = &active;
+		} else {
+			if (cpu_isset(i, *cpus))
+				smdata = &active;
+		}
 
 		threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
 					    i);
@@ -154,7 +153,7 @@ int __stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 
 	/* We've created all the threads.  Wake them all: hold this CPU so one
 	 * doesn't hit this CPU until we're ready. */
-	cpu = get_cpu();
+	get_cpu();
 	for_each_online_cpu(i)
 		wake_up_process(threads[i]);
 
@@ -177,15 +176,15 @@ kill_threads:
 	return err;
 }
 
-int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
+int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
 {
 	int ret;
 
 	/* No CPUs can come up or down during this. */
 	get_online_cpus();
-	ret = __stop_machine_run(fn, data, cpu);
+	ret = __stop_machine(fn, data, cpus);
 	put_online_cpus();
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(stop_machine_run);
+EXPORT_SYMBOL_GPL(stop_machine);
-- 
cgit v1.2.3


From 9b1a4d38373a5581a4e01032a3ccdd94cd93477b Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 28 Jul 2008 12:16:30 -0500
Subject: stop_machine: Wean existing callers off stop_machine_run()

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c     | 8 ++++----
 kernel/rcuclassic.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index d861bd5b8c10..61d212120df4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -678,7 +678,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
 	if (flags & O_NONBLOCK) {
 		struct stopref sref = { mod, flags, forced };
 
-		return stop_machine_run(__try_stop_module, &sref, NR_CPUS);
+		return stop_machine(__try_stop_module, &sref, NULL);
 	} else {
 		/* We don't need to stop the machine for this. */
 		mod->state = MODULE_STATE_GOING;
@@ -1416,7 +1416,7 @@ static int __unlink_module(void *_mod)
 static void free_module(struct module *mod)
 {
 	/* Delete from various lists */
-	stop_machine_run(__unlink_module, mod, NR_CPUS);
+	stop_machine(__unlink_module, mod, NULL);
 	remove_notes_attrs(mod);
 	remove_sect_attrs(mod);
 	mod_kobject_remove(mod);
@@ -2197,7 +2197,7 @@ static struct module *load_module(void __user *umod,
 	/* Now sew it into the lists so we can get lockdep and oops
          * info during argument parsing.  Noone should access us, since
          * strong_try_module_get() will fail. */
-	stop_machine_run(__link_module, mod, NR_CPUS);
+	stop_machine(__link_module, mod, NULL);
 
 	/* Size of section 0 is 0, so this works well if no params */
 	err = parse_args(mod->name, mod->args,
@@ -2231,7 +2231,7 @@ static struct module *load_module(void __user *umod,
 	return mod;
 
  unlink:
-	stop_machine_run(__unlink_module, mod, NR_CPUS);
+	stop_machine(__unlink_module, mod, NULL);
 	module_arch_cleanup(mod);
  cleanup:
 	kobject_del(&mod->mkobj.kobj);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 6f8696c502f4..aad93cdc9f68 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -91,8 +91,8 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		 * rdp->cpu is the current cpu.
 		 *
 		 * cpu_online_map is updated by the _cpu_down()
-		 * using stop_machine_run(). Since we're in irqs disabled
-		 * section, stop_machine_run() is not exectuting, hence
+		 * using __stop_machine(). Since we're in irqs disabled
+		 * section, __stop_machine() is not exectuting, hence
 		 * the cpu_online_map is stable.
 		 *
 		 * However,  a cpu might have been offlined _just_ before
-- 
cgit v1.2.3


From 784e2d76007f90d69341b95967160c4fb7829299 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 28 Jul 2008 12:16:31 -0500
Subject: stop_machine: fix up ftrace.c

Simple conversion.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Abhishek Sagar <sagar.abhishek@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4231a3dc224a..f6e3af31b403 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -587,7 +587,7 @@ static int __ftrace_modify_code(void *data)
 
 static void ftrace_run_update_code(int command)
 {
-	stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
+	stop_machine(__ftrace_modify_code, &command, NULL);
 }
 
 void ftrace_disable_daemon(void)
@@ -787,7 +787,7 @@ static int ftrace_update_code(void)
 	    !ftrace_enabled || !ftraced_trigger)
 		return 0;
 
-	stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+	stop_machine(__ftrace_update_code, NULL, NULL);
 
 	return 1;
 }
@@ -1564,7 +1564,7 @@ static int __init ftrace_dynamic_init(void)
 
 	addr = (unsigned long)ftrace_record_ip;
 
-	stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+	stop_machine(ftrace_dyn_arch_init, &addr, NULL);
 
 	/* ftrace_dyn_arch_init places the return code in addr */
 	if (addr) {
-- 
cgit v1.2.3


From 157124c11f4217733691223ecf5ee47558ae9495 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Jul 2008 11:53:11 +0200
Subject: sched: fix warning in hrtick_start_fair()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Benjamin Herrenschmidt reported:

> I get that on ppc64 ...
>
> In file included from kernel/sched.c:1595:
> kernel/sched_fair.c: In function ‘hrtick_start_fair’:
> kernel/sched_fair.c:902: warning: comparison of distinct pointer types lacks a cast
>
> Probably harmless but annoying.

s64 delta = slice - ran;

-->	delta = max(10000LL, delta);

Probably ppc64's s64 is long vs long long..

I think hpa was looking at sanitizing all these 64bit types across the
architectures.

Use max_t with an explicit type meanwhile.

Reported-by: Benjamin Herrenschmid <benh@kernel.crashing.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cf2cd6ce4cb2..0fe94ea43f32 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -899,7 +899,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 		 * doesn't make sense. Rely on vruntime for fairness.
 		 */
 		if (rq->curr != p)
-			delta = max(10000LL, delta);
+			delta = max_t(s64, 10000LL, delta);
 
 		hrtick_start(rq, delta);
 	}
-- 
cgit v1.2.3


From 94f565598827e2015dce97f4c1ac4871ab84407b Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 27 Jul 2008 20:27:06 +0900
Subject: sched: fix SCHED_HRTICK dependency

Currently, it seems SCHED_HRTICK allowed for !SMP. But, it seems to have
no dependency of it. Fix it.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Kconfig.hz | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 382dd5a8b2d7..94fabd534b03 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS
+	def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
-- 
cgit v1.2.3


From e26873bb10f722f10b1af9de05119a3d7cbc07b2 Mon Sep 17 00:00:00 2001
From: roel kluin <roel.kluin@gmail.com>
Date: Tue, 22 Jul 2008 16:51:15 -0400
Subject: sched: test runtime rather than period in global_rt_runtime()

Test runtime rather than period

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addcb..0d1717b00225 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -834,7 +834,7 @@ static inline u64 global_rt_period(void)
 
 static inline u64 global_rt_runtime(void)
 {
-	if (sysctl_sched_rt_period < 0)
+	if (sysctl_sched_rt_runtime < 0)
 		return RUNTIME_INF;
 
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
-- 
cgit v1.2.3


From 2c3d103ba90827cfb478bf10464d9b5b9cea369c Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Fri, 25 Jul 2008 19:45:00 +0100
Subject: sched: move sched_clock before first use

Move sched_clock() up to stop warning: weak declaration of `sched_clock'
after first use results in unspecified behavior (if -fno-unit-at-a-time).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Ben Herrenschmidt <benh@kernel.crashing.org>
Cc: Linuxppc-dev@ozlabs.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 22ed55d1167f..5a2dc7d8fd98 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -32,6 +32,15 @@
 #include <linux/ktime.h>
 #include <linux/module.h>
 
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+}
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 
@@ -321,16 +330,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 #endif
 
-/*
- * Scheduler clock - returns current time in nanosec units.
- * This is default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
-{
-	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
-}
-
 unsigned long long cpu_clock(int cpu)
 {
 	unsigned long long clock;
-- 
cgit v1.2.3


From 0e241ffd306c0896bb9959be7faa4d4cfcb706d9 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 24 Jul 2008 16:58:42 -0700
Subject: locking: fix mutex @key parameter kernel-doc notation

Fix @key parameter to mutex_init() and one of its callers.

Warning(linux-2.6.26-git11//drivers/base/class.c:210): No description found for parameter 'key'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/mutex.c b/kernel/mutex.c
index bcdc9ac8ef60..12c779dc65d4 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -34,6 +34,7 @@
 /***
  * mutex_init - initialize the mutex
  * @lock: the mutex to be initialized
+ * @key: the lock_class_key for the class; used by mutex lock debugging
  *
  * Initialize the mutex to unlocked state.
  *
-- 
cgit v1.2.3


From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 28 Jul 2008 11:32:33 -0700
Subject: cpu masks: optimize and clean up cpumask_of_cpu()

Clean up and optimize cpumask_of_cpu(), by sharing all the zero words.

Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns
creating a huge array of constant bitmasks, realize that the zero words
can be shared.

In other words, on a 64-bit architecture, we only ever need 64 of these
arrays - with a different bit set in one single world (with enough zero
words around it so that we can create any bitmask by just offsetting in
that big array). And then we just put enough zeroes around it that we
can point every single cpumask to be one of those things.

So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each,
with one bit set in each array - 2MB memory total), we have exactly 64
arrays instead, each 8k bits in size (64kB total).

And then we just point cpumask(n) to the right position (which we can
calculate dynamically). Once we have the right arrays, getting
"cpumask(n)" ends up being:

  static inline const cpumask_t *get_cpu_mask(unsigned int cpu)
  {
          const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG];
          p -= cpu / BITS_PER_LONG;
          return (const cpumask_t *)p;
  }

This brings other advantages and simplifications as well:

 - we are not wasting memory that is just filled with a single bit in
   various different places

 - we don't need all those games to re-create the arrays in some dense
   format, because they're already going to be dense enough.

if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory
is a non-issue (especially since by doing this "overlapping" trick we
probably get better cache behaviour anyway).

[ mingo@elte.hu:

  Converted Linus's mails into a commit. See:

     http://lkml.org/lkml/2008/7/27/156
     http://lkml.org/lkml/2008/7/28/320

  Also applied a family filter - which also has the side-effect of leaving
  out the bits where Linus calls me an idio... Oh, never mind ;-)
]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Mike Travis <travis@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 128 ++++++++++-------------------------------------------------
 1 file changed, 20 insertions(+), 108 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index a35d8995dc8c..06a8358bb418 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,115 +462,27 @@ out:
 
 #endif /* CONFIG_SMP */
 
-/* 64 bits of zeros, for initializers. */
-#if BITS_PER_LONG == 32
-#define Z64 0, 0
-#else
-#define Z64 0
-#endif
+/*
+ * cpu_bit_bitmap[] is a special, "compressed" data structure that
+ * represents all NR_CPUS bits binary values of 1<<nr.
+ *
+ * It is used by cpumask_of_cpu() to get a constant address to a CPU
+ * mask value that has a single bit set only.
+ */
 
-/* Initializer macros. */
-#define CMI0(n) { .bits = { 1UL << (n) } }
-#define CMI(n, ...) { .bits = { __VA_ARGS__, 1UL << ((n) % BITS_PER_LONG) } }
-
-#define CMI8(n, ...)						\
-	CMI((n), __VA_ARGS__), CMI((n)+1, __VA_ARGS__),		\
-	CMI((n)+2, __VA_ARGS__), CMI((n)+3, __VA_ARGS__),	\
-	CMI((n)+4, __VA_ARGS__), CMI((n)+5, __VA_ARGS__),	\
-	CMI((n)+6, __VA_ARGS__), CMI((n)+7, __VA_ARGS__)
-
-#if BITS_PER_LONG == 32
-#define CMI64(n, ...)							\
-	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
-	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
-	CMI8((n)+32, 0, __VA_ARGS__), CMI8((n)+40, 0, __VA_ARGS__),	\
-	CMI8((n)+48, 0, __VA_ARGS__), CMI8((n)+56, 0, __VA_ARGS__)
-#else
-#define CMI64(n, ...)							\
-	CMI8((n), __VA_ARGS__), CMI8((n)+8, __VA_ARGS__),		\
-	CMI8((n)+16, __VA_ARGS__), CMI8((n)+24, __VA_ARGS__),		\
-	CMI8((n)+32, __VA_ARGS__), CMI8((n)+40, __VA_ARGS__),	\
-	CMI8((n)+48, __VA_ARGS__), CMI8((n)+56, __VA_ARGS__)
-#endif
+/* cpu_bit_bitmap[0] is empty - so we can back into it */
+#define MASK_DECLARE_1(x)	[x+1][0] = 1UL << (x)
+#define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
+#define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
+#define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
 
-#define CMI256(n, ...)							\
-	CMI64((n), __VA_ARGS__), CMI64((n)+64, Z64, __VA_ARGS__),	\
-	CMI64((n)+128, Z64, Z64, __VA_ARGS__),				\
-	CMI64((n)+192, Z64, Z64, Z64, __VA_ARGS__)
-#define Z256 Z64, Z64, Z64, Z64
-
-#define CMI1024(n, ...)					\
-	CMI256((n), __VA_ARGS__),			\
-	CMI256((n)+256, Z256, __VA_ARGS__),		\
-	CMI256((n)+512, Z256, Z256, __VA_ARGS__),	\
-	CMI256((n)+768, Z256, Z256, Z256, __VA_ARGS__)
-#define Z1024 Z256, Z256, Z256, Z256
-
-/* We want this statically initialized, just to be safe.  We try not
- * to waste too much space, either. */
-static const cpumask_t cpumask_map[]
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-__initdata
-#endif
-= {
-	CMI0(0), CMI0(1), CMI0(2), CMI0(3),
-#if NR_CPUS > 4
-	CMI0(4), CMI0(5), CMI0(6), CMI0(7),
-#endif
-#if NR_CPUS > 8
-	CMI0(8), CMI0(9), CMI0(10), CMI0(11),
-	CMI0(12), CMI0(13), CMI0(14), CMI0(15),
-#endif
-#if NR_CPUS > 16
-	CMI0(16), CMI0(17), CMI0(18), CMI0(19),
-	CMI0(20), CMI0(21), CMI0(22), CMI0(23),
-	CMI0(24), CMI0(25), CMI0(26), CMI0(27),
-	CMI0(28), CMI0(29), CMI0(30), CMI0(31),
-#endif
-#if NR_CPUS > 32
-#if BITS_PER_LONG == 32
-	CMI(32, 0), CMI(33, 0), CMI(34, 0), CMI(35, 0),
-	CMI(36, 0), CMI(37, 0), CMI(38, 0), CMI(39, 0),
-	CMI(40, 0), CMI(41, 0), CMI(42, 0), CMI(43, 0),
-	CMI(44, 0), CMI(45, 0), CMI(46, 0), CMI(47, 0),
-	CMI(48, 0), CMI(49, 0), CMI(50, 0), CMI(51, 0),
-	CMI(52, 0), CMI(53, 0), CMI(54, 0), CMI(55, 0),
-	CMI(56, 0), CMI(57, 0), CMI(58, 0), CMI(59, 0),
-	CMI(60, 0), CMI(61, 0), CMI(62, 0), CMI(63, 0),
-#else
-	CMI0(32), CMI0(33), CMI0(34), CMI0(35),
-	CMI0(36), CMI0(37), CMI0(38), CMI0(39),
-	CMI0(40), CMI0(41), CMI0(42), CMI0(43),
-	CMI0(44), CMI0(45), CMI0(46), CMI0(47),
-	CMI0(48), CMI0(49), CMI0(50), CMI0(51),
-	CMI0(52), CMI0(53), CMI0(54), CMI0(55),
-	CMI0(56), CMI0(57), CMI0(58), CMI0(59),
-	CMI0(60), CMI0(61), CMI0(62), CMI0(63),
-#endif /* BITS_PER_LONG == 64 */
-#endif
-#if NR_CPUS > 64
-	CMI64(64, Z64),
-#endif
-#if NR_CPUS > 128
-	CMI64(128, Z64, Z64), CMI64(192, Z64, Z64, Z64),
-#endif
-#if NR_CPUS > 256
-	CMI256(256, Z256),
-#endif
-#if NR_CPUS > 512
-	CMI256(512, Z256, Z256), CMI256(768, Z256, Z256, Z256),
-#endif
-#if NR_CPUS > 1024
-	CMI1024(1024, Z1024),
-#endif
-#if NR_CPUS > 2048
-	CMI1024(2048, Z1024, Z1024), CMI1024(3072, Z1024, Z1024, Z1024),
-#endif
-#if NR_CPUS > 4096
-#error NR_CPUS too big.  Fix initializers or set CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
+
+	MASK_DECLARE_8(0),	MASK_DECLARE_8(8),
+	MASK_DECLARE_8(16),	MASK_DECLARE_8(24),
+#if BITS_PER_LONG > 32
+	MASK_DECLARE_8(32),	MASK_DECLARE_8(40),
+	MASK_DECLARE_8(48),	MASK_DECLARE_8(56),
 #endif
 };
-
-const cpumask_t *cpumask_of_cpu_map = cpumask_map;
-
-EXPORT_SYMBOL_GPL(cpumask_of_cpu_map);
+EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
-- 
cgit v1.2.3


From cddb8a5c14aa89810b40495d94d3d2a0faee6619 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <andrea@qumranet.com>
Date: Mon, 28 Jul 2008 15:46:29 -0700
Subject: mmu-notifiers: core

With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages.
 There are secondary MMUs (with secondary sptes and secondary tlbs) too.
sptes in the kvm case are shadow pagetables, but when I say spte in
mmu-notifier context, I mean "secondary pte".  In GRU case there's no
actual secondary pte and there's only a secondary tlb because the GRU
secondary MMU has no knowledge about sptes and every secondary tlb miss
event in the MMU always generates a page fault that has to be resolved by
the CPU (this is not the case of KVM where the a secondary tlb miss will
walk sptes in hardware and it will refill the secondary tlb transparently
to software if the corresponding spte is present).  The same way
zap_page_range has to invalidate the pte before freeing the page, the spte
(and secondary tlb) must also be invalidated before any page is freed and
reused.

Currently we take a page_count pin on every page mapped by sptes, but that
means the pages can't be swapped whenever they're mapped by any spte
because they're part of the guest working set.  Furthermore a spte unmap
event can immediately lead to a page to be freed when the pin is released
(so requiring the same complex and relatively slow tlb_gather smp safe
logic we have in zap_page_range and that can be avoided completely if the
spte unmap event doesn't require an unpin of the page previously mapped in
the secondary MMU).

The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know
when the VM is swapping or freeing or doing anything on the primary MMU so
that the secondary MMU code can drop sptes before the pages are freed,
avoiding all page pinning and allowing 100% reliable swapping of guest
physical address space.  Furthermore it avoids the code that teardown the
mappings of the secondary MMU, to implement a logic like tlb_gather in
zap_page_range that would require many IPI to flush other cpu tlbs, for
each fixed number of spte unmapped.

To make an example: if what happens on the primary MMU is a protection
downgrade (from writeable to wrprotect) the secondary MMU mappings will be
invalidated, and the next secondary-mmu-page-fault will call
get_user_pages and trigger a do_wp_page through get_user_pages if it
called get_user_pages with write=1, and it'll re-establishing an updated
spte or secondary-tlb-mapping on the copied page.  Or it will setup a
readonly spte or readonly tlb mapping if it's a guest-read, if it calls
get_user_pages with write=0.  This is just an example.

This allows to map any page pointed by any pte (and in turn visible in the
primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an
full MMU with both sptes and secondary-tlb like the shadow-pagetable layer
with kvm), or a remote DMA in software like XPMEM (hence needing of
schedule in XPMEM code to send the invalidate to the remote node, while no
need to schedule in kvm/gru as it's an immediate event like invalidating
primary-mmu pte).

At least for KVM without this patch it's impossible to swap guests
reliably.  And having this feature and removing the page pin allows
several other optimizations that simplify life considerably.

Dependencies:

1) mm_take_all_locks() to register the mmu notifier when the whole VM
   isn't doing anything with "mm".  This allows mmu notifier users to keep
   track if the VM is in the middle of the invalidate_range_begin/end
   critical section with an atomic counter incraese in range_begin and
   decreased in range_end.  No secondary MMU page fault is allowed to map
   any spte or secondary tlb reference, while the VM is in the middle of
   range_begin/end as any page returned by get_user_pages in that critical
   section could later immediately be freed without any further
   ->invalidate_page notification (invalidate_range_begin/end works on
   ranges and ->invalidate_page isn't called immediately before freeing
   the page).  To stop all page freeing and pagetable overwrites the
   mmap_sem must be taken in write mode and all other anon_vma/i_mmap
   locks must be taken too.

2) It'd be a waste to add branches in the VM if nobody could possibly
   run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if
   CONFIG_KVM=m/y.  In the current kernel kvm won't yet take advantage of
   mmu notifiers, but this already allows to compile a KVM external module
   against a kernel with mmu notifiers enabled and from the next pull from
   kvm.git we'll start using them.  And GRU/XPMEM will also be able to
   continue the development by enabling KVM=m in their config, until they
   submit all GRU/XPMEM GPLv2 code to the mainline kernel.  Then they can
   also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n).
   This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM
   are all =n.

The mmu_notifier_register call can fail because mm_take_all_locks may be
interrupted by a signal and return -EINTR.  Because mmu_notifier_reigster
is used when a driver startup, a failure can be gracefully handled.  Here
an example of the change applied to kvm to register the mmu notifiers.
Usually when a driver startups other allocations are required anyway and
-ENOMEM failure paths exists already.

 struct  kvm *kvm_arch_create_vm(void)
 {
        struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+       int err;

        if (!kvm)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);

+       kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops;
+       err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm);
+       if (err) {
+               kfree(kvm);
+               return ERR_PTR(err);
+       }
+
        return kvm;
 }

mmu_notifier_unregister returns void and it's reliable.

The patch also adds a few needed but missing includes that would prevent
kernel to compile after these changes on non-x86 archs (x86 didn't need
them by luck).

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix mm/filemap_xip.c build]
[akpm@linux-foundation.org: fix mm/mmu_notifier.c build]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Kanoj Sarcar <kanojsarcar@yahoo.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Chris Wright <chrisw@redhat.com>
Cc: Marcelo Tosatti <marcelo@kvack.org>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Izik Eidus <izike@qumranet.com>
Cc: Anthony Liguori <aliguori@us.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8214ba7c8bb1..7ce2ebe84796 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,6 +27,7 @@
 #include <linux/key.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
+#include <linux/mmu_notifier.h>
 #include <linux/fs.h>
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
@@ -414,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+		mmu_notifier_mm_init(mm);
 		return mm;
 	}
 
@@ -446,6 +448,7 @@ void __mmdrop(struct mm_struct *mm)
 	BUG_ON(mm == &init_mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
+	mmu_notifier_mm_destroy(mm);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
-- 
cgit v1.2.3


From 1a4e564b7db999fbe5d88318c96ac8747699d417 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 29 Jul 2008 22:32:57 -0700
Subject: resource: add resource_size()

Avoid one-off errors by introducing a resource_size() function.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 74af2d7cb5a1..f5b518eabefe 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -490,7 +490,7 @@ resource_size_t resource_alignment(struct resource *res)
 {
 	switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
 	case IORESOURCE_SIZEALIGN:
-		return res->end - res->start + 1;
+		return resource_size(res);
 	case IORESOURCE_STARTALIGN:
 		return res->start;
 	default:
-- 
cgit v1.2.3


From 5a3eb9f6b7c598529f832b8baa6458ab1cbab2c6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Jul 2008 22:33:18 -0700
Subject: cgroup: fix possible memory leak

There's a leak if copy_from_user() returns failure.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 657f8f8d93a5..28debe4e1488 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1424,14 +1424,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 		if (buffer == NULL)
 			return -ENOMEM;
 	}
-	if (nbytes && copy_from_user(buffer, userbuf, nbytes))
-		return -EFAULT;
+	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
+		retval = -EFAULT;
+		goto out;
+	}
 
 	buffer[nbytes] = 0;     /* nul-terminate */
 	strstrip(buffer);
 	retval = cft->write_string(cgrp, cft, buffer);
 	if (!retval)
 		retval = nbytes;
+out:
 	if (buffer != local_buffer)
 		kfree(buffer);
 	return retval;
-- 
cgit v1.2.3


From 36553434f475a84b653e25e74490ee8df43b86d5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Jul 2008 22:33:19 -0700
Subject: cgroup: remove duplicate code in allocate_cg_link()

- just call free_cg_links() in allocate_cg_links()
- the list will get initialized in allocate_cg_links(), so don't init
  it twice

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 28debe4e1488..249a517591b8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set(
 	return NULL;
 }
 
+static void free_cg_links(struct list_head *tmp)
+{
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
+
+	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
+		list_del(&link->cgrp_link_list);
+		kfree(link);
+	}
+}
+
 /*
  * allocate_cg_links() allocates "count" cg_cgroup_link structures
  * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
@@ -363,17 +374,12 @@ static struct css_set *find_existing_css_set(
 static int allocate_cg_links(int count, struct list_head *tmp)
 {
 	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
 	int i;
 	INIT_LIST_HEAD(tmp);
 	for (i = 0; i < count; i++) {
 		link = kmalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			list_for_each_entry_safe(link, saved_link, tmp,
-						 cgrp_link_list) {
-				list_del(&link->cgrp_link_list);
-				kfree(link);
-			}
+			free_cg_links(tmp);
 			return -ENOMEM;
 		}
 		list_add(&link->cgrp_link_list, tmp);
@@ -381,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp)
 	return 0;
 }
 
-static void free_cg_links(struct list_head *tmp)
-{
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
-
-	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-		list_del(&link->cgrp_link_list);
-		kfree(link);
-	}
-}
-
 /*
  * find_css_set() takes an existing cgroup group and a
  * cgroup object, and returns a css_set object that's
@@ -956,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	struct super_block *sb;
 	struct cgroupfs_root *root;
 	struct list_head tmp_cg_links;
-	INIT_LIST_HEAD(&tmp_cg_links);
 
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
-- 
cgit v1.2.3


From 55b6fd0162ace1e0f1b52c8c092565c115127ef6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Jul 2008 22:33:20 -0700
Subject: cgroup: uninline cgroup_has_css_refs()

It's not small enough, and has 2 call sites.

 text    data     bss     dec     hex filename
12813    1676    4832   19321    4b79 cgroup.o.orig
12775    1676    4832   19283    4b53 cgroup.o

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 249a517591b8..13932abde159 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2368,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
-static inline int cgroup_has_css_refs(struct cgroup *cgrp)
+static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
 	/* Check the reference count on each subsystem. Since we
 	 * already established that there are no tasks in the
-- 
cgit v1.2.3


From 8d1e6266f512b3a94ef6d33528ff385f1aea0392 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Jul 2008 22:33:21 -0700
Subject: cpuset: a bit cleanup for scan_for_empty_cpusets()

clean up hierarchy traversal code

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Cliff Wickman <cpw@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 91cf85b36dd5..3624dc0e95ed 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1833,24 +1833,21 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  */
 static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
-	struct list_head queue;
 	struct cgroup *cont;
 	nodemask_t oldmems;
 
-	INIT_LIST_HEAD(&queue);
-
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
 	while (!list_empty(&queue)) {
-		cp = container_of(queue.next, struct cpuset, stack_list);
+		cp = list_first_entry(&queue, struct cpuset, stack_list);
 		list_del(queue.next);
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
 			list_add_tail(&child->stack_list, &queue);
 		}
-		cont = cp->css.cgroup;
 
 		/* Continue past cpusets with all cpus, mems online */
 		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
-- 
cgit v1.2.3


From f5393693e96393131a4a2e2743f883986d508503 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 29 Jul 2008 22:33:22 -0700
Subject: cpuset: speed up sched domain partition

All child cpusets contain a subset of the parent's cpus, so we can skip
them when partitioning sched domains. This decreases 'csa' greately for
cpusets with multi-level hierarchy.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 41 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3624dc0e95ed..c818c36b0c5f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -486,13 +486,38 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 static void
 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 {
-	if (!dattr)
-		return;
 	if (dattr->relax_domain_level < c->relax_domain_level)
 		dattr->relax_domain_level = c->relax_domain_level;
 	return;
 }
 
+static void
+update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+	LIST_HEAD(q);
+
+	list_add(&c->stack_list, &q);
+	while (!list_empty(&q)) {
+		struct cpuset *cp;
+		struct cgroup *cont;
+		struct cpuset *child;
+
+		cp = list_first_entry(&q, struct cpuset, stack_list);
+		list_del(q.next);
+
+		if (cpus_empty(cp->cpus_allowed))
+			continue;
+
+		if (is_sched_load_balance(cp))
+			update_domain_attr(dattr, cp);
+
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			list_add_tail(&child->stack_list, &q);
+		}
+	}
+}
+
 /*
  * rebuild_sched_domains()
  *
@@ -614,8 +639,16 @@ void rebuild_sched_domains(void)
 		if (cpus_empty(cp->cpus_allowed))
 			continue;
 
-		if (is_sched_load_balance(cp))
+		/*
+		 * All child cpusets contain a subset of the parent's cpus, so
+		 * just skip them, and then we call update_domain_attr_tree()
+		 * to calc relax_domain_level of the corresponding sched
+		 * domain.
+		 */
+		if (is_sched_load_balance(cp)) {
 			csa[csn++] = cp;
+			continue;
+		}
 
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
@@ -686,7 +719,7 @@ restart:
 					cpus_or(*dp, *dp, b->cpus_allowed);
 					b->pn = -1;
 					if (dattr)
-						update_domain_attr(dattr
+						update_domain_attr_tree(dattr
 								   + nslot, b);
 				}
 			}
-- 
cgit v1.2.3


From 93a6557558a13f9ff35213efeca483f353c39dd3 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Jul 2008 22:33:23 -0700
Subject: cpuset: fix wrong calculation of relax domain level

When multiple cpusets are overlapping in their 'cpus' and hence they
form a single sched domain, the largest sched_relax_domain_level among
those should be used. But when top_cpuset's sched_load_balance is
set, its sched_relax_domain_level is used regardless other sub-cpusets'.

This patch fixes it by walking the cpuset hierarchy to find the largest
sched_relax_domain_level.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c818c36b0c5f..7a82e9033a7f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -616,7 +616,7 @@ void rebuild_sched_domains(void)
 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 		if (dattr) {
 			*dattr = SD_ATTR_INIT;
-			update_domain_attr(dattr, &top_cpuset);
+			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		*doms = top_cpuset.cpus_allowed;
 		goto rebuild;
-- 
cgit v1.2.3


From aeed682421a5ebfbf46940e30c3d1caf3bc64304 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 29 Jul 2008 22:33:24 -0700
Subject: cpuset: clean up cpuset hierarchy traversal code

Use cpuset.stack_list rather than kfifo, so we avoid memory allocation
for kfifo.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7a82e9033a7f..d5ab79cf516d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -54,7 +54,6 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 #include <linux/mutex.h>
-#include <linux/kfifo.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 
@@ -557,7 +556,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  * So the reverse nesting would risk an ABBA deadlock.
  *
  * The three key local variables below are:
- *    q  - a kfifo queue of cpuset pointers, used to implement a
+ *    q  - a linked-list queue of cpuset pointers, used to implement a
  *	   top-down scan of all cpusets.  This scan loads a pointer
  *	   to each cpuset marked is_sched_load_balance into the
  *	   array 'csa'.  For our purposes, rebuilding the schedulers
@@ -592,7 +591,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 
 void rebuild_sched_domains(void)
 {
-	struct kfifo *q;	/* queue of cpusets to be scanned */
+	LIST_HEAD(q);		/* queue of cpusets to be scanned*/
 	struct cpuset *cp;	/* scans q */
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
@@ -602,7 +601,6 @@ void rebuild_sched_domains(void)
 	int ndoms;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] cpumask_t slot */
 
-	q = NULL;
 	csa = NULL;
 	doms = NULL;
 	dattr = NULL;
@@ -622,20 +620,19 @@ void rebuild_sched_domains(void)
 		goto rebuild;
 	}
 
-	q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
-	if (IS_ERR(q))
-		goto done;
 	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
 	if (!csa)
 		goto done;
 	csn = 0;
 
-	cp = &top_cpuset;
-	__kfifo_put(q, (void *)&cp, sizeof(cp));
-	while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
+	list_add(&top_cpuset.stack_list, &q);
+	while (!list_empty(&q)) {
 		struct cgroup *cont;
 		struct cpuset *child;   /* scans child cpusets of cp */
 
+		cp = list_first_entry(&q, struct cpuset, stack_list);
+		list_del(q.next);
+
 		if (cpus_empty(cp->cpus_allowed))
 			continue;
 
@@ -652,7 +649,7 @@ void rebuild_sched_domains(void)
 
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
-			__kfifo_put(q, (void *)&child, sizeof(cp));
+			list_add_tail(&child->stack_list, &q);
 		}
   	}
 
@@ -735,8 +732,6 @@ rebuild:
 	put_online_cpus();
 
 done:
-	if (q && !IS_ERR(q))
-		kfifo_free(q);
 	kfree(csa);
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 	/* Don't kfree(dattr) -- partition_sched_domains() does that. */
-- 
cgit v1.2.3


From 5def9a3a22e09c99717f41ab7f07ec9e1a1f3ec8 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Tue, 29 Jul 2008 22:33:31 -0700
Subject: markers: fix markers read barrier for multiple probes

Paul pointed out two incorrect read barriers in the marker handler code in
the path where multiple probes are connected.  Those are ordering reads of
"ptype" (single or multi probe marker), "multi" array pointer, and "multi"
array data access.

It should be ordered like this :

read ptype
smp_rmb()
read multi array pointer
smp_read_barrier_depends()
access data referenced by multi array pointer

The code with a single probe connected (optimized case, does not have to
allocate an array) has correct memory ordering.

It applies to kernel 2.6.26.x, 2.6.25.x and linux-next.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/marker.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 971da5317903..7d1faecd7a51 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -125,6 +125,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 	} else {
 		struct marker_probe_closure *multi;
 		int i;
+		/*
+		 * Read mdata->ptype before mdata->multi.
+		 */
+		smp_rmb();
+		multi = mdata->multi;
 		/*
 		 * multi points to an array, therefore accessing the array
 		 * depends on reading multi. However, even in this case,
@@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 		 * in the fast path, so put the explicit barrier here.
 		 */
 		smp_read_barrier_depends();
-		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++) {
 			va_start(args, call_private);
 			multi[i].func(multi[i].probe_private, call_private,
@@ -174,6 +178,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 	} else {
 		struct marker_probe_closure *multi;
 		int i;
+		/*
+		 * Read mdata->ptype before mdata->multi.
+		 */
+		smp_rmb();
+		multi = mdata->multi;
 		/*
 		 * multi points to an array, therefore accessing the array
 		 * depends on reading multi. However, even in this case,
@@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 		 * in the fast path, so put the explicit barrier here.
 		 */
 		smp_read_barrier_depends();
-		multi = mdata->multi;
 		for (i = 0; multi[i].func; i++)
 			multi[i].func(multi[i].probe_private, call_private,
 				mdata->format, &args);
-- 
cgit v1.2.3


From 641de9d8f505db055d451b50e6e38117f84e79bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <Uwe.Kleine-Koenig@digi.com>
Date: Tue, 29 Jul 2008 22:33:38 -0700
Subject: printk: fix comment for printk ratelimiting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The comment assumed the burst to be one and the ratelimit used to be named
printk_ratelimit_jiffies.

Signed-off-by: Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
Cc: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index a7f7559c5f6c..b51b1567bb55 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1309,14 +1309,14 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 
 #if defined CONFIG_PRINTK
 
-DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
 /*
  * printk rate limiting, lifted from the networking subsystem.
  *
- * This enforces a rate limit: not more than one kernel message
- * every printk_ratelimit_jiffies to make a denial-of-service
- * attack impossible.
+ * This enforces a rate limit: not more than 10 kernel messages
+ * every 5s to make a denial-of-service attack impossible.
  */
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
+
 int printk_ratelimit(void)
 {
 	return __ratelimit(&printk_ratelimit_state);
-- 
cgit v1.2.3


From 6af8bf3d86d55c98af6e453cb920ddc30867e5c7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 29 Jul 2008 22:33:49 -0700
Subject: workqueues: add comments to __create_workqueue_key()

Dmitry Adamushko pointed out that the error handling in
__create_workqueue_key() is not clear, add the comment.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ec7e4f62aaff..4a26a1382df0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		start_workqueue_thread(cwq, -1);
 	} else {
 		cpu_maps_update_begin();
+		/*
+		 * We must place this wq on list even if the code below fails.
+		 * cpu_down(cpu) can remove cpu from cpu_populated_map before
+		 * destroy_workqueue() takes the lock, in that case we leak
+		 * cwq[cpu]->thread.
+		 */
 		spin_lock(&workqueue_lock);
 		list_add(&wq->list, &workqueues);
 		spin_unlock(&workqueue_lock);
-
+		/*
+		 * We must initialize cwqs for each possible cpu even if we
+		 * are going to call destroy_workqueue() finally. Otherwise
+		 * cpu_up() can hit the uninitialized cwq once we drop the
+		 * lock.
+		 */
 		for_each_possible_cpu(cpu) {
 			cwq = init_cpu_workqueue(wq, cpu);
 			if (err || !cpu_online(cpu))
-- 
cgit v1.2.3


From f718cd4add5aea9d379faff92f162571e356cc5f Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 29 Jul 2008 22:33:52 -0700
Subject: sched: make scheduler sysfs attributes sysdev class devices

They are really class devices, but were incorrectly declared.  This
leads to crashes with the recent changes that makes non normal sysdevs
use a different prototype.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Pierre Ossman <drzeus-list@drzeus.cx>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addcb..21f7da94662e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7671,34 +7671,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 }
 
 #ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *page)
+static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+					   char *page)
 {
 	return sprintf(page, "%u\n", sched_mc_power_savings);
 }
-static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
-					    struct sysdev_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
 					    const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 0);
 }
-static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
-		   sched_mc_power_savings_store);
+static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
+			 sched_mc_power_savings_show,
+			 sched_mc_power_savings_store);
 #endif
 
 #ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
-				struct sysdev_attribute *attr, char *page)
+static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+					    char *page)
 {
 	return sprintf(page, "%u\n", sched_smt_power_savings);
 }
-static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
-					     struct sysdev_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
 					     const char *buf, size_t count)
 {
 	return sched_power_savings_store(buf, count, 1);
 }
-static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+		   sched_smt_power_savings_show,
 		   sched_smt_power_savings_store);
 #endif
 
-- 
cgit v1.2.3


From e4e4e534faa3c2be4e165ce414f44b76ada7208c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 14 Apr 2008 08:50:02 +0200
Subject: sched clock: revert various sched_clock() changes

Found an interactivity problem on a quad core test-system - simple
CPU loops would occasionally delay the system un an unacceptable way.

After much debugging with Peter Zijlstra it turned out that the problem
is caused by the string of sched_clock() changes - they caused the CPU
clock to jump backwards a bit - which confuses the scheduler arithmetics.

(which is unsigned for performance reasons)

So revert:

 # c300ba2: sched_clock: and multiplier for TSC to gtod drift
 # c0c8773: sched_clock: only update deltas with local reads.
 # af52a90: sched_clock: stop maximum check on NO HZ
 # f7cce27: sched_clock: widen the max and min time

This solves the interactivity problems.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched_clock.c     | 109 ++++++-----------------------------------------
 kernel/time/tick-sched.c |   2 -
 2 files changed, 13 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5a2dc7d8fd98..9a7844158ae8 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -44,11 +44,6 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 
-#define MULTI_SHIFT 15
-/* Max is double, Min is 1/2 */
-#define MAX_MULTI (2LL << MULTI_SHIFT)
-#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
-
 struct sched_clock_data {
 	/*
 	 * Raw spinlock - this is a special case: this might be called
@@ -62,10 +57,6 @@ struct sched_clock_data {
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
-	s64			multi;
-#ifdef CONFIG_NO_HZ
-	int			check_max;
-#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -97,53 +88,18 @@ void sched_clock_init(void)
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
-		scd->multi = 1 << MULTI_SHIFT;
-#ifdef CONFIG_NO_HZ
-		scd->check_max = 1;
-#endif
 	}
 
 	sched_clock_running = 1;
 }
 
-#ifdef CONFIG_NO_HZ
-/*
- * The dynamic ticks makes the delta jiffies inaccurate. This
- * prevents us from checking the maximum time update.
- * Disable the maximum check during stopped ticks.
- */
-void sched_clock_tick_stop(int cpu)
-{
-	struct sched_clock_data *scd = cpu_sdc(cpu);
-
-	scd->check_max = 0;
-}
-
-void sched_clock_tick_start(int cpu)
-{
-	struct sched_clock_data *scd = cpu_sdc(cpu);
-
-	scd->check_max = 1;
-}
-
-static int check_max(struct sched_clock_data *scd)
-{
-	return scd->check_max;
-}
-#else
-static int check_max(struct sched_clock_data *scd)
-{
-	return 1;
-}
-#endif /* CONFIG_NO_HZ */
-
 /*
  * update the percpu scd from the raw @now value
  *
  *  - filter out backward motion
  *  - use jiffies to generate a min,max window to clip the raw values
  */
-static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
+static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 {
 	unsigned long now_jiffies = jiffies;
 	long delta_jiffies = now_jiffies - scd->tick_jiffies;
@@ -152,31 +108,16 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim
 	s64 delta = now - scd->prev_raw;
 
 	WARN_ON_ONCE(!irqs_disabled());
-
-	/*
-	 * At schedule tick the clock can be just under the gtod. We don't
-	 * want to push it too prematurely.
-	 */
-	min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
-	if (min_clock > TICK_NSEC)
-		min_clock -= TICK_NSEC / 2;
+	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
 
 	if (unlikely(delta < 0)) {
 		clock++;
 		goto out;
 	}
 
-	/*
-	 * The clock must stay within a jiffie of the gtod.
-	 * But since we may be at the start of a jiffy or the end of one
-	 * we add another jiffy buffer.
-	 */
-	max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
-
-	delta *= scd->multi;
-	delta >>= MULTI_SHIFT;
+	max_clock = min_clock + TICK_NSEC;
 
-	if (unlikely(clock + delta > max_clock) && check_max(scd)) {
+	if (unlikely(clock + delta > max_clock)) {
 		if (clock < max_clock)
 			clock = max_clock;
 		else
@@ -189,12 +130,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim
 	if (unlikely(clock < min_clock))
 		clock = min_clock;
 
-	if (time)
-		*time = clock;
-	else {
-		scd->prev_raw = now;
-		scd->clock = clock;
-	}
+	scd->prev_raw = now;
+	scd->tick_jiffies = now_jiffies;
+	scd->clock = clock;
 }
 
 static void lock_double_clock(struct sched_clock_data *data1,
@@ -238,26 +176,21 @@ u64 sched_clock_cpu(int cpu)
 		now -= scd->tick_gtod;
 
 		__raw_spin_unlock(&my_scd->lock);
-
-		__update_sched_clock(scd, now, &clock);
-
-		__raw_spin_unlock(&scd->lock);
-
 	} else {
 		__raw_spin_lock(&scd->lock);
-		__update_sched_clock(scd, now, NULL);
-		clock = scd->clock;
-		__raw_spin_unlock(&scd->lock);
 	}
 
+	__update_sched_clock(scd, now);
+	clock = scd->clock;
+
+	__raw_spin_unlock(&scd->lock);
+
 	return clock;
 }
 
 void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd = this_scd();
-	unsigned long now_jiffies = jiffies;
-	s64 mult, delta_gtod, delta_raw;
 	u64 now, now_gtod;
 
 	if (unlikely(!sched_clock_running))
@@ -269,29 +202,14 @@ void sched_clock_tick(void)
 	now = sched_clock();
 
 	__raw_spin_lock(&scd->lock);
-	__update_sched_clock(scd, now, NULL);
+	__update_sched_clock(scd, now);
 	/*
 	 * update tick_gtod after __update_sched_clock() because that will
 	 * already observe 1 new jiffy; adding a new tick_gtod to that would
 	 * increase the clock 2 jiffies.
 	 */
-	delta_gtod = now_gtod - scd->tick_gtod;
-	delta_raw = now - scd->tick_raw;
-
-	if ((long)delta_raw > 0) {
-		mult = delta_gtod << MULTI_SHIFT;
-		do_div(mult, delta_raw);
-		scd->multi = mult;
-		if (scd->multi > MAX_MULTI)
-			scd->multi = MAX_MULTI;
-		else if (scd->multi < MIN_MULTI)
-			scd->multi = MIN_MULTI;
-	} else
-		scd->multi = 1 << MULTI_SHIFT;
-
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
-	scd->tick_jiffies = now_jiffies;
 	__raw_spin_unlock(&scd->lock);
 }
 
@@ -321,7 +239,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	__raw_spin_lock(&scd->lock);
 	scd->prev_raw = now;
 	scd->clock += delta_ns;
-	scd->multi = 1 << MULTI_SHIFT;
 	__raw_spin_unlock(&scd->lock);
 
 	touch_softlockup_watchdog();
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 825b4c00fe44..f5da526424a9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -289,7 +289,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
 			rcu_enter_nohz();
-			sched_clock_tick_stop(cpu);
 		}
 
 		/*
@@ -392,7 +391,6 @@ void tick_nohz_restart_sched_tick(void)
 	select_nohz_load_balancer(0);
 	now = ktime_get();
 	tick_do_update_jiffies64(now);
-	sched_clock_tick_start(cpu);
 	cpu_clear(cpu, nohz_cpu_mask);
 
 	/*
-- 
cgit v1.2.3


From 50526968e99afbca34924abcb04658b6dd5c5ea5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 30 Jul 2008 09:39:48 +0200
Subject: sched clock: clean up sched_clock_cpu()

- simplify the remote clock rebasing

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched_clock.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 9a7844158ae8..b96559cb96a5 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -169,11 +169,8 @@ u64 sched_clock_cpu(int cpu)
 
 		lock_double_clock(scd, my_scd);
 
-		now -= my_scd->tick_raw;
-		now += scd->tick_raw;
-
-		now += my_scd->tick_gtod;
-		now -= scd->tick_gtod;
+		now += scd->tick_raw - my_scd->tick_raw;
+		now += my_scd->tick_gtod - scd->tick_gtod;
 
 		__raw_spin_unlock(&my_scd->lock);
 	} else {
-- 
cgit v1.2.3


From 18e4e36c66d6edbdefc639692206cdf01e468713 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 30 Jul 2008 10:13:35 +0200
Subject: sched: eliminate scd->prev_raw

eliminate prev_raw and use tick_raw instead.

It's enough to base the current time on the scheduler tick timestamp
alone - the monotonicity and maximum checks will prevent any damage.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched_clock.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index b96559cb96a5..4b8474c966dc 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -53,7 +53,6 @@ struct sched_clock_data {
 	raw_spinlock_t		lock;
 
 	unsigned long		tick_jiffies;
-	u64			prev_raw;
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
@@ -84,7 +83,6 @@ void sched_clock_init(void)
 
 		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		scd->tick_jiffies = now_jiffies;
-		scd->prev_raw = 0;
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
@@ -105,7 +103,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	long delta_jiffies = now_jiffies - scd->tick_jiffies;
 	u64 clock = scd->clock;
 	u64 min_clock, max_clock;
-	s64 delta = now - scd->prev_raw;
+	s64 delta = now - scd->tick_raw;
 
 	WARN_ON_ONCE(!irqs_disabled());
 	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
@@ -130,7 +128,6 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	if (unlikely(clock < min_clock))
 		clock = min_clock;
 
-	scd->prev_raw = now;
 	scd->tick_jiffies = now_jiffies;
 	scd->clock = clock;
 }
@@ -234,7 +231,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	 * rq clock:
 	 */
 	__raw_spin_lock(&scd->lock);
-	scd->prev_raw = now;
 	scd->clock += delta_ns;
 	__raw_spin_unlock(&scd->lock);
 
-- 
cgit v1.2.3


From 56b906126d33904d4d67615d0d5b95dbdf1f27ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 30 Jul 2008 10:15:55 +0200
Subject: sched clock: simplify __update_sched_clock()

- return the current clock instead of letting callers
  fetch it from scd->clock

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched_clock.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 4b8474c966dc..857a1291fd23 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -97,7 +97,7 @@ void sched_clock_init(void)
  *  - filter out backward motion
  *  - use jiffies to generate a min,max window to clip the raw values
  */
-static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
+static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 {
 	unsigned long now_jiffies = jiffies;
 	long delta_jiffies = now_jiffies - scd->tick_jiffies;
@@ -130,6 +130,8 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
 
 	scd->tick_jiffies = now_jiffies;
 	scd->clock = clock;
+
+	return clock;
 }
 
 static void lock_double_clock(struct sched_clock_data *data1,
@@ -174,8 +176,7 @@ u64 sched_clock_cpu(int cpu)
 		__raw_spin_lock(&scd->lock);
 	}
 
-	__update_sched_clock(scd, now);
-	clock = scd->clock;
+	clock = __update_sched_clock(scd, now);
 
 	__raw_spin_unlock(&scd->lock);
 
-- 
cgit v1.2.3


From 4a273f209cc95d148f79b4c96d3d03997b44ffda Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 30 Jul 2008 10:22:07 +0200
Subject: sched clock: couple local and remote clocks

When taking the time of a remote CPU, use the opportunity to
couple (sync) the clocks to each other. (in a monotonic way)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched_clock.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 857a1291fd23..074edc989379 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -149,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1,
 u64 sched_clock_cpu(int cpu)
 {
 	struct sched_clock_data *scd = cpu_sdc(cpu);
-	u64 now, clock;
+	u64 now, clock, this_clock, remote_clock;
 
 	if (unlikely(!sched_clock_running))
 		return 0ull;
@@ -158,26 +158,36 @@ u64 sched_clock_cpu(int cpu)
 	now = sched_clock();
 
 	if (cpu != raw_smp_processor_id()) {
-		/*
-		 * in order to update a remote cpu's clock based on our
-		 * unstable raw time rebase it against:
-		 *   tick_raw		(offset between raw counters)
-		 *   tick_gotd          (tick offset between cpus)
-		 */
 		struct sched_clock_data *my_scd = this_scd();
 
 		lock_double_clock(scd, my_scd);
 
-		now += scd->tick_raw - my_scd->tick_raw;
-		now += my_scd->tick_gtod - scd->tick_gtod;
+		this_clock = __update_sched_clock(my_scd, now);
+		remote_clock = scd->clock;
+
+		/*
+		 * Use the opportunity that we have both locks
+		 * taken to couple the two clocks: we take the
+		 * larger time as the latest time for both
+		 * runqueues. (this creates monotonic movement)
+		 */
+		if (likely(remote_clock < this_clock)) {
+			clock = this_clock;
+			scd->clock = clock;
+		} else {
+			/*
+			 * Should be rare, but possible:
+			 */
+			clock = remote_clock;
+			my_scd->clock = remote_clock;
+		}
 
 		__raw_spin_unlock(&my_scd->lock);
 	} else {
 		__raw_spin_lock(&scd->lock);
+		clock = __update_sched_clock(scd, now);
 	}
 
-	clock = __update_sched_clock(scd, now);
-
 	__raw_spin_unlock(&scd->lock);
 
 	return clock;
@@ -223,7 +233,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
 void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 	struct sched_clock_data *scd = this_scd();
-	u64 now = sched_clock();
 
 	/*
 	 * Override the previous timestamp and ignore all
-- 
cgit v1.2.3


From 419ca3f13532793b81aff09f80c60af3eacbb43d Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 29 Jul 2008 21:45:03 -0700
Subject: lockdep: fix combinatorial explosion in lock subgraph traversal

When we traverse the graph, either forwards or backwards, we
are interested in whether a certain property exists somewhere
in a node reachable in the graph.

Therefore it is never necessary to traverse through a node more
than once to get a correct answer to the given query.

Take advantage of this property using a global ID counter so that we
need not clear all the markers in all the lock_class entries before
doing a traversal.  A new ID is choosen when we start to traverse, and
we continue through a lock_class only if it's ID hasn't been marked
with the new value yet.

This short-circuiting is essential especially for high CPU count
systems.  The scheduler has a runqueue per cpu, and needs to take
two runqueue locks at a time, which leads to long chains of
backwards and forwards subgraphs from these runqueue lock nodes.
Without the short-circuit implemented here, a graph traversal on
a runqueue lock can take up to (1 << (N - 1)) checks on a system
with N cpus.

For anything more than 16 cpus or so, lockdep will eventually bring
the machine to a complete standstill.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c           | 86 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/lockdep_internals.h |  3 ++
 kernel/lockdep_proc.c      | 34 ++----------------
 3 files changed, 92 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d38a64362973..6999e64fc248 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -372,6 +372,19 @@ unsigned int nr_process_chains;
 unsigned int max_lockdep_depth;
 unsigned int max_recursion_depth;
 
+static unsigned int lockdep_dependency_gen_id;
+
+static bool lockdep_dependency_visit(struct lock_class *source,
+				     unsigned int depth)
+{
+	if (!depth)
+		lockdep_dependency_gen_id++;
+	if (source->dep_gen_id == lockdep_dependency_gen_id)
+		return true;
+	source->dep_gen_id = lockdep_dependency_gen_id;
+	return false;
+}
+
 #ifdef CONFIG_DEBUG_LOCKDEP
 /*
  * We cannot printk in early bootup code. Not even early_printk()
@@ -558,6 +571,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
 {
 	struct lock_list *entry;
 
+	if (lockdep_dependency_visit(class, depth))
+		return;
+
 	if (DEBUG_LOCKS_WARN_ON(depth >= 20))
 		return;
 
@@ -959,6 +975,67 @@ static int noinline print_infinite_recursion_bug(void)
 	return 0;
 }
 
+unsigned long __lockdep_count_forward_deps(struct lock_class *class,
+					   unsigned int depth)
+{
+	struct lock_list *entry;
+	unsigned long ret = 1;
+
+	if (lockdep_dependency_visit(class, depth))
+		return 0;
+
+	/*
+	 * Recurse this class's dependency list:
+	 */
+	list_for_each_entry(entry, &class->locks_after, entry)
+		ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+
+	return ret;
+}
+
+unsigned long lockdep_count_forward_deps(struct lock_class *class)
+{
+	unsigned long ret, flags;
+
+	local_irq_save(flags);
+	__raw_spin_lock(&lockdep_lock);
+	ret = __lockdep_count_forward_deps(class, 0);
+	__raw_spin_unlock(&lockdep_lock);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+unsigned long __lockdep_count_backward_deps(struct lock_class *class,
+					    unsigned int depth)
+{
+	struct lock_list *entry;
+	unsigned long ret = 1;
+
+	if (lockdep_dependency_visit(class, depth))
+		return 0;
+	/*
+	 * Recurse this class's dependency list:
+	 */
+	list_for_each_entry(entry, &class->locks_before, entry)
+		ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+
+	return ret;
+}
+
+unsigned long lockdep_count_backward_deps(struct lock_class *class)
+{
+	unsigned long ret, flags;
+
+	local_irq_save(flags);
+	__raw_spin_lock(&lockdep_lock);
+	ret = __lockdep_count_backward_deps(class, 0);
+	__raw_spin_unlock(&lockdep_lock);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
 /*
  * Prove that the dependency graph starting at <entry> can not
  * lead to <target>. Print an error and return 0 if it does.
@@ -968,6 +1045,9 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 {
 	struct lock_list *entry;
 
+	if (lockdep_dependency_visit(source, depth))
+		return 1;
+
 	debug_atomic_inc(&nr_cyclic_check_recursions);
 	if (depth > max_recursion_depth)
 		max_recursion_depth = depth;
@@ -1011,6 +1091,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
 	struct lock_list *entry;
 	int ret;
 
+	if (lockdep_dependency_visit(source, depth))
+		return 1;
+
 	if (depth > max_recursion_depth)
 		max_recursion_depth = depth;
 	if (depth >= RECURSION_LIMIT)
@@ -1050,6 +1133,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
 	struct lock_list *entry;
 	int ret;
 
+	if (lockdep_dependency_visit(source, depth))
+		return 1;
+
 	if (!__raw_spin_is_locked(&lockdep_lock))
 		return DEBUG_LOCKS_WARN_ON(1);
 
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index c3600a091a28..68d44ec77ab5 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -53,6 +53,9 @@ extern unsigned int nr_process_chains;
 extern unsigned int max_lockdep_depth;
 extern unsigned int max_recursion_depth;
 
+extern unsigned long lockdep_count_forward_deps(struct lock_class *);
+extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+
 #ifdef CONFIG_DEBUG_LOCKDEP
 /*
  * Various lockdep statistics:
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 9b0e940e2545..6252ff799d19 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v)
 {
 }
 
-static unsigned long count_forward_deps(struct lock_class *class)
-{
-	struct lock_list *entry;
-	unsigned long ret = 1;
-
-	/*
-	 * Recurse this class's dependency list:
-	 */
-	list_for_each_entry(entry, &class->locks_after, entry)
-		ret += count_forward_deps(entry->class);
-
-	return ret;
-}
-
-static unsigned long count_backward_deps(struct lock_class *class)
-{
-	struct lock_list *entry;
-	unsigned long ret = 1;
-
-	/*
-	 * Recurse this class's dependency list:
-	 */
-	list_for_each_entry(entry, &class->locks_before, entry)
-		ret += count_backward_deps(entry->class);
-
-	return ret;
-}
-
 static void print_name(struct seq_file *m, struct lock_class *class)
 {
 	char str[128];
@@ -124,10 +96,10 @@ static int l_show(struct seq_file *m, void *v)
 #ifdef CONFIG_DEBUG_LOCKDEP
 	seq_printf(m, " OPS:%8ld", class->ops);
 #endif
-	nr_forward_deps = count_forward_deps(class);
+	nr_forward_deps = lockdep_count_forward_deps(class);
 	seq_printf(m, " FD:%5ld", nr_forward_deps);
 
-	nr_backward_deps = count_backward_deps(class);
+	nr_backward_deps = lockdep_count_backward_deps(class);
 	seq_printf(m, " BD:%5ld", nr_backward_deps);
 
 	get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -350,7 +322,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
 			nr_hardirq_read_unsafe++;
 
-		sum_forward_deps += count_forward_deps(class);
+		sum_forward_deps += lockdep_count_forward_deps(class);
 	}
 #ifdef CONFIG_DEBUG_LOCKDEP
 	DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
-- 
cgit v1.2.3


From 5e710e37bde120bb069f691bee68e69ef4393173 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 30 Jul 2008 13:26:57 +0200
Subject: lockdep: change scheduler annotation

While thinking about David's graph walk lockdep patch it _finally_
dawned on me that there is no reason we have a lock class per cpu ...

Sorry for being dense :-/

The below changes the annotation from a lock class per cpu, to a single
nested lock, as the scheduler never holds more that 2 rq locks at a time
anyway.

If there was code requiring holding all rq locks this would not work and
the original annotation would be the only option, but that not being the
case, this is a much lighter one.

Compiles and boots on a 2-way x86_64.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addcb..655f1db26b12 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -600,7 +600,6 @@ struct rq {
 	/* BKL stats */
 	unsigned int bkl_count;
 #endif
-	struct lock_class_key rq_lock_key;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -2759,10 +2758,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
 	} else {
 		if (rq1 < rq2) {
 			spin_lock(&rq1->lock);
-			spin_lock(&rq2->lock);
+			spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
 		} else {
 			spin_lock(&rq2->lock);
-			spin_lock(&rq1->lock);
+			spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
 		}
 	}
 	update_rq_clock(rq1);
@@ -2805,10 +2804,10 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
 			spin_lock(&busiest->lock);
-			spin_lock(&this_rq->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
 			ret = 1;
 		} else
-			spin_lock(&busiest->lock);
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
 	}
 	return ret;
 }
@@ -7998,7 +7997,6 @@ void __init sched_init(void)
 
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
-		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
-- 
cgit v1.2.3


From a9b60bf4c29e07a5a2f26a6f74937972fee9b58b Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Fri, 1 Aug 2008 08:39:34 -0500
Subject: kgdb: fix kgdb_validate_break_address to perform a mem write

A regression to the kgdb core was found in the case of using the
CONFIG_DEBUG_RODATA kernel option.  When this option is on, a breakpoint
cannot be written into any readonly memory page.  When an external
debugger requests a breakpoint to get set, the
kgdb_validate_break_address() was only checking to see if the address
to place the breakpoint was readable and lacked a write check.

This patch changes the validate routine to try reading (via the
breakpoint set request) and also to try immediately writing the break
point.  If either fails, an error is correctly returned and the
debugger behaves correctly.  Then an end user can make the
descision to use hardware breakpoints.

Also update the documentation to reflect that using
CONFIG_DEBUG_RODATA will inhibit the use of software breakpoints.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/kgdb.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 3ec23c3ec97f..c0d45b2c4d79 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -166,13 +166,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
  * Weak aliases for breakpoint management,
  * can be overriden by architectures when needed:
  */
-int __weak kgdb_validate_break_address(unsigned long addr)
-{
-	char tmp_variable[BREAK_INSTR_SIZE];
-
-	return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE);
-}
-
 int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
 {
 	int err;
@@ -191,6 +184,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
 				  (char *)bundle, BREAK_INSTR_SIZE);
 }
 
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+	char tmp_variable[BREAK_INSTR_SIZE];
+	int err;
+	/* Validate setting the breakpoint and then removing it.  In the
+	 * remove fails, the kernel needs to emit a bad message because we
+	 * are deep trouble not being able to put things back the way we
+	 * found them.
+	 */
+	err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+	if (err)
+		return err;
+	err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+	if (err)
+		printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
+		   "memory destroyed at: %lx", addr);
+	return err;
+}
+
 unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
 {
 	return instruction_pointer(regs);
-- 
cgit v1.2.3


From 25fc999913839a45cbb48ac7872e67f7521e7ed9 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Fri, 1 Aug 2008 08:39:35 -0500
Subject: kgdb: fix gdb serial thread queries

The command "info threads" did not work correctly with kgdb.  It would
result in a silent kernel hang if used.

This patach addresses several problems.
 - Fix use of deprecated NR_CPUS
 - Fix kgdb to not walk linearly through the pid space
 - Correctly implement shadow pids
 - Change the threads per query to a #define
 - Fix kgdb_hex2long to work with negated values

The threads 0 and -1 are reserved to represent the current task.  That
means that CPU 0 will start with a shadow thread id of -2, and CPU 1
will have a shadow thread id of -3, etc...

From the debugger you can switch to a shadow thread to see what one of
the other cpus was doing, however it is not possible to execute run
control operations on any other cpu execept the cpu executing the
kgdb_handle_exception().

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/kgdb.c | 68 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index c0d45b2c4d79..eaa21fc9ad1d 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -56,12 +56,14 @@
 
 static int kgdb_break_asap;
 
+#define KGDB_MAX_THREAD_QUERY 17
 struct kgdb_state {
 	int			ex_vector;
 	int			signo;
 	int			err_code;
 	int			cpu;
 	int			pass_exception;
+	unsigned long		thr_query;
 	unsigned long		threadid;
 	long			kgdb_usethreadid;
 	struct pt_regs		*linux_regs;
@@ -445,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
 {
 	int hex_val;
 	int num = 0;
+	int negate = 0;
 
 	*long_val = 0;
 
+	if (**ptr == '-') {
+		negate = 1;
+		(*ptr)++;
+	}
 	while (**ptr) {
 		hex_val = hex(**ptr);
 		if (hex_val < 0)
@@ -458,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
 		(*ptr)++;
 	}
 
+	if (negate)
+		*long_val = -*long_val;
+
 	return num;
 }
 
@@ -527,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value)
 static struct task_struct *getthread(struct pt_regs *regs, int tid)
 {
 	/*
-	 * Non-positive TIDs are remapped idle tasks:
+	 * Non-positive TIDs are remapped to the cpu shadow information
 	 */
-	if (tid <= 0)
-		return idle_task(-tid);
+	if (tid == 0 || tid == -1)
+		tid = -atomic_read(&kgdb_active) - 2;
+	if (tid < 0) {
+		if (kgdb_info[-tid - 2].task)
+			return kgdb_info[-tid - 2].task;
+		else
+			return idle_task(-tid - 2);
+	}
 
 	/*
 	 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -737,14 +753,15 @@ setundefined:
 }
 
 /*
- * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs:
+ * Remap normal tasks to their real PID,
+ * CPU shadow threads are mapped to -CPU - 2
  */
 static inline int shadow_pid(int realpid)
 {
 	if (realpid)
 		return realpid;
 
-	return -1-raw_smp_processor_id();
+	return -raw_smp_processor_id() - 2;
 }
 
 static char gdbmsgbuf[BUFMAX + 1];
@@ -838,7 +855,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
 		local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
 	} else {
 		local_debuggerinfo = NULL;
-		for (i = 0; i < NR_CPUS; i++) {
+		for_each_online_cpu(i) {
 			/*
 			 * Try to find the task on some other
 			 * or possibly this node if we do not
@@ -972,10 +989,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks)
 /* Handle the 'q' query packets */
 static void gdb_cmd_query(struct kgdb_state *ks)
 {
-	struct task_struct *thread;
+	struct task_struct *g;
+	struct task_struct *p;
 	unsigned char thref[8];
 	char *ptr;
 	int i;
+	int cpu;
+	int finished = 0;
 
 	switch (remcom_in_buffer[1]) {
 	case 's':
@@ -985,22 +1005,34 @@ static void gdb_cmd_query(struct kgdb_state *ks)
 			break;
 		}
 
-		if (remcom_in_buffer[1] == 'f')
-			ks->threadid = 1;
-
+		i = 0;
 		remcom_out_buffer[0] = 'm';
 		ptr = remcom_out_buffer + 1;
-
-		for (i = 0; i < 17; ks->threadid++) {
-			thread = getthread(ks->linux_regs, ks->threadid);
-			if (thread) {
-				int_to_threadref(thref, ks->threadid);
+		if (remcom_in_buffer[1] == 'f') {
+			/* Each cpu is a shadow thread */
+			for_each_online_cpu(cpu) {
+				ks->thr_query = 0;
+				int_to_threadref(thref, -cpu - 2);
 				pack_threadid(ptr, thref);
 				ptr += BUF_THREAD_ID_SIZE;
 				*(ptr++) = ',';
 				i++;
 			}
 		}
+
+		do_each_thread(g, p) {
+			if (i >= ks->thr_query && !finished) {
+				int_to_threadref(thref, p->pid);
+				pack_threadid(ptr, thref);
+				ptr += BUF_THREAD_ID_SIZE;
+				*(ptr++) = ',';
+				ks->thr_query++;
+				if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
+					finished = 1;
+			}
+			i++;
+		} while_each_thread(g, p);
+
 		*(--ptr) = '\0';
 		break;
 
@@ -1023,15 +1055,15 @@ static void gdb_cmd_query(struct kgdb_state *ks)
 			error_packet(remcom_out_buffer, -EINVAL);
 			break;
 		}
-		if (ks->threadid > 0) {
+		if ((int)ks->threadid > 0) {
 			kgdb_mem2hex(getthread(ks->linux_regs,
 					ks->threadid)->comm,
 					remcom_out_buffer, 16);
 		} else {
 			static char tmpstr[23 + BUF_THREAD_ID_SIZE];
 
-			sprintf(tmpstr, "Shadow task %d for pid 0",
-					(int)(-ks->threadid-1));
+			sprintf(tmpstr, "shadowCPU%d",
+					(int)(-ks->threadid - 2));
 			kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
 		}
 		break;
-- 
cgit v1.2.3


From ee1d315663ee0b494898f813a266d6244b263b4f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 7 Jul 2008 10:49:45 -0400
Subject: [PATCH] Audit: Collect signal info when SIGUSR2 is sent to auditd

Makes the kernel audit subsystem collect information about the sending
process when that process sends SIGUSR2 to the userspace audit daemon.
SIGUSR2 is a new interesting signal to auditd telling auditd that it
should try to start logging to disk again and the error condition which
caused it to stop logging to disk (usually out of space) has been
rectified.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4699950e65bd..580a5389fd99 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2375,7 +2375,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	struct audit_context *ctx = tsk->audit_context;
 
 	if (audit_pid && t->tgid == audit_pid) {
-		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
+		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
 			audit_sig_pid = tsk->pid;
 			if (tsk->loginuid != -1)
 				audit_sig_uid = tsk->loginuid;
-- 
cgit v1.2.3


From 1d6c9649e236caa2e93e3647256216e57172b011 Mon Sep 17 00:00:00 2001
From: Vesa-Matti J Kari <vmkari@cc.helsinki.fi>
Date: Wed, 23 Jul 2008 00:06:13 +0300
Subject: kernel/audit.c control character detection is off-by-one

Hello,

According to my understanding there is an off-by-one bug in the
function:

   audit_string_contains_control()

in:

  kernel/audit.c

Patch is included.

I do not know from how many places the function is called from, but for
example, SELinux Access Vector Cache tries to log untrusted filenames via
call path:

avc_audit()
     audit_log_untrustedstring()
         audit_log_n_untrustedstring()
             audit_string_contains_control()

If audit_string_contains_control() detects control characters, then the
string is hex-encoded. But the hex=0x7f dec=127, DEL-character, is not
detected.

I guess this could have at least some minor security implications, since a
user can create a filename with 0x7f in it, causing logged filename to
possibly look different when someone reads it on the terminal.

Signed-off-by: Vesa-Matti Kari <vmkari@cc.helsinki.fi>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index e092f1c0ce30..6d903182c6b7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1366,7 +1366,7 @@ int audit_string_contains_control(const char *string, size_t len)
 {
 	const unsigned char *p;
 	for (p = string; p < (const unsigned char *)string + len && *p; p++) {
-		if (*p == '"' || *p < 0x21 || *p > 0x7f)
+		if (*p == '"' || *p < 0x21 || *p > 0x7e)
 			return 1;
 	}
 	return 0;
-- 
cgit v1.2.3


From 036bbf76ad9f83781590623111b80ba0b82930ac Mon Sep 17 00:00:00 2001
From: zhangxiliang <zhangxiliang@cn.fujitsu.com>
Date: Fri, 1 Aug 2008 09:47:01 +0800
Subject: Re: [PATCH] the loginuid field should be output in all
 AUDIT_CONFIG_CHANGE audit messages

> shouldn't these be using the "audit_get_loginuid(current)"  and if we
> are going to output loginuid we also should be outputting sessionid

Thanks for your detailed explanation.
I have made a new patch for outputing "loginuid" and "sessionid" by audit_get_loginuid(current) and audit_get_sessionid(current).
If there are some deficiencies, please give me your indication.

Signed-off-by: Zhang Xiliang <zhangxiliang@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditfilter.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 98c50cc671bb..b7d354e2b0ef 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent,
 			struct audit_buffer *ab;
 			ab = audit_log_start(NULL, GFP_KERNEL,
 				AUDIT_CONFIG_CHANGE);
+			audit_log_format(ab, "auid=%u ses=%u",
+				audit_get_loginuid(current),
+				audit_get_sessionid(current));
 			audit_log_format(ab,
-				"op=updated rules specifying path=");
+				" op=updated rules specifying path=");
 			audit_log_untrustedstring(ab, owatch->path);
 			audit_log_format(ab, " with dev=%u ino=%lu\n",
 				 dev, ino);
@@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 				struct audit_buffer *ab;
 				ab = audit_log_start(NULL, GFP_KERNEL,
 					AUDIT_CONFIG_CHANGE);
-				audit_log_format(ab, "op=remove rule path=");
+				audit_log_format(ab, "auid=%u ses=%u",
+					audit_get_loginuid(current),
+					audit_get_sessionid(current));
+				audit_log_format(ab, " op=remove rule path=");
 				audit_log_untrustedstring(ab, w->path);
 				if (r->filterkey) {
 					audit_log_format(ab, " key=");
-- 
cgit v1.2.3


From 980dfb0db340b95094732d78b55311f2c539c1af Mon Sep 17 00:00:00 2001
From: zhangxiliang <zhangxiliang@cn.fujitsu.com>
Date: Fri, 1 Aug 2008 19:15:47 +0800
Subject: [PATCH] Fix the kernel panic of audit_filter_task when key field is
 set

When calling audit_filter_task(), it calls audit_filter_rules() with audit_context is NULL.
If the key field is set, the result in audit_filter_rules() will be set to 1 and
ctx->filterkey will be set to key.
But the ctx is NULL in this condition, so kernel will panic.

Signed-off-by: Zhang Xiliang <zhangxiliang@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 580a5389fd99..496c3dd37276 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -610,7 +610,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		if (!result)
 			return 0;
 	}
-	if (rule->filterkey)
+	if (rule->filterkey && ctx)
 		ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
 	switch (rule->action) {
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
-- 
cgit v1.2.3


From 20c6aaa39ab735c7ed78e4e5a214d250efae0a6e Mon Sep 17 00:00:00 2001
From: zhangxiliang <zhangxiliang@cn.fujitsu.com>
Date: Thu, 31 Jul 2008 10:11:19 +0800
Subject: [PATCH] Fix the bug of using AUDIT_STATUS_RATE_LIMIT when set fail,
 no error output.

When the "status_get->mask" is "AUDIT_STATUS_RATE_LIMIT || AUDIT_STATUS_BACKLOG_LIMIT".
If "audit_set_rate_limit" fails and "audit_set_backlog_limit" succeeds, the "err" value
will be greater than or equal to 0. It will miss the failure of rate set.

Signed-off-by: Zhang Xiliang <zhangxiliang@cn.fujitsu.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 6d903182c6b7..4414e93d8750 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_ENABLED) {
 			err = audit_set_enabled(status_get->enabled,
 						loginuid, sessionid, sid);
-			if (err < 0) return err;
+			if (err < 0)
+				return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_FAILURE) {
 			err = audit_set_failure(status_get->failure,
 						loginuid, sessionid, sid);
-			if (err < 0) return err;
+			if (err < 0)
+				return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int new_pid = status_get->pid;
@@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_pid = new_pid;
 			audit_nlk_pid = NETLINK_CB(skb).pid;
 		}
-		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
 			err = audit_set_rate_limit(status_get->rate_limit,
 						   loginuid, sessionid, sid);
+			if (err < 0)
+				return err;
+		}
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
 			err = audit_set_backlog_limit(status_get->backlog_limit,
 						      loginuid, sessionid, sid);
-- 
cgit v1.2.3


From 5c7edcd7ee6b77b88252fe4096dce1a46a60c829 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Thu, 31 Jul 2008 02:04:09 -0700
Subject: tracehook: fix exit_signal=0 case

My commit 2b2a1ff64afbadac842bbc58c5166962cf4f7664 introduced a regression
(sorry about that) for the odd case of exit_signal=0 (e.g. clone_flags=0).
This is not a normal use, but it's used by a case in the glibc test suite.

Dying with exit_signal=0 sends no signal, but it's supposed to wake up a
parent's blocked wait*() calls (unlike the delayed_group_leader case).
This fixes tracehook_notify_death() and its caller to distinguish a
"signal 0" wakeup from the delayed_group_leader case (with no wakeup).

Signed-off-by: Roland McGrath <roland@redhat.com>
Tested-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index eb4d6470d1d0..38ec40630149 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -911,10 +911,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		tsk->exit_signal = SIGCHLD;
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-	if (signal > 0)
+	if (signal >= 0)
 		signal = do_notify_parent(tsk, signal);
 
-	tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE;
+	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
 
 	/* mt-exec, de_thread() is waiting for us */
 	if (thread_group_leader(tsk) &&
@@ -927,7 +927,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	tracehook_report_death(tsk, signal, cookie, group_dead);
 
 	/* If the process is dead, release it - nobody will wait for it */
-	if (signal < 0)
+	if (signal == DEATH_REAP)
 		release_task(tsk);
 }
 
-- 
cgit v1.2.3


From 1a61c88defcd611bd148d6c960b498e1b8bbbe00 Mon Sep 17 00:00:00 2001
From: zhangxiliang <zhangxiliang@cn.fujitsu.com>
Date: Sat, 2 Aug 2008 10:56:37 +0800
Subject: Re: [PATCH] Fix the kernel panic of audit_filter_task when key field
 is set

Sorry, I miss a blank between if and "(".
And I add "unlikely" to check "ctx" in audit_match_perm() and audit_match_filetype().
This is a new patch for it.

Signed-off-by: Zhang Xiliang <zhangxiliang@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 496c3dd37276..972f8e61d36a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,6 +243,9 @@ static inline int open_arg(int flags, int mask)
 
 static int audit_match_perm(struct audit_context *ctx, int mask)
 {
+	if (unlikely(!ctx))
+		return 0;
+
 	unsigned n = ctx->major;
 	switch (audit_classify_syscall(ctx->arch, n)) {
 	case 0:	/* native */
@@ -284,6 +287,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
 {
 	unsigned index = which & ~S_IFMT;
 	mode_t mode = which & S_IFMT;
+
+	if (unlikely(!ctx))
+		return 0;
+
 	if (index >= ctx->name_count)
 		return 0;
 	if (ctx->names[index].ino == -1)
-- 
cgit v1.2.3


From 725aad24c3ba96a7c06448c14c265a466cdbd663 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Sun, 3 Aug 2008 09:33:03 -0700
Subject: __sched_setscheduler: don't do any policy checks when not "user"

The "user" parameter to __sched_setscheduler indicates whether the
change is being done on behalf of a user process or not.  If not, we
shouldn't apply any permissions checks, so don't call
security_task_setscheduler().

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Tested-by: Steve Wise <swise@opengridcomputing.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 21f7da94662e..04160d277e7a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5004,19 +5004,21 @@ recheck:
 			return -EPERM;
 	}
 
+	if (user) {
 #ifdef CONFIG_RT_GROUP_SCHED
-	/*
-	 * Do not allow realtime tasks into groups that have no runtime
-	 * assigned.
-	 */
-	if (user
-	    && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
-		return -EPERM;
+		/*
+		 * Do not allow realtime tasks into groups that have no runtime
+		 * assigned.
+		 */
+		if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+			return -EPERM;
 #endif
 
-	retval = security_task_setscheduler(p, policy, param);
-	if (retval)
-		return retval;
+		retval = security_task_setscheduler(p, policy, param);
+		if (retval)
+			return retval;
+	}
+
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
-- 
cgit v1.2.3


From 32194450330be327f3b25bf6b66298bd122599e9 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 5 Aug 2008 13:01:10 -0700
Subject: relay: fix "full buffer with exactly full last subbuffer" accounting
 problem

In relay's current read implementation, if the buffer is completely full
but hasn't triggered the buffer-full condition (i.e. the last write
didn't cross the subbuffer boundary) and the last subbuffer is exactly
full, the subbuffer accounting code erroneously finds nothing available.
This patch fixes the problem.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org>
Cc: Andrea Righi <righi.andrea@gmail.com>
Cc: <stable@kernel.org>		[2.6.25.x, 2.6.26.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 04006ef970b8..8d13a7855c08 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -944,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf,
 	size_t n_subbufs = buf->chan->n_subbufs;
 	size_t read_subbuf;
 
+	if (buf->subbufs_produced == buf->subbufs_consumed &&
+	    buf->offset == buf->bytes_consumed)
+		return;
+
 	if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
 		relay_subbufs_consumed(buf->chan, buf->cpu, 1);
 		buf->bytes_consumed = 0;
@@ -975,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
 
 	relay_file_read_consume(buf, read_pos, 0);
 
+	consumed = buf->subbufs_consumed;
+
 	if (unlikely(buf->offset > subbuf_size)) {
 		if (produced == consumed)
 			return 0;
@@ -993,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
 	if (consumed > produced)
 		produced += n_subbufs * subbuf_size;
 
-	if (consumed == produced)
+	if (consumed == produced) {
+		if (buf->offset == subbuf_size &&
+		    buf->subbufs_produced > buf->subbufs_consumed)
+			return 1;
 		return 0;
+	}
 
 	return 1;
 }
-- 
cgit v1.2.3


From 5b2becc8cffdccdd60c63099f592ddd35aa6c34f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 5 Aug 2008 13:01:13 -0700
Subject: semaphore: __down_common: use signal_pending_state()

Change __down_common() to use signal_pending_state() instead of open
coding.

The changes in kernel/semaphore.o are just artifacts, the state checks are
optimized away.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/semaphore.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index aaaeae8244e7..94a62c0d4ade 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
 	waiter.up = 0;
 
 	for (;;) {
-		if (state == TASK_INTERRUPTIBLE && signal_pending(task))
-			goto interrupted;
-		if (state == TASK_KILLABLE && fatal_signal_pending(task))
+		if (signal_pending_state(state, task))
 			goto interrupted;
 		if (timeout <= 0)
 			goto timed_out;
-- 
cgit v1.2.3


From c69ad71bcdecbaab82cfacb1dc967bd7fd967a3b Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Tue, 5 Aug 2008 13:01:14 -0700
Subject: genirq: better warning on irqchip->set_type() failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While I'm glad to finally see the hole fixed whereby passing an invalid
IRQ trigger type to request_irq() would be ignored, the current diagnostic
isn't quite useful.  Fixed by also listing the trigger type which was
rejected.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/manage.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 152abfd3589f..0314074fa232 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -323,7 +323,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
 	ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
 
 	if (ret)
-		pr_err("setting flow type for irq %u failed (%pF)\n",
+		pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
+				(int)(flags & IRQF_TRIGGER_MASK),
 				irq, chip->set_type);
 
 	return ret;
-- 
cgit v1.2.3


From d2dc1f4adb4b5b02d87e49e115e5107f4da790c0 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 5 Aug 2008 13:01:31 -0700
Subject: dma: fix order calculation in dma_mark_declared_memory_occupied()

get_order() takes byte-sized input, not a page-granular one.

Irrespective of this fix I'm inclined to believe that this doesn't work
right anyway - bitmap_allocate_region() has an implicit assumption of
'pos' being suitable for 'order', which this function doesn't seem to
enforce (and since it's being called with a byte-granular value there's no
reason to believe that the callers would make sure device_addr is passed
accordingly - it's also not documented that way).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Dmitry Baryshkov <dbaryshkov@gmail.com>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/dma-coherent.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 7517115a8cce..91e96950cd52 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -77,15 +77,14 @@ void *dma_mark_declared_memory_occupied(struct device *dev,
 {
 	struct dma_coherent_mem *mem = dev->dma_mem;
 	int pos, err;
-	int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
 
-	pages >>= PAGE_SHIFT;
+	size += device_addr & ~PAGE_MASK;
 
 	if (!mem)
 		return ERR_PTR(-EINVAL);
 
 	pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
-	err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
+	err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
 	if (err != 0)
 		return ERR_PTR(err);
 	return mem->virt_base + (pos << PAGE_SHIFT);
-- 
cgit v1.2.3


From bf1db69fbf4ff511e88736ce2e6318846f34492b Mon Sep 17 00:00:00 2001
From: Richard Hughes <richard@hughsie.com>
Date: Tue, 5 Aug 2008 13:01:35 -0700
Subject: pm_qos: spelling fixes

A documentation cleanup patch.  With a minor tweak to clarify units for
kbs.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: mark gross <mgross@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pm_qos_params.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 8cb757026386..da9c2dda6a4e 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -24,7 +24,7 @@
  * requirement that the application has is cleaned up when closes the file
  * pointer or exits the pm_qos_object will get an opportunity to clean up.
  *
- * mark gross mgross@linux.intel.com
+ * Mark Gross <mgross@linux.intel.com>
  */
 
 #include <linux/pm_qos_params.h>
@@ -211,8 +211,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement);
  * @value: defines the qos request
  *
  * This function inserts a new entry in the pm_qos_class list of requested qos
- * performance charactoistics.  It recomputes the agregate QoS expectations for
- * the pm_qos_class of parrameters.
+ * performance characteristics.  It recomputes the aggregate QoS expectations
+ * for the pm_qos_class of parameters.
  */
 int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
 {
@@ -250,10 +250,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
  * @name: identifies the request
  * @value: defines the qos request
  *
- * Updates an existing qos requierement for the pm_qos_class of parameters along
+ * Updates an existing qos requirement for the pm_qos_class of parameters along
  * with updating the target pm_qos_class value.
  *
- * If the named request isn't in the lest then no change is made.
+ * If the named request isn't in the list then no change is made.
  */
 int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
 {
@@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
  * @pm_qos_class: identifies which list of qos request to us
  * @name: identifies the request
  *
- * Will remove named qos request from pm_qos_class list of parrameters and
+ * Will remove named qos request from pm_qos_class list of parameters and
  * recompute the current target value for the pm_qos_class.
  */
 void pm_qos_remove_requirement(int pm_qos_class, char *name)
@@ -319,7 +319,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
  * @notifier: notifier block managed by caller.
  *
  * will register the notifier into a notification chain that gets called
- * uppon changes to the pm_qos_class target value.
+ * upon changes to the pm_qos_class target value.
  */
  int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
 {
@@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
  * @notifier: notifier block to be removed.
  *
  * will remove the notifier from the notification chain that gets called
- * uppon changes to the pm_qos_class target value.
+ * upon changes to the pm_qos_class target value.
  */
 int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
 {
-- 
cgit v1.2.3


From cb3952bf7853667a1cb3515e67f27e67f0fce9e8 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Wed, 30 Jul 2008 14:46:50 +0400
Subject: DMA: make dma-coherent.c documentation kdoc-friendly

Spotted by Randy.

Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 kernel/dma-coherent.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 91e96950cd52..c1d4d5b4c61c 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -92,7 +92,7 @@ void *dma_mark_declared_memory_occupied(struct device *dev,
 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
 
 /**
- * Try to allocate memory from the per-device coherent area.
+ * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
  *
  * @dev:	device from which we allocate memory
  * @size:	size of requested memory area
@@ -100,11 +100,11 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
  * @ret:	This pointer will be filled with the virtual address
  * 		to allocated area.
  *
- * This function should be only called from per-arch %dma_alloc_coherent()
+ * This function should be only called from per-arch dma_alloc_coherent()
  * to support allocation from per-device coherent memory pools.
  *
  * Returns 0 if dma_alloc_coherent should continue with allocating from
- * generic memory areas, or !0 if dma_alloc_coherent should return %ret.
+ * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
  */
 int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 				       dma_addr_t *dma_handle, void **ret)
@@ -126,7 +126,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 }
 
 /**
- * Try to free the memory allocated from per-device coherent memory pool.
+ * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
  * @dev:	device from which the memory was allocated
  * @order:	the order of pages allocated
  * @vaddr:	virtual address of allocated pages
@@ -135,7 +135,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
  * coherent memory pool and if so, releases that memory.
  *
  * Returns 1 if we correctly released the memory, or 0 if
- * %dma_release_coherent() should proceed with releasing memory from
+ * dma_release_coherent() should proceed with releasing memory from
  * generic pools.
  */
 int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
-- 
cgit v1.2.3


From c1955a3d4762e7a9bf84035eb3c4886a900f0d15 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 08:59:03 +0200
Subject: sched_clock: delay using sched_clock()

Some arch's can't handle sched_clock() being called too early - delay
this until sched_clock_init() has been called.

Reported-by: Bill Gatliff <bgat@billgatliff.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Nishanth Aravamudan <nacc@us.ibm.com>
CC: Russell King - ARM Linux <linux@arm.linux.org.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 074edc989379..204991a0bfa7 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -42,6 +42,8 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
 }
 
+static __read_mostly int sched_clock_running;
+
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 
 struct sched_clock_data {
@@ -70,8 +72,6 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
 	return &per_cpu(sched_clock_data, cpu);
 }
 
-static __read_mostly int sched_clock_running;
-
 void sched_clock_init(void)
 {
 	u64 ktime_now = ktime_to_ns(ktime_get());
@@ -248,6 +248,21 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+void sched_clock_init(void)
+{
+	sched_clock_running = 1;
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+	if (unlikely(!sched_clock_running))
+		return 0;
+
+	return sched_clock();
+}
+
 #endif
 
 unsigned long long cpu_clock(int cpu)
-- 
cgit v1.2.3


From 64aa348edc617dea17bbd01ddee4e47886d5ec8c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 09:30:21 +0200
Subject: lockdep: lock_set_subclass - reset a held lock's subclass

this can be used to reset a held lock's subclass, for arbitrary-depth
iterated data structures such as trees or lists which have per-node
locks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 6999e64fc248..e14d383dcb0b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2660,6 +2660,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
 	return 1;
 }
 
+static int
+__lock_set_subclass(struct lockdep_map *lock,
+		    unsigned int subclass, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class *class;
+	unsigned int depth;
+	int i;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (hlock->instance == lock)
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+	class = register_lock_class(lock, subclass, 0);
+	hlock->class = class;
+
+	curr->lockdep_depth = i;
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	for (; i < depth; i++) {
+		hlock = curr->held_locks + i;
+		if (!__lock_acquire(hlock->instance,
+			hlock->class->subclass, hlock->trylock,
+				hlock->read, hlock->check, hlock->hardirqs_off,
+				hlock->acquire_ip))
+			return 0;
+	}
+
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+		return 0;
+	return 1;
+}
+
 /*
  * Remove the lock to the list of currently held locks in a
  * potentially non-nested (out of order) manner. This is a
@@ -2824,6 +2873,26 @@ static void check_flags(unsigned long flags)
 #endif
 }
 
+void
+lock_set_subclass(struct lockdep_map *lock,
+		  unsigned int subclass, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	current->lockdep_recursion = 1;
+	check_flags(flags);
+	if (__lock_set_subclass(lock, subclass, ip))
+		check_chain_key(current);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_set_subclass);
+
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
-- 
cgit v1.2.3


From 1b12bbc747560ea68bcc132c3d05699e52271da0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 09:30:22 +0200
Subject: lockdep: re-annotate scheduler runqueues

Instead of using a per-rq lock class, use the regular nesting operations.

However, take extra care with double_lock_balance() as it can release the
already held rq->lock (and therefore change its nesting class).

So what can happen is:

 spin_lock(rq->lock);	// this rq subclass 0

 double_lock_balance(rq, other_rq);
   // release rq
   // acquire other_rq->lock subclass 0
   // acquire rq->lock subclass 1

 spin_unlock(other_rq->lock);

leaving you with rq->lock in subclass 1

So a subsequent double_lock_balance() call can try to nest a subclass 1
lock while already holding a subclass 1 lock.

Fix this by introducing double_unlock_balance() which releases the other
rq's lock, but also re-sets the subclass for this rq's lock to 0.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 11 +++++++++--
 kernel/sched_rt.c |  8 +++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 655f1db26b12..9b2b6a85577d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2812,6 +2812,13 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -3636,7 +3643,7 @@ redo:
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,
 					imbalance, sd, CPU_NEWLY_IDLE,
 					&all_pinned);
-		spin_unlock(&busiest->lock);
+		double_unlock_balance(this_rq, busiest);
 
 		if (unlikely(all_pinned)) {
 			cpu_clear(cpu_of(busiest), *cpus);
@@ -3751,7 +3758,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 		else
 			schedstat_inc(sd, alb_failed);
 	}
-	spin_unlock(&target_rq->lock);
+	double_unlock_balance(busiest_rq, target_rq);
 }
 
 #ifdef CONFIG_NO_HZ
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 908c04f9dad0..6163e4cf885b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -861,6 +861,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 #define RT_MAX_TRIES 3
 
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
+
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -1022,7 +1024,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 			break;
 
 		/* try again */
-		spin_unlock(&lowest_rq->lock);
+		double_unlock_balance(rq, lowest_rq);
 		lowest_rq = NULL;
 	}
 
@@ -1091,7 +1093,7 @@ static int push_rt_task(struct rq *rq)
 
 	resched_task(lowest_rq->curr);
 
-	spin_unlock(&lowest_rq->lock);
+	double_unlock_balance(rq, lowest_rq);
 
 	ret = 1;
 out:
@@ -1197,7 +1199,7 @@ static int pull_rt_task(struct rq *this_rq)
 
 		}
  skip:
-		spin_unlock(&src_rq->lock);
+		double_unlock_balance(this_rq, src_rq);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From f82b217e3513fe3af342c0f3ee1494e86250c21c Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Mon, 11 Aug 2008 09:30:23 +0200
Subject: lockdep: shrink held_lock structure

struct held_lock {
        u64                        prev_chain_key;       /*     0     8 */
        struct lock_class *        class;                /*     8     8 */
        long unsigned int          acquire_ip;           /*    16     8 */
        struct lockdep_map *       instance;             /*    24     8 */
        int                        irq_context;          /*    32     4 */
        int                        trylock;              /*    36     4 */
        int                        read;                 /*    40     4 */
        int                        check;                /*    44     4 */
        int                        hardirqs_off;         /*    48     4 */

        /* size: 56, cachelines: 1 */
        /* padding: 4 */
        /* last cacheline: 56 bytes */
};

struct held_lock {
        u64                        prev_chain_key;       /*     0     8 */
        long unsigned int          acquire_ip;           /*     8     8 */
        struct lockdep_map *       instance;             /*    16     8 */
        unsigned int               class_idx:11;         /*    24:21  4 */
        unsigned int               irq_context:2;        /*    24:19  4 */
        unsigned int               trylock:1;            /*    24:18  4 */
        unsigned int               read:2;               /*    24:16  4 */
        unsigned int               check:2;              /*    24:14  4 */
        unsigned int               hardirqs_off:1;       /*    24:13  4 */

        /* size: 32, cachelines: 1 */
        /* padding: 4 */
        /* bit_padding: 13 bits */
        /* last cacheline: 32 bytes */
};

[mingo@elte.hu: shrunk hlock->class too]
[peterz@infradead.org: fixup bit sizes]
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/lockdep.c           | 113 ++++++++++++++++++++++++++-------------------
 kernel/lockdep_internals.h |   3 --
 2 files changed, 65 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e14d383dcb0b..d3c72ad8d09e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
 unsigned long nr_lock_classes;
 static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
 
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+	if (!hlock->class_idx) {
+		DEBUG_LOCKS_WARN_ON(1);
+		return NULL;
+	}
+	return lock_classes + hlock->class_idx - 1;
+}
+
 #ifdef CONFIG_LOCK_STAT
 static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
 
@@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock)
 
 	holdtime = sched_clock() - hlock->holdtime_stamp;
 
-	stats = get_lock_stats(hlock->class);
+	stats = get_lock_stats(hlock_class(hlock));
 	if (hlock->read)
 		lock_time_inc(&stats->read_holdtime, holdtime);
 	else
@@ -518,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
 
 static void print_lock(struct held_lock *hlock)
 {
-	print_lock_name(hlock->class);
+	print_lock_name(hlock_class(hlock));
 	printk(", at: ");
 	print_ip_sym(hlock->acquire_ip);
 }
@@ -948,7 +957,7 @@ static noinline int print_circular_bug_tail(void)
 	if (debug_locks_silent)
 		return 0;
 
-	this.class = check_source->class;
+	this.class = hlock_class(check_source);
 	if (!save_trace(&this.trace))
 		return 0;
 
@@ -1057,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
 	 * Check this lock's dependency list:
 	 */
 	list_for_each_entry(entry, &source->locks_after, entry) {
-		if (entry->class == check_target->class)
+		if (entry->class == hlock_class(check_target))
 			return print_circular_bug_header(entry, depth+1);
 		debug_atomic_inc(&nr_cyclic_checks);
 		if (!check_noncircular(entry->class, depth+1))
@@ -1150,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
 		return 2;
 	}
 
+	if (!source && debug_locks_off_graph_unlock()) {
+		WARN_ON(1);
+		return 0;
+	}
+
 	/*
 	 * Check this lock's dependency list:
 	 */
@@ -1189,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr,
 	printk("\nand this task is already holding:\n");
 	print_lock(prev);
 	printk("which would create a new lock dependency:\n");
-	print_lock_name(prev->class);
+	print_lock_name(hlock_class(prev));
 	printk(" ->");
-	print_lock_name(next->class);
+	print_lock_name(hlock_class(next));
 	printk("\n");
 
 	printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
@@ -1232,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
 
 	find_usage_bit = bit_backwards;
 	/* fills in <backwards_match> */
-	ret = find_usage_backwards(prev->class, 0);
+	ret = find_usage_backwards(hlock_class(prev), 0);
 	if (!ret || ret == 1)
 		return ret;
 
 	find_usage_bit = bit_forwards;
-	ret = find_usage_forwards(next->class, 0);
+	ret = find_usage_forwards(hlock_class(next), 0);
 	if (!ret || ret == 1)
 		return ret;
 	/* ret == 2 */
@@ -1362,7 +1376,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
 
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		prev = curr->held_locks + i;
-		if (prev->class != next->class)
+		if (hlock_class(prev) != hlock_class(next))
 			continue;
 		/*
 		 * Allow read-after-read recursion of the same
@@ -1415,7 +1429,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 */
 	check_source = next;
 	check_target = prev;
-	if (!(check_noncircular(next->class, 0)))
+	if (!(check_noncircular(hlock_class(next), 0)))
 		return print_circular_bug_tail();
 
 	if (!check_prev_add_irq(curr, prev, next))
@@ -1439,8 +1453,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 *  chains - the second one will be new, but L1 already has
 	 *  L2 added to its dependency list, due to the first chain.)
 	 */
-	list_for_each_entry(entry, &prev->class->locks_after, entry) {
-		if (entry->class == next->class) {
+	list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+		if (entry->class == hlock_class(next)) {
 			if (distance == 1)
 				entry->distance = 1;
 			return 2;
@@ -1451,26 +1465,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 	 * Ok, all validations passed, add the new lock
 	 * to the previous lock's dependency list:
 	 */
-	ret = add_lock_to_list(prev->class, next->class,
-			       &prev->class->locks_after, next->acquire_ip, distance);
+	ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+			       &hlock_class(prev)->locks_after,
+			       next->acquire_ip, distance);
 
 	if (!ret)
 		return 0;
 
-	ret = add_lock_to_list(next->class, prev->class,
-			       &next->class->locks_before, next->acquire_ip, distance);
+	ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+			       &hlock_class(next)->locks_before,
+			       next->acquire_ip, distance);
 	if (!ret)
 		return 0;
 
 	/*
 	 * Debugging printouts:
 	 */
-	if (verbose(prev->class) || verbose(next->class)) {
+	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
 		graph_unlock();
 		printk("\n new dependency: ");
-		print_lock_name(prev->class);
+		print_lock_name(hlock_class(prev));
 		printk(" => ");
-		print_lock_name(next->class);
+		print_lock_name(hlock_class(next));
 		printk("\n");
 		dump_stack();
 		return graph_lock();
@@ -1567,7 +1583,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 				     struct held_lock *hlock,
 				     u64 chain_key)
 {
-	struct lock_class *class = hlock->class;
+	struct lock_class *class = hlock_class(hlock);
 	struct list_head *hash_head = chainhashentry(chain_key);
 	struct lock_chain *chain;
 	struct held_lock *hlock_curr, *hlock_next;
@@ -1640,7 +1656,7 @@ cache_hit:
 	if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
 		chain->base = cn;
 		for (j = 0; j < chain->depth - 1; j++, i++) {
-			int lock_id = curr->held_locks[i].class - lock_classes;
+			int lock_id = curr->held_locks[i].class_idx - 1;
 			chain_hlocks[chain->base + j] = lock_id;
 		}
 		chain_hlocks[chain->base + j] = class - lock_classes;
@@ -1736,7 +1752,7 @@ static void check_chain_key(struct task_struct *curr)
 			WARN_ON(1);
 			return;
 		}
-		id = hlock->class - lock_classes;
+		id = hlock->class_idx - 1;
 		if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
 			return;
 
@@ -1781,7 +1797,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 	print_lock(this);
 
 	printk("{%s} state was registered at:\n", usage_str[prev_bit]);
-	print_stack_trace(this->class->usage_traces + prev_bit, 1);
+	print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
 
 	print_irqtrace_events(curr);
 	printk("\nother info that might help us debug this:\n");
@@ -1800,7 +1816,7 @@ static inline int
 valid_state(struct task_struct *curr, struct held_lock *this,
 	    enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
 {
-	if (unlikely(this->class->usage_mask & (1 << bad_bit)))
+	if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
 		return print_usage_bug(curr, this, bad_bit, new_bit);
 	return 1;
 }
@@ -1839,7 +1855,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 	lockdep_print_held_locks(curr);
 
 	printk("\nthe first lock's dependencies:\n");
-	print_lock_dependencies(this->class, 0);
+	print_lock_dependencies(hlock_class(this), 0);
 
 	printk("\nthe second lock's dependencies:\n");
 	print_lock_dependencies(other, 0);
@@ -1862,7 +1878,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
 
 	find_usage_bit = bit;
 	/* fills in <forwards_match> */
-	ret = find_usage_forwards(this->class, 0);
+	ret = find_usage_forwards(hlock_class(this), 0);
 	if (!ret || ret == 1)
 		return ret;
 
@@ -1881,7 +1897,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 
 	find_usage_bit = bit;
 	/* fills in <backwards_match> */
-	ret = find_usage_backwards(this->class, 0);
+	ret = find_usage_backwards(hlock_class(this), 0);
 	if (!ret || ret == 1)
 		return ret;
 
@@ -1947,7 +1963,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 				LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
 			return 0;
 #endif
-		if (hardirq_verbose(this->class))
+		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_SOFTIRQ:
@@ -1972,7 +1988,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 				LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
 			return 0;
 #endif
-		if (softirq_verbose(this->class))
+		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_HARDIRQ_READ:
@@ -1985,7 +2001,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (!check_usage_forwards(curr, this,
 					  LOCK_ENABLED_HARDIRQS, "hard"))
 			return 0;
-		if (hardirq_verbose(this->class))
+		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_USED_IN_SOFTIRQ_READ:
@@ -1998,7 +2014,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 		if (!check_usage_forwards(curr, this,
 					  LOCK_ENABLED_SOFTIRQS, "soft"))
 			return 0;
-		if (softirq_verbose(this->class))
+		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_ENABLED_HARDIRQS:
@@ -2024,7 +2040,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 				   LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
 			return 0;
 #endif
-		if (hardirq_verbose(this->class))
+		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_ENABLED_SOFTIRQS:
@@ -2050,7 +2066,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 				   LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
 			return 0;
 #endif
-		if (softirq_verbose(this->class))
+		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_ENABLED_HARDIRQS_READ:
@@ -2065,7 +2081,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 					   LOCK_USED_IN_HARDIRQ, "hard"))
 			return 0;
 #endif
-		if (hardirq_verbose(this->class))
+		if (hardirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	case LOCK_ENABLED_SOFTIRQS_READ:
@@ -2080,7 +2096,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
 					   LOCK_USED_IN_SOFTIRQ, "soft"))
 			return 0;
 #endif
-		if (softirq_verbose(this->class))
+		if (softirq_verbose(hlock_class(this)))
 			ret = 2;
 		break;
 	default:
@@ -2396,7 +2412,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	 * If already set then do not dirty the cacheline,
 	 * nor do any checks:
 	 */
-	if (likely(this->class->usage_mask & new_mask))
+	if (likely(hlock_class(this)->usage_mask & new_mask))
 		return 1;
 
 	if (!graph_lock())
@@ -2404,14 +2420,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 	/*
 	 * Make sure we didnt race:
 	 */
-	if (unlikely(this->class->usage_mask & new_mask)) {
+	if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
 		graph_unlock();
 		return 1;
 	}
 
-	this->class->usage_mask |= new_mask;
+	hlock_class(this)->usage_mask |= new_mask;
 
-	if (!save_trace(this->class->usage_traces + new_bit))
+	if (!save_trace(hlock_class(this)->usage_traces + new_bit))
 		return 0;
 
 	switch (new_bit) {
@@ -2545,8 +2561,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		return 0;
 
 	hlock = curr->held_locks + depth;
-
-	hlock->class = class;
+	if (DEBUG_LOCKS_WARN_ON(!class))
+		return 0;
+	hlock->class_idx = class - lock_classes + 1;
 	hlock->acquire_ip = ip;
 	hlock->instance = lock;
 	hlock->trylock = trylock;
@@ -2690,7 +2707,7 @@ __lock_set_subclass(struct lockdep_map *lock,
 
 found_it:
 	class = register_lock_class(lock, subclass, 0);
-	hlock->class = class;
+	hlock->class_idx = class - lock_classes + 1;
 
 	curr->lockdep_depth = i;
 	curr->curr_chain_key = hlock->prev_chain_key;
@@ -2698,7 +2715,7 @@ found_it:
 	for (; i < depth; i++) {
 		hlock = curr->held_locks + i;
 		if (!__lock_acquire(hlock->instance,
-			hlock->class->subclass, hlock->trylock,
+			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
 				hlock->acquire_ip))
 			return 0;
@@ -2759,7 +2776,7 @@ found_it:
 	for (i++; i < depth; i++) {
 		hlock = curr->held_locks + i;
 		if (!__lock_acquire(hlock->instance,
-			hlock->class->subclass, hlock->trylock,
+			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
 				hlock->acquire_ip))
 			return 0;
@@ -2804,7 +2821,7 @@ static int lock_release_nested(struct task_struct *curr,
 
 #ifdef CONFIG_DEBUG_LOCKDEP
 	hlock->prev_chain_key = 0;
-	hlock->class = NULL;
+	hlock->class_idx = 0;
 	hlock->acquire_ip = 0;
 	hlock->irq_context = 0;
 #endif
@@ -3000,9 +3017,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 found_it:
 	hlock->waittime_stamp = sched_clock();
 
-	point = lock_contention_point(hlock->class, ip);
+	point = lock_contention_point(hlock_class(hlock), ip);
 
-	stats = get_lock_stats(hlock->class);
+	stats = get_lock_stats(hlock_class(hlock));
 	if (point < ARRAY_SIZE(stats->contention_point))
 		stats->contention_point[i]++;
 	if (lock->cpu != smp_processor_id())
@@ -3048,7 +3065,7 @@ found_it:
 		hlock->holdtime_stamp = now;
 	}
 
-	stats = get_lock_stats(hlock->class);
+	stats = get_lock_stats(hlock_class(hlock));
 	if (waittime) {
 		if (hlock->read)
 			lock_time_inc(&stats->read_waittime, waittime);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 68d44ec77ab5..55db193d366d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -17,9 +17,6 @@
  */
 #define MAX_LOCKDEP_ENTRIES	8192UL
 
-#define MAX_LOCKDEP_KEYS_BITS	11
-#define MAX_LOCKDEP_KEYS	(1UL << MAX_LOCKDEP_KEYS_BITS)
-
 #define MAX_LOCKDEP_CHAINS_BITS	14
 #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
 
-- 
cgit v1.2.3


From 4f3e7524b2e703d9f8b02ac338153a53dd7ede66 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 09:30:23 +0200
Subject: lockdep: map_acquire

Most the free-standing lock_acquire() usages look remarkably similar, sweep
them into a new helper.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/workqueue.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ec7e4f62aaff..53564ae894a6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 
 		BUG_ON(get_wq_data(work) != cwq);
 		work_clear_pending(work);
-		lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-		lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+		map_acquire(&cwq->wq->lockdep_map);
+		map_acquire(&lockdep_map);
 		f(work);
-		lock_release(&lockdep_map, 1, _THIS_IP_);
-		lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+		map_release(&lockdep_map);
+		map_release(&cwq->wq->lockdep_map);
 
 		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
 			printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
@@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq)
 	int cpu;
 
 	might_sleep();
-	lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-	lock_release(&wq->lockdep_map, 1, _THIS_IP_);
+	map_acquire(&wq->lockdep_map);
+	map_release(&wq->lockdep_map);
 	for_each_cpu_mask_nr(cpu, *cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
@@ -441,8 +441,8 @@ int flush_work(struct work_struct *work)
 	if (!cwq)
 		return 0;
 
-	lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-	lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+	map_acquire(&cwq->wq->lockdep_map);
+	map_release(&cwq->wq->lockdep_map);
 
 	prev = NULL;
 	spin_lock_irq(&cwq->lock);
@@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work)
 
 	might_sleep();
 
-	lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-	lock_release(&work->lockdep_map, 1, _THIS_IP_);
+	map_acquire(&work->lockdep_map);
+	map_release(&work->lockdep_map);
 
 	cwq = get_wq_data(work);
 	if (!cwq)
@@ -861,8 +861,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 	if (cwq->thread == NULL)
 		return;
 
-	lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-	lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+	map_acquire(&cwq->wq->lockdep_map);
+	map_release(&cwq->wq->lockdep_map);
 
 	flush_cpu_workqueue(cwq);
 	/*
-- 
cgit v1.2.3


From 7531e2f34d1d551b096143f19111139f0dd84c8b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 09:30:24 +0200
Subject: lockdep: lock protection locks

On Fri, 2008-08-01 at 16:26 -0700, Linus Torvalds wrote:

> On Fri, 1 Aug 2008, David Miller wrote:
> >
> > Taking more than a few locks of the same class at once is bad
> > news and it's better to find an alternative method.
>
> It's not always wrong.
>
> If you can guarantee that anybody that takes more than one lock of a
> particular class will always take a single top-level lock _first_, then
> that's all good. You can obviously screw up and take the same lock _twice_
> (which will deadlock), but at least you cannot get into ABBA situations.
>
> So maybe the right thing to do is to just teach lockdep about "lock
> protection locks". That would have solved the multi-queue issues for
> networking too - all the actual network drivers would still have taken
> just their single queue lock, but the one case that needs to take all of
> them would have taken a separate top-level lock first.
>
> Never mind that the multi-queue locks were always taken in the same order:
> it's never wrong to just have some top-level serialization, and anybody
> who needs to take <n> locks might as well do <n+1>, because they sure as
> hell aren't going to be on _any_ fastpaths.
>
> So the simplest solution really sounds like just teaching lockdep about
> that one special case. It's not "nesting" exactly, although it's obviously
> related to it.

Do as Linus suggested. The lock protection lock is called nest_lock.

Note that we still have the MAX_LOCK_DEPTH (48) limit to consider, so anything
that spills that it still up shit creek.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d3c72ad8d09e..410c3365ad8f 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1372,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
 	       struct lockdep_map *next_instance, int read)
 {
 	struct held_lock *prev;
+	struct held_lock *nest = NULL;
 	int i;
 
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		prev = curr->held_locks + i;
+
+		if (prev->instance == next->nest_lock)
+			nest = prev;
+
 		if (hlock_class(prev) != hlock_class(next))
 			continue;
+
 		/*
 		 * Allow read-after-read recursion of the same
 		 * lock class (i.e. read_lock(lock)+read_lock(lock)):
 		 */
 		if ((read == 2) && prev->read)
 			return 2;
+
+		/*
+		 * We're holding the nest_lock, which serializes this lock's
+		 * nesting behaviour.
+		 */
+		if (nest)
+			return 2;
+
 		return print_deadlock_bug(curr, prev, next);
 	}
 	return 1;
@@ -2507,7 +2521,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
  */
 static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  int trylock, int read, int check, int hardirqs_off,
-			  unsigned long ip)
+			  struct lockdep_map *nest_lock, unsigned long ip)
 {
 	struct task_struct *curr = current;
 	struct lock_class *class = NULL;
@@ -2566,6 +2580,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->class_idx = class - lock_classes + 1;
 	hlock->acquire_ip = ip;
 	hlock->instance = lock;
+	hlock->nest_lock = nest_lock;
 	hlock->trylock = trylock;
 	hlock->read = read;
 	hlock->check = check;
@@ -2717,7 +2732,7 @@ found_it:
 		if (!__lock_acquire(hlock->instance,
 			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
-				hlock->acquire_ip))
+				hlock->nest_lock, hlock->acquire_ip))
 			return 0;
 	}
 
@@ -2778,7 +2793,7 @@ found_it:
 		if (!__lock_acquire(hlock->instance,
 			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
-				hlock->acquire_ip))
+				hlock->nest_lock, hlock->acquire_ip))
 			return 0;
 	}
 
@@ -2915,7 +2930,8 @@ EXPORT_SYMBOL_GPL(lock_set_subclass);
  * and also avoid lockdep recursion:
  */
 void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
-			  int trylock, int read, int check, unsigned long ip)
+			  int trylock, int read, int check,
+			  struct lockdep_map *nest_lock, unsigned long ip)
 {
 	unsigned long flags;
 
@@ -2930,7 +2946,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 	current->lockdep_recursion = 1;
 	__lock_acquire(lock, subclass, trylock, read, check,
-		       irqs_disabled_flags(flags), ip);
+		       irqs_disabled_flags(flags), nest_lock, ip);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From b7d39aff91454f2534db2275f55908656ec0470c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 09:30:24 +0200
Subject: lockdep: spin_lock_nest_lock()

Expose the new lock protection lock.

This can be used to annotate places where we take multiple locks of the
same class and avoid deadlocks by always taking another (top-level) lock
first.

NOTE: we're still bound to the MAX_LOCK_DEPTH (48) limit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/spinlock.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index a1fb54c93cdd..44baeea94ab9 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -292,6 +292,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
 }
 
 EXPORT_SYMBOL(_spin_lock_nested);
+
 unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
 {
 	unsigned long flags;
@@ -314,6 +315,16 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
 
 EXPORT_SYMBOL(_spin_lock_irqsave_nested);
 
+void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
+				     struct lockdep_map *nest_lock)
+{
+	preempt_disable();
+	spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+
+EXPORT_SYMBOL(_spin_lock_nest_lock);
+
 #endif
 
 void __lockfunc _spin_unlock(spinlock_t *lock)
-- 
cgit v1.2.3


From 8bfe0298f7a04952d19f4a2cf510d7a6311eeed0 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Mon, 11 Aug 2008 09:30:26 +0200
Subject: lockdep: handle chains involving classes defined in modules

Solve this by marking the classes as unused and not printing information
about the unused classes.

Reported-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Rabin Vincent <rabin@rab.in>
Acked-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c      | 1 +
 kernel/lockdep_proc.c | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 410c3365ad8f..ab933fecd2a1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3176,6 +3176,7 @@ static void zap_class(struct lock_class *class)
 	list_del_rcu(&class->hash_entry);
 	list_del_rcu(&class->lock_entry);
 
+	class->key = NULL;
 }
 
 static inline int within(const void *addr, void *start, unsigned long size)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 6252ff799d19..fa19aee604c2 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -201,6 +201,9 @@ static int lc_show(struct seq_file *m, void *v)
 
 	for (i = 0; i < chain->depth; i++) {
 		class = lock_chain_get_class(chain, i);
+		if (!class->key)
+			continue;
+
 		seq_printf(m, "[%p] ", class->key);
 		print_name(m, class);
 		seq_puts(m, "\n");
-- 
cgit v1.2.3


From 3295f0ef9ff048a4619ede597ad9ec9cab725654 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 11 Aug 2008 10:30:30 +0200
Subject: lockdep: rename map_[acquire|release]() =>
 lock_map_[acquire|release]()

the names were too generic:

 drivers/uio/uio.c:87: error: expected identifier or '(' before 'do'
 drivers/uio/uio.c:87: error: expected identifier or '(' before 'while'
 drivers/uio/uio.c:113: error: 'map_release' undeclared here (not in a function)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/workqueue.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 53564ae894a6..8bb5b68fb3a9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 
 		BUG_ON(get_wq_data(work) != cwq);
 		work_clear_pending(work);
-		map_acquire(&cwq->wq->lockdep_map);
-		map_acquire(&lockdep_map);
+		lock_map_acquire(&cwq->wq->lockdep_map);
+		lock_map_acquire(&lockdep_map);
 		f(work);
-		map_release(&lockdep_map);
-		map_release(&cwq->wq->lockdep_map);
+		lock_map_release(&lockdep_map);
+		lock_map_release(&cwq->wq->lockdep_map);
 
 		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
 			printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
@@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq)
 	int cpu;
 
 	might_sleep();
-	map_acquire(&wq->lockdep_map);
-	map_release(&wq->lockdep_map);
+	lock_map_acquire(&wq->lockdep_map);
+	lock_map_release(&wq->lockdep_map);
 	for_each_cpu_mask_nr(cpu, *cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
@@ -441,8 +441,8 @@ int flush_work(struct work_struct *work)
 	if (!cwq)
 		return 0;
 
-	map_acquire(&cwq->wq->lockdep_map);
-	map_release(&cwq->wq->lockdep_map);
+	lock_map_acquire(&cwq->wq->lockdep_map);
+	lock_map_release(&cwq->wq->lockdep_map);
 
 	prev = NULL;
 	spin_lock_irq(&cwq->lock);
@@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work)
 
 	might_sleep();
 
-	map_acquire(&work->lockdep_map);
-	map_release(&work->lockdep_map);
+	lock_map_acquire(&work->lockdep_map);
+	lock_map_release(&work->lockdep_map);
 
 	cwq = get_wq_data(work);
 	if (!cwq)
@@ -861,8 +861,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 	if (cwq->thread == NULL)
 		return;
 
-	map_acquire(&cwq->wq->lockdep_map);
-	map_release(&cwq->wq->lockdep_map);
+	lock_map_acquire(&cwq->wq->lockdep_map);
+	lock_map_release(&cwq->wq->lockdep_map);
 
 	flush_cpu_workqueue(cwq);
 	/*
-- 
cgit v1.2.3


From 67182ae1c42206e516f7efb292b745e826497b24 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 10 Aug 2008 18:35:38 -0700
Subject: rcu, debug: detect stalled grace periods

this is a diagnostic patch for Classic RCU.

The approach is to record a timestamp at the beginning
of the grace period (in rcu_start_batch()), then have
rcu_check_callbacks() complain if:

 1.	it is running on a CPU that has holding up grace periods for
 	a long time (say one second).  This will identify the culprit
 	assuming that the culprit has not disabled hardware irqs,
 	instruction execution, or some such.

 2.	it is running on a CPU that is not holding up grace periods,
 	but grace periods have been held up for an even longer time
 	(say two seconds).

It is enabled via the default-off CONFIG_DEBUG_RCU_STALL kernel parameter.

Rather than exponential backoff, it backs off to once per 30 seconds.
My feeling upon thinking on it was that if you have stalled RCU grace
periods for that long, a few extra printk() messages are probably the
least of your worries...

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: David Witbrodt <dawitbro@sbcglobal.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index d4271146a9bd..d7ec731de75c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/time.h>
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
@@ -286,6 +287,81 @@ static void rcu_do_batch(struct rcu_data *rdp)
  *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
  *   period (if necessary).
  */
+
+#ifdef CONFIG_DEBUG_RCU_STALL
+
+static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+{
+	rcp->gp_check = get_seconds() + 3;
+}
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	long delta;
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock(&rcp->lock);
+	delta = get_seconds() - rcp->gp_check;
+	if (delta < 2L ||
+	    cpus_empty(rcp->cpumask)) {
+		spin_unlock(&rcp->lock);
+		return;
+	rcp->gp_check = get_seconds() + 30;
+	}
+	spin_unlock(&rcp->lock);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for_each_cpu_mask(cpu, rcp->cpumask)
+		printk(" %d", cpu);
+	printk(" (detected by %d, t=%lu/%lu)\n",
+	       smp_processor_id(), get_seconds(), rcp->gp_check);
+}
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
+			smp_processor_id(), get_seconds(), rcp->gp_check);
+	dump_stack();
+	spin_lock(&rcp->lock);
+	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
+		rcp->gp_check = get_seconds() + 30;
+	spin_unlock(&rcp->lock);
+}
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
+				   struct rcu_data *rdp)
+{
+	long delta;
+
+	delta = get_seconds() - rcp->gp_check;
+	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
+
+		/* We haven't checked in, so go dump stack. */
+
+		print_cpu_stall(rcp);
+
+	} else if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
+
+		/* They had two seconds to dump stack, so complain. */
+
+		print_other_cpu_stall(rcp);
+
+	}
+}
+
+#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
+
+static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
+				   struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
+
 /*
  * Register a new batch of callbacks, and start it up if there is currently no
  * active batch and the batch to be registered has not already occurred.
@@ -296,6 +372,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 	if (rcp->cur != rcp->pending &&
 			rcp->completed == rcp->cur) {
 		rcp->cur++;
+		record_gp_check_time(rcp);
 
 		/*
 		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -489,6 +566,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rcp, rdp);
+
 	if (rdp->nxtlist) {
 		/*
 		 * This cpu has pending rcu entries and the grace period
-- 
cgit v1.2.3


From 78635fc739b1254f3e0362ac430edbdd2cff01dc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 11 Aug 2008 13:34:15 +0200
Subject: rcu, debug: detect stalled grace periods, cleanups

small cleanups.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index d7ec731de75c..56b8712fe5aa 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -294,6 +294,7 @@ static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
 {
 	rcp->gp_check = get_seconds() + 3;
 }
+
 static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 	int cpu;
@@ -303,11 +304,9 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 
 	spin_lock(&rcp->lock);
 	delta = get_seconds() - rcp->gp_check;
-	if (delta < 2L ||
-	    cpus_empty(rcp->cpumask)) {
+	if (delta < 2L || cpus_empty(rcp->cpumask)) {
 		spin_unlock(&rcp->lock);
 		return;
-	rcp->gp_check = get_seconds() + 30;
 	}
 	spin_unlock(&rcp->lock);
 
@@ -319,6 +318,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 	printk(" (detected by %d, t=%lu/%lu)\n",
 	       smp_processor_id(), get_seconds(), rcp->gp_check);
 }
+
 static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
@@ -329,8 +329,8 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 		rcp->gp_check = get_seconds() + 30;
 	spin_unlock(&rcp->lock);
 }
-static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
-				   struct rcu_data *rdp)
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
 	long delta;
 
@@ -341,12 +341,11 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
 
 		print_cpu_stall(rcp);
 
-	} else if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
-
-		/* They had two seconds to dump stack, so complain. */
-
-		print_other_cpu_stall(rcp);
-
+	} else {
+		if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
+			/* They had two seconds to dump stack, so complain. */
+			print_other_cpu_stall(rcp);
+		}
 	}
 }
 
@@ -355,8 +354,9 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
 static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
 {
 }
-static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
-				   struct rcu_data *rdp)
+
+static inline void
+check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
 }
 
-- 
cgit v1.2.3


From 77ae651347bdd46830da8b28b1efc5e4a9d7cbd0 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 11 Aug 2008 13:32:02 +0200
Subject: sched: fix mysql+oltp regression

Defer commit 6d299f1b53b84e2665f402d9bcc494800aba6386 to the next release.

Testing of the tip/sched/clock tree revealed a mysql+oltp regression
which bisection eventually traced back to this commit in mainline.

Pertinent test results:  Three run sysbench averages, throughput units
in read/write requests/sec.

clients         1     2     4     8    16    32    64
6e0534f      9646 17876 34774 33868 32230 30767 29441
2.6.26.1     9112 17936 34652 33383 31929 30665 29232
6d299f1      9112 14637 28370 33339 32038 30762 29204

Note: subsequent commits hide the majority of this regression until you
apply the clock fixes, at which time it reemerges at full magnitude.

We cannot see anything bad about the change itself so we defer it to the
next release until this problem is fully analysed.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Gregory Haskins <ghaskins@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0fe94ea43f32..fb8994c6d4bb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1442,18 +1442,23 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 	struct task_struct *p = NULL;
 	struct sched_entity *se;
 
-	while (next != &cfs_rq->tasks) {
+	if (next == &cfs_rq->tasks)
+		return NULL;
+
+	/* Skip over entities that are not tasks */
+	do {
 		se = list_entry(next, struct sched_entity, group_node);
 		next = next->next;
+	} while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-		/* Skip over entities that are not tasks */
-		if (entity_is_task(se)) {
-			p = task_of(se);
-			break;
-		}
-	}
+	if (next == &cfs_rq->tasks)
+		return NULL;
 
 	cfs_rq->balance_iterator = next;
+
+	if (entity_is_task(se))
+		p = task_of(se);
+
 	return p;
 }
 
-- 
cgit v1.2.3


From cc7a486cac78f6fc1a24e8cd63036bae8d2ab431 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Mon, 11 Aug 2008 13:49:30 +1000
Subject: generic-ipi: fix stack and rcu interaction bug in
 smp_call_function_mask()

* Venki Pallipadi <venkatesh.pallipadi@intel.com> wrote:

> Found a OOPS on a big SMP box during an overnight reboot test with
> upstream git.
>
> Suresh and I looked at the oops and looks like the root cause is in
> generic_smp_call_function_interrupt() and smp_call_function_mask() with
> wait parameter.
>
> The actual oops looked like
>
> [   11.277260] BUG: unable to handle kernel paging request at ffff8802ffffffff
> [   11.277815] IP: [<ffff8802ffffffff>] 0xffff8802ffffffff
> [   11.278155] PGD 202063 PUD 0
> [   11.278576] Oops: 0010 [1] SMP
> [   11.279006] CPU 5
> [   11.279336] Modules linked in:
> [   11.279752] Pid: 0, comm: swapper Not tainted 2.6.27-rc2-00020-g685d87f #290
> [   11.280039] RIP: 0010:[<ffff8802ffffffff>]  [<ffff8802ffffffff>] 0xffff8802ffffffff
> [   11.280692] RSP: 0018:ffff88027f1f7f70  EFLAGS: 00010086
> [   11.280976] RAX: 00000000ffffffff RBX: 0000000000000000 RCX: 0000000000000000
> [   11.281264] RDX: 0000000000004f4e RSI: 0000000000000001 RDI: 0000000000000000
> [   11.281624] RBP: ffff88027f1f7f98 R08: 0000000000000001 R09: ffffffff802509af
> [   11.281925] R10: ffff8800280c2780 R11: 0000000000000000 R12: ffff88027f097d48
> [   11.282214] R13: ffff88027f097d70 R14: 0000000000000005 R15: ffff88027e571000
> [   11.282502] FS:  0000000000000000(0000) GS:ffff88027f1c3340(0000) knlGS:0000000000000000
> [   11.283096] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
> [   11.283382] CR2: ffff8802ffffffff CR3: 0000000000201000 CR4: 00000000000006e0
> [   11.283760] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   11.284048] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [   11.284337] Process swapper (pid: 0, threadinfo ffff88027f1f2000, task ffff88027f1f0640)
> [   11.284936] Stack:  ffffffff80250963 0000000000000212 0000000000ee8c78 0000000000ee8a66
> [   11.285802]  ffff88027e571550 ffff88027f1f7fa8 ffffffff8021adb5 ffff88027f1f3e40
> [   11.286599]  ffffffff8020bdd6 ffff88027f1f3e40 <EOI>  ffff88027f1f3ef8 0000000000000000
> [   11.287120] Call Trace:
> [   11.287768]  <IRQ>  [<ffffffff80250963>] ? generic_smp_call_function_interrupt+0x61/0x12c
> [   11.288354]  [<ffffffff8021adb5>] smp_call_function_interrupt+0x17/0x27
> [   11.288744]  [<ffffffff8020bdd6>] call_function_interrupt+0x66/0x70
> [   11.289030]  <EOI>  [<ffffffff8024ab3b>] ? clockevents_notify+0x19/0x73
> [   11.289380]  [<ffffffff803b9b75>] ? acpi_idle_enter_simple+0x18b/0x1fa
> [   11.289760]  [<ffffffff803b9b6b>] ? acpi_idle_enter_simple+0x181/0x1fa
> [   11.290051]  [<ffffffff8053aeca>] ? cpuidle_idle_call+0x70/0xa2
> [   11.290338]  [<ffffffff80209f61>] ? cpu_idle+0x5f/0x7d
> [   11.290723]  [<ffffffff8060224a>] ? start_secondary+0x14d/0x152
> [   11.291010]
> [   11.291287]
> [   11.291654] Code:  Bad RIP value.
> [   11.292041] RIP  [<ffff8802ffffffff>] 0xffff8802ffffffff
> [   11.292380]  RSP <ffff88027f1f7f70>
> [   11.292741] CR2: ffff8802ffffffff
> [   11.310951] ---[ end trace 137c54d525305f1c ]---
>
> The problem is with the following sequence of events:
>
> - CPU A calls smp_call_function_mask() for CPU B with wait parameter
> - CPU A sets up the call_function_data on the stack and does an rcu add to
>   call_function_queue
> - CPU A waits until the WAIT flag is cleared
> - CPU B gets the call function interrupt and starts going through the
>   call_function_queue
> - CPU C also gets some other call function interrupt and starts going through
>   the call_function_queue
> - CPU C, which is also going through the call_function_queue, starts referencing
>   CPU A's stack, as that element is still in call_function_queue
> - CPU B finishes the function call that CPU A set up and as there are no other
>   references to it, rcu deletes the call_function_data (which was from CPU A
>   stack)
> - CPU B sees the wait flag and just clears the flag (no call_rcu to free)
> - CPU A which was waiting on the flag continues executing and the stack
>   contents change
>
> - CPU C is still in rcu_read section accessing the CPU A's stack sees
>   inconsistent call_funation_data and can try to execute
>   function with some random pointer, causing stack corruption for A
>   (by clearing the bits in mask field) and oops.

Nice debugging work.

I'd suggest something like the attached (boot tested) patch as the simple
fix for now.

I expect the benefits from the less synchronized, multiple-in-flight-data
global queue will still outweigh the costs of dynamic allocations. But
if worst comes to worst then we just go back to a globally synchronous
one-at-a-time implementation, but that would be pretty sad!

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 96fc7c0edc59..e6084f6efb4d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -260,6 +260,41 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
 	generic_exec_single(cpu, data);
 }
 
+/* Dummy function */
+static void quiesce_dummy(void *unused)
+{
+}
+
+/*
+ * Ensure stack based data used in call function mask is safe to free.
+ *
+ * This is needed by smp_call_function_mask when using on-stack data, because
+ * a single call function queue is shared by all CPUs, and any CPU may pick up
+ * the data item on the queue at any time before it is deleted. So we need to
+ * ensure that all CPUs have transitioned through a quiescent state after
+ * this call.
+ *
+ * This is a very slow function, implemented by sending synchronous IPIs to
+ * all possible CPUs. For this reason, we have to alloc data rather than use
+ * stack based data even in the case of synchronous calls. The stack based
+ * data is then just used for deadlock/oom fallback which will be very rare.
+ *
+ * If a faster scheme can be made, we could go back to preferring stack based
+ * data -- the data allocation/free is non-zero cost.
+ */
+static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
+{
+	struct call_single_data data;
+	int cpu;
+
+	data.func = quiesce_dummy;
+	data.info = NULL;
+	data.flags = CSD_FLAG_WAIT;
+
+	for_each_cpu_mask(cpu, mask)
+		generic_exec_single(cpu, &data);
+}
+
 /**
  * smp_call_function_mask(): Run a function on a set of other CPUs.
  * @mask: The set of cpus to run on.
@@ -285,6 +320,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
 	cpumask_t allbutself;
 	unsigned long flags;
 	int cpu, num_cpus;
+	int slowpath = 0;
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
@@ -306,15 +342,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
 		return smp_call_function_single(cpu, func, info, wait);
 	}
 
-	if (!wait) {
-		data = kmalloc(sizeof(*data), GFP_ATOMIC);
-		if (data)
-			data->csd.flags = CSD_FLAG_ALLOC;
-	}
-	if (!data) {
+	data = kmalloc(sizeof(*data), GFP_ATOMIC);
+	if (data) {
+		data->csd.flags = CSD_FLAG_ALLOC;
+		if (wait)
+			data->csd.flags |= CSD_FLAG_WAIT;
+	} else {
 		data = &d;
 		data->csd.flags = CSD_FLAG_WAIT;
 		wait = 1;
+		slowpath = 1;
 	}
 
 	spin_lock_init(&data->lock);
@@ -331,8 +368,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
 	arch_send_call_function_ipi(mask);
 
 	/* optionally wait for the CPUs to complete */
-	if (wait)
+	if (wait) {
 		csd_flag_wait(&data->csd);
+		if (unlikely(slowpath))
+			smp_call_function_mask_quiesce_stack(allbutself);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 279ef6bbb8308488398c8f33b04c760148428378 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Wed, 30 Jul 2008 12:34:04 +0200
Subject: sched, cpu hotplug: fix set_cpus_allowed() use in hotplug callbacks

Mark Langsdorf reported:

> One of my co-workers noticed that the powernow-k8
> driver no longer restarts when a CPU core is
> hot-disabled and then hot-enabled on AMD quad-core
> systems.
>
> The following comands work fine on 2.6.26 and fail
> on 2.6.27-rc1:
>
> echo 0 > /sys/devices/system/cpu/cpu3/online
> echo 1 > /sys/devices/system/cpu/cpu3/online
> find /sys -name cpufreq
>
> For 2.6.26, the find will return a cpufreq
> directory for each processor.  In 2.6.27-rc1,
> the cpu3 directory is missing.
>
> After digging through the code, the following
> logic is failing when the core is hot-enabled
> at runtime.  The code works during the boot
> sequence.
>
>       cpumask_t = current->cpus_allowed;
>       set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
>       if (smp_processor_id() != cpu)
>               return -ENODEV;

So set the CPU active before calling the CPU_ONLINE notifier chain,
there are a handful of notifiers that use set_cpus_allowed().

This fix also solves the problem with x86-microcode. I've sent
alternative patches for microcode, but as this "rely on
set_cpus_allowed_ptr() being workable in cpu-hotplug(CPU_ONLINE, ...)"
assumption seems to be more broad than what we thought, perhaps this fix
should be applied.

With this patch we define that by the moment CPU_ONLINE is being sent,
a 'cpu' is online and ready for tasks to be migrated onto it.

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Reported-by: Mark Langsdorf <mark.langsdorf@amd.com>
Tested-by: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index e202a68d1cc1..c977c339f559 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -349,6 +349,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
+	cpu_set(cpu, cpu_active_map);
+
 	/* Now call notifier in preparation. */
 	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
 
@@ -383,9 +385,6 @@ int __cpuinit cpu_up(unsigned int cpu)
 
 	err = _cpu_up(cpu, 0);
 
-	if (cpu_online(cpu))
-		cpu_set(cpu, cpu_active_map);
-
 out:
 	cpu_maps_update_done();
 	return err;
-- 
cgit v1.2.3


From 0f2bc27be27ca1dcc66b96131e44bf7648b959c6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 11 Aug 2008 22:45:51 +0200
Subject: lockdep: fix debug_lock_alloc

When we enable DEBUG_LOCK_ALLOC but do not enable PROVE_LOCKING and or
LOCK_STAT, lock_alloc() and lock_release() turn into nops, even though
we should be doing hlock checking (check=1).

This causes a false warning and a lockdep self-disable.

Rectify this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ab933fecd2a1..1aa91fd6b06e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2935,9 +2935,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 {
 	unsigned long flags;
 
-	if (unlikely(!lock_stat && !prove_locking))
-		return;
-
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -2958,9 +2955,6 @@ void lock_release(struct lockdep_map *lock, int nested,
 {
 	unsigned long flags;
 
-	if (unlikely(!lock_stat && !prove_locking))
-		return;
-
 	if (unlikely(current->lockdep_recursion))
 		return;
 
-- 
cgit v1.2.3


From 59f9415ffb9759e950d775f4c400f747b332cc02 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 30 Jul 2008 12:49:02 -0700
Subject: modules: extend initcall_debug functionality to the module loader

The kernel has this really nice facility where if you put "initcall_debug"
on the kernel commandline, it'll print which function it's going to
execute just before calling an initcall, and then after the call completes
it will

1) print if it had an error code

2) checks for a few simple bugs (like leaving irqs off)
and

3) print how long the init call took in milliseconds.

While trying to optimize the boot speed of my laptop, I have been loving
number 3 to figure out what to optimize...  ...  and then I wished that
the same thing was done for module loading.

This patch makes the module loader use this exact same functionality; it's
a logical extension in my view (since modules are just sort of late
binding initcalls anyway) and so far I've found it quite useful in finding
where things are too slow in my boot.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 61d212120df4..08864d257eb0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2288,7 +2288,7 @@ sys_init_module(void __user *umod,
 
 	/* Start the module */
 	if (mod->init != NULL)
-		ret = mod->init();
+		ret = do_one_initcall(mod->init);
 	if (ret < 0) {
 		/* Init routine failed: abort.  Try to protect us from
                    buggy refcounters. */
-- 
cgit v1.2.3


From ed6d68763b8b589c0ae9d231cbd72bd01f6685c5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 31 Jul 2008 10:31:02 +0800
Subject: stop_machine: remove unused variable

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/stop_machine.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e446c7c7d6a9..af3c7cea258b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -65,7 +65,6 @@ static void ack_state(void)
 static int stop_cpu(struct stop_machine_data *smdata)
 {
 	enum stopmachine_state curstate = STOPMACHINE_NONE;
-	int uninitialized_var(ret);
 
 	/* Simple state machine */
 	do {
-- 
cgit v1.2.3


From c2fc11985db304572322f1dcdcb0f71337315006 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Tue, 12 Aug 2008 18:05:13 +1000
Subject: generic-ipi: fix stack and rcu interaction bug in
 smp_call_function_mask(), fix

> > Nick Piggin (1):
> >       generic-ipi: fix stack and rcu interaction bug in
> > smp_call_function_mask()
>
> I'm still not 100% sure that I have this patch right... I might have seen
> a lockup trace implicating the smp call function path... which may have
> been due to some other problem or a different bug in the new call function
> code, but if some more people can take a look at it before merging?

OK indeed it did have a couple of bugs. Firstly, I wasn't freeing the
data properly in the alloc && wait case. Secondly, I wasn't resetting
CSD_FLAG_WAIT in the for each cpu loop (so only the first CPU would
wait).

After those fixes, the patch boots and runs with the kmalloc commented
out (so it always executes the slowpath).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index e6084f6efb4d..782e2b93e465 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -135,7 +135,8 @@ void generic_smp_call_function_interrupt(void)
 			 */
 			smp_wmb();
 			data->csd.flags &= ~CSD_FLAG_WAIT;
-		} else
+		}
+		if (data->csd.flags & CSD_FLAG_ALLOC)
 			call_rcu(&data->rcu_head, rcu_free_call_data);
 	}
 	rcu_read_unlock();
@@ -289,10 +290,11 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
 
 	data.func = quiesce_dummy;
 	data.info = NULL;
-	data.flags = CSD_FLAG_WAIT;
 
-	for_each_cpu_mask(cpu, mask)
+	for_each_cpu_mask(cpu, mask) {
+		data.flags = CSD_FLAG_WAIT;
 		generic_exec_single(cpu, &data);
+	}
 }
 
 /**
@@ -371,7 +373,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
 	if (wait) {
 		csd_flag_wait(&data->csd);
 		if (unlikely(slowpath))
-			smp_call_function_mask_quiesce_stack(allbutself);
+			smp_call_function_mask_quiesce_stack(mask);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 3ee1062b4ee82a56294808a065b64f4bc6781a56 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 12 Aug 2008 15:08:40 -0700
Subject: cpu hotplug: s390 doesn't support additional_cpus anymore.

s390 doesn't support the additional_cpus kernel parameter anymore since a
long time.  So we better update the code and documentation to reflect
that.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index c977c339f559..f17e9854c246 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -369,7 +369,7 @@ int __cpuinit cpu_up(unsigned int cpu)
 	if (!cpu_isset(cpu, cpu_possible_map)) {
 		printk(KERN_ERR "can't online cpu %d because it is not "
 			"configured as may-hotadd at boot time\n", cpu);
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390)
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
 		printk(KERN_ERR "please check additional_cpus= boot "
 				"parameter\n");
 #endif
-- 
cgit v1.2.3


From f18e439d1035d059534d261c414af33f89aee89a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 12 Aug 2008 15:09:03 -0700
Subject: genirq: switch /proc/irq/*/smp_affinity et al to seqfiles

Switch /proc/irq/*/smp_affinity , /proc/irq/default_smp_affinity to
seq_files.

cat(1) reads with 1024 chunks by default, with high enough NR_CPUS, there
will be -EINVAL.

As side effect, there are now two less users of the ->read_proc interface.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/irq/proc.c | 96 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 51 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c6d35d68ee9..a09dd29c2fd7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -8,6 +8,7 @@
 
 #include <linux/irq.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/interrupt.h>
 
 #include "internals.h"
@@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir;
 
 #ifdef CONFIG_SMP
 
-static int irq_affinity_read_proc(char *page, char **start, off_t off,
-				  int count, int *eof, void *data)
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
-	struct irq_desc *desc = irq_desc + (long)data;
+	struct irq_desc *desc = irq_desc + (long)m->private;
 	cpumask_t *mask = &desc->affinity;
-	int len;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PENDING)
 		mask = &desc->pending_mask;
 #endif
-	len = cpumask_scnprintf(page, count, *mask);
-
-	if (count - len < 2)
-		return -EINVAL;
-	len += sprintf(page + len, "\n");
-	return len;
+	seq_cpumask(m, mask);
+	seq_putc(m, '\n');
+	return 0;
 }
 
 #ifndef is_affinity_mask_valid
@@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off,
 #endif
 
 int no_irq_affinity;
-static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
-				   unsigned long count, void *data)
+static ssize_t irq_affinity_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
 {
-	unsigned int irq = (int)(long)data, full_count = count, err;
+	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
 	cpumask_t new_value;
+	int err;
 
 	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
 	    irq_balancing_disabled(irq))
@@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	if (!cpus_intersects(new_value, cpu_online_map))
 		/* Special case for empty set - allow the architecture
 		   code to set default SMP affinity. */
-		return irq_select_affinity(irq) ? -EINVAL : full_count;
+		return irq_select_affinity(irq) ? -EINVAL : count;
 
 	irq_set_affinity(irq, new_value);
 
-	return full_count;
+	return count;
 }
 
-static int default_affinity_read(char *page, char **start, off_t off,
-				  int count, int *eof, void *data)
+static int irq_affinity_proc_open(struct inode *inode, struct file *file)
 {
-	int len = cpumask_scnprintf(page, count, irq_default_affinity);
-	if (count - len < 2)
-		return -EINVAL;
-	len += sprintf(page + len, "\n");
-	return len;
+	return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
 }
 
-static int default_affinity_write(struct file *file, const char __user *buffer,
-				   unsigned long count, void *data)
+static const struct file_operations irq_affinity_proc_fops = {
+	.open		= irq_affinity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= irq_affinity_proc_write,
+};
+
+static int default_affinity_show(struct seq_file *m, void *v)
+{
+	seq_cpumask(m, &irq_default_affinity);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static ssize_t default_affinity_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos)
 {
-	unsigned int full_count = count, err;
 	cpumask_t new_value;
+	int err;
 
 	err = cpumask_parse_user(buffer, count, new_value);
 	if (err)
@@ -105,8 +112,21 @@ static int default_affinity_write(struct file *file, const char __user *buffer,
 
 	irq_default_affinity = new_value;
 
-	return full_count;
+	return count;
 }
+
+static int default_affinity_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, default_affinity_show, NULL);
+}
+
+static const struct file_operations default_affinity_proc_fops = {
+	.open		= default_affinity_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= default_affinity_write,
+};
 #endif
 
 static int irq_spurious_read(char *page, char **start, off_t off,
@@ -178,16 +198,9 @@ void register_irq_proc(unsigned int irq)
 	irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
 
 #ifdef CONFIG_SMP
-	{
-		/* create /proc/irq/<irq>/smp_affinity */
-		entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
-
-		if (entry) {
-			entry->data = (void *)(long)irq;
-			entry->read_proc = irq_affinity_read_proc;
-			entry->write_proc = irq_affinity_write_proc;
-		}
-	}
+	/* create /proc/irq/<irq>/smp_affinity */
+	proc_create_data("smp_affinity", 0600, irq_desc[irq].dir,
+			 &irq_affinity_proc_fops, (void *)(long)irq);
 #endif
 
 	entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
@@ -208,15 +221,8 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 void register_default_affinity_proc(void)
 {
 #ifdef CONFIG_SMP
-	struct proc_dir_entry *entry;
-
-	/* create /proc/irq/default_smp_affinity */
-	entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
-	if (entry) {
-		entry->data = NULL;
-		entry->read_proc  = default_affinity_read;
-		entry->write_proc = default_affinity_write;
-	}
+	proc_create("irq/default_smp_affinity", 0600, NULL,
+		    &default_affinity_proc_fops);
 #endif
 }
 
-- 
cgit v1.2.3


From d6672c501852d577097f6757c311d937aca0b04b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 1 Aug 2008 11:23:50 +0200
Subject: lockdep: build fix

fix:

 kernel/built-in.o: In function `lockdep_stats_show':
 lockdep_proc.c:(.text+0x3cb2f): undefined reference to `lockdep_count_forward_deps'
 kernel/built-in.o: In function `l_show':
 lockdep_proc.c:(.text+0x3d02b): undefined reference to `lockdep_count_forward_deps'
 lockdep_proc.c:(.text+0x3d047): undefined reference to `lockdep_count_backward_deps'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_internals.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 55db193d366d..56b196932c08 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -50,8 +50,21 @@ extern unsigned int nr_process_chains;
 extern unsigned int max_lockdep_depth;
 extern unsigned int max_recursion_depth;
 
+#ifdef CONFIG_PROVE_LOCKING
 extern unsigned long lockdep_count_forward_deps(struct lock_class *);
 extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#else
+static inline unsigned long
+lockdep_count_forward_deps(struct lock_class *class)
+{
+	return 0;
+}
+static inline unsigned long
+lockdep_count_backward_deps(struct lock_class *class)
+{
+	return 0;
+}
+#endif
 
 #ifdef CONFIG_DEBUG_LOCKDEP
 /*
-- 
cgit v1.2.3


From c72f4573a5e05e35a31474977f500cbe21dcbb11 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 12 Aug 2008 13:27:37 -0700
Subject: lockdep: spin_lock_nest_lock(), checkpatch fixes

fix:

 WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable
 #46: FILE: kernel/spinlock.c:326:
 +EXPORT_SYMBOL(_spin_lock_nest_lock);

total: 0 errors, 1 warnings, 26 lines checked

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/spinlock.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 44baeea94ab9..29ab20749dd3 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -290,7 +290,6 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
 	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
 }
-
 EXPORT_SYMBOL(_spin_lock_nested);
 
 unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
@@ -312,7 +311,6 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
 #endif
 	return flags;
 }
-
 EXPORT_SYMBOL(_spin_lock_irqsave_nested);
 
 void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
@@ -322,7 +320,6 @@ void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
 	spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
 	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
 }
-
 EXPORT_SYMBOL(_spin_lock_nest_lock);
 
 #endif
-- 
cgit v1.2.3


From 2df8b1d656021e180ab93c8a4b2c9c2923d30b82 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Wed, 30 Jul 2008 12:43:11 -0700
Subject: lockdep: use WARN() in kernel/lockdep.c

Use WARN() instead of a printk+WARN_ON() pair; this way the message
becomes part of the warning section for better reporting/collection.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/lockdep.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1aa91fd6b06e..77fa776a2da8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1759,11 +1759,10 @@ static void check_chain_key(struct task_struct *curr)
 		hlock = curr->held_locks + i;
 		if (chain_key != hlock->prev_chain_key) {
 			debug_locks_off();
-			printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+			WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
 				curr->lockdep_depth, i,
 				(unsigned long long)chain_key,
 				(unsigned long long)hlock->prev_chain_key);
-			WARN_ON(1);
 			return;
 		}
 		id = hlock->class_idx - 1;
@@ -1778,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr)
 	}
 	if (chain_key != curr->curr_chain_key) {
 		debug_locks_off();
-		printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+		WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
 			curr->lockdep_depth, i,
 			(unsigned long long)chain_key,
 			(unsigned long long)curr->curr_chain_key);
-		WARN_ON(1);
 	}
 #endif
 }
-- 
cgit v1.2.3


From 09f2724a786f76475ef2985cf84f5359c553aade Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Wed, 14 Aug 2030 15:56:40 +0800
Subject: sched: fix the race between walk_tg_tree and sched_create_group

With 2.6.27-rc3, I hit a kernel panic when running volanoMark on my
new x86_64 machine. I also hit it with other 2.6.27-rc kernels.
See below log.

Basically, function walk_tg_tree and sched_create_group have a race
between accessing and initiating tg->children. Below patch fixes it
by moving tg->children initiation to the front of linking tg->siblings
to parent->children.

{----------------panic log------------}

BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
IP: [<ffffffff802292ab>] walk_tg_tree+0x45/0x7f
PGD 1be1c4067 PUD 1bdd8d067 PMD 0
Oops: 0000 [1] SMP
CPU 11
Modules linked in: igb
Pid: 22979, comm: java Not tainted 2.6.27-rc3 #1
RIP: 0010:[<ffffffff802292ab>]  [<ffffffff802292ab>] walk_tg_tree+0x45/0x7f
RSP: 0018:ffff8801bfbbbd18  EFLAGS: 00010083
RAX: 0000000000000000 RBX: ffff8800be0dce40 RCX: ffffffffffffffc0
RDX: ffff880102c43740 RSI: 0000000000000000 RDI: ffff8800be0dce40
RBP: ffff8801bfbbbd48 R08: ffff8800ba437bc8 R09: 0000000000001f40
R10: ffff8801be812100 R11: ffffffff805fdf44 R12: ffff880102c43740
R13: 0000000000000000 R14: ffffffff8022cf0f R15: ffffffff8022749f
FS:  00000000568ac950(0063) GS:ffff8801bfa26d00(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000000 CR3: 00000001bd848000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process java (pid: 22979, threadinfo ffff8801b145a000, task ffff8801bf18e450)
Stack:  0000000000000001 ffff8800ba5c8d60 0000000000000001 0000000000000001
 ffff8800bad1ccb8 0000000000000000 ffff8801bfbbbd98 ffffffff8022ed37
 0000000000000001 0000000000000286 ffff8801bd5ee180 ffff8800ba437bc8
Call Trace:
 <IRQ>  [<ffffffff8022ed37>] try_to_wake_up+0x71/0x24c
 [<ffffffff80247177>] autoremove_wake_function+0x9/0x2e
 [<ffffffff80228039>] ? __wake_up_common+0x46/0x76
 [<ffffffff802296d5>] __wake_up+0x38/0x4f
 [<ffffffff806169cc>] tcp_v4_rcv+0x380/0x62e

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..8bf8a5528bc7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8462,8 +8462,8 @@ struct task_group *sched_create_group(struct task_group *parent)
 	WARN_ON(!parent); /* root should already exist */
 
 	tg->parent = parent;
-	list_add_rcu(&tg->siblings, &parent->children);
 	INIT_LIST_HEAD(&tg->children);
+	list_add_rcu(&tg->siblings, &parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	return tg;
-- 
cgit v1.2.3


From cf417141cbb3a4ceb5cca15b2c1f099bd0a6603c Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Mon, 11 Aug 2008 14:33:53 -0700
Subject: sched, cpuset: rework sched domains and CPU hotplug handling (v4)

This is an updated version of my previous cpuset patch on top of
the latest mainline git.
The patch fixes CPU hotplug handling issues in the current cpusets code.
Namely circular locking in rebuild_sched_domains() and unsafe access to
the cpu_online_map in the cpuset cpu hotplug handler.

This version includes changes suggested by Paul Jackson (naming, comments,
style, etc). I also got rid of the separate workqueue thread because it is
now safe to call get_online_cpus() from workqueue callbacks.

Here are some more details:

rebuild_sched_domains() is the only way to rebuild sched domains
correctly based on the current cpuset settings. What this means
is that we need to be able to call it from different contexts,
like cpu hotplug for example.
Also latest scheduler code in -tip now calls rebuild_sched_domains()
directly from functions like arch_reinit_sched_domains().

In order to support that properly we need to rework cpuset locking
rules to avoid circular dependencies, which is what this patch does.
New lock nesting rules are explained in the comments.
We can now safely call rebuild_sched_domains() from virtually any
context. The only requirement is that it needs to be called under
get_online_cpus(). This allows cpu hotplug handlers and the scheduler
to call rebuild_sched_domains() directly.
The rest of the cpuset code now offloads sched domains rebuilds to
a workqueue (async_rebuild_sched_domains()).

This version of the patch addresses comments from the previous review.
I fixed all miss-formated comments and trailing spaces.

I also factored out the code that builds domain masks and split up CPU and
memory hotplug handling. This was needed to simplify locking, to avoid unsafe
access to the cpu_online_map from mem hotplug handler, and in general to make
things cleaner.

The patch passes moderate testing (building kernel with -j 16, creating &
removing domains and bringing cpus off/online at the same time) on the
quad-core2 based machine.

It passes lockdep checks, even with preemptable RCU enabled.
This time I also tested in with suspend/resume path and everything is working
as expected.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Cc: menage@google.com
Cc: a.p.zijlstra@chello.nl
Cc: vegard.nossum@gmail.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpuset.c | 312 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 182 insertions(+), 130 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..f227bc172690 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
  *  2003-10-22 Updates by Stephen Hemminger.
  *  2004 May-July Rework by Paul Jackson.
  *  2006 Rework by Paul Menage to use generic cgroups
+ *  2008 Rework of the scheduler domains and CPU hotplug handling
+ *       by Max Krasnyansky
  *
  *  This file is subject to the terms and conditions of the GNU General Public
  *  License.  See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
 
 static DEFINE_MUTEX(callback_mutex);
 
-/* This is ugly, but preserves the userspace API for existing cpuset
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
  * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead */
+ * silently switch it to mount "cgroup" instead
+ */
 static int cpuset_get_sb(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
 			 void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 }
 
 /*
- * Helper routine for rebuild_sched_domains().
+ * Helper routine for generate_sched_domains().
  * Do cpusets a, b have overlapping cpus_allowed masks?
  */
-
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
 	return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 }
 
 /*
- * rebuild_sched_domains()
- *
- * This routine will be called to rebuild the scheduler's dynamic
- * sched domains:
- * - if the flag 'sched_load_balance' of any cpuset with non-empty
- *   'cpus' changes,
- * - or if the 'cpus' allowed changes in any cpuset which has that
- *   flag enabled,
- * - or if the 'sched_relax_domain_level' of any cpuset which has
- *   that flag enabled and with non-empty 'cpus' changes,
- * - or if any cpuset with non-empty 'cpus' is removed,
- * - or if a cpu gets offlined.
- *
- * This routine builds a partial partition of the systems CPUs
- * (the set of non-overlappping cpumask_t's in the array 'part'
- * below), and passes that partial partition to the kernel/sched.c
- * partition_sched_domains() routine, which will rebuild the
- * schedulers load balancing domains (sched domains) as specified
- * by that partial partition.  A 'partial partition' is a set of
- * non-overlapping subsets whose union is a subset of that set.
+ * generate_sched_domains()
+ *
+ * This function builds a partial partition of the systems CPUs
+ * A 'partial partition' is a set of non-overlapping subsets whose
+ * union is a subset of that set.
+ * The output of this function needs to be passed to kernel/sched.c
+ * partition_sched_domains() routine, which will rebuild the scheduler's
+ * load balancing domains (sched domains) as specified by that partial
+ * partition.
  *
  * See "What is sched_load_balance" in Documentation/cpusets.txt
  * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  * domains when operating in the severe memory shortage situations
  * that could cause allocation failures below.
  *
- * Call with cgroup_mutex held.  May take callback_mutex during
- * call due to the kfifo_alloc() and kmalloc() calls.  May nest
- * a call to the get_online_cpus()/put_online_cpus() pair.
- * Must not be called holding callback_mutex, because we must not
- * call get_online_cpus() while holding callback_mutex.  Elsewhere
- * the kernel nests callback_mutex inside get_online_cpus() calls.
- * So the reverse nesting would risk an ABBA deadlock.
+ * Must be called with cgroup_lock held.
  *
  * The three key local variables below are:
  *    q  - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  *	element of the partition (one sched domain) to be passed to
  *	partition_sched_domains().
  */
-
-void rebuild_sched_domains(void)
+static int generate_sched_domains(cpumask_t **domains,
+			struct sched_domain_attr **attributes)
 {
-	LIST_HEAD(q);		/* queue of cpusets to be scanned*/
+	LIST_HEAD(q);		/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
 	int ndoms;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] cpumask_t slot */
 
-	csa = NULL;
+	ndoms = 0;
 	doms = NULL;
 	dattr = NULL;
+	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		ndoms = 1;
 		doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 		if (!doms)
-			goto rebuild;
+			goto done;
+
 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 		if (dattr) {
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		*doms = top_cpuset.cpus_allowed;
-		goto rebuild;
+
+		ndoms = 1;
+		goto done;
 	}
 
 	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
 		}
 	}
 
-	/* Convert <csn, csa> to <ndoms, doms> */
+	/*
+	 * Now we know how many domains to create.
+	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+	 */
 	doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
-	if (!doms)
-		goto rebuild;
+	if (!doms) {
+		ndoms = 0;
+		goto done;
+	}
+
+	/*
+	 * The rest of the code, including the scheduler, can deal with
+	 * dattr==NULL case. No need to abort if alloc fails.
+	 */
 	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
 
 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
+		cpumask_t *dp;
 		int apn = a->pn;
 
-		if (apn >= 0) {
-			cpumask_t *dp = doms + nslot;
-
-			if (nslot == ndoms) {
-				static int warnings = 10;
-				if (warnings) {
-					printk(KERN_WARNING
-					 "rebuild_sched_domains confused:"
-					  " nslot %d, ndoms %d, csn %d, i %d,"
-					  " apn %d\n",
-					  nslot, ndoms, csn, i, apn);
-					warnings--;
-				}
-				continue;
+		if (apn < 0) {
+			/* Skip completed partitions */
+			continue;
+		}
+
+		dp = doms + nslot;
+
+		if (nslot == ndoms) {
+			static int warnings = 10;
+			if (warnings) {
+				printk(KERN_WARNING
+				 "rebuild_sched_domains confused:"
+				  " nslot %d, ndoms %d, csn %d, i %d,"
+				  " apn %d\n",
+				  nslot, ndoms, csn, i, apn);
+				warnings--;
 			}
+			continue;
+		}
 
-			cpus_clear(*dp);
-			if (dattr)
-				*(dattr + nslot) = SD_ATTR_INIT;
-			for (j = i; j < csn; j++) {
-				struct cpuset *b = csa[j];
-
-				if (apn == b->pn) {
-					cpus_or(*dp, *dp, b->cpus_allowed);
-					b->pn = -1;
-					if (dattr)
-						update_domain_attr_tree(dattr
-								   + nslot, b);
-				}
+		cpus_clear(*dp);
+		if (dattr)
+			*(dattr + nslot) = SD_ATTR_INIT;
+		for (j = i; j < csn; j++) {
+			struct cpuset *b = csa[j];
+
+			if (apn == b->pn) {
+				cpus_or(*dp, *dp, b->cpus_allowed);
+				if (dattr)
+					update_domain_attr_tree(dattr + nslot, b);
+
+				/* Done with this partition */
+				b->pn = -1;
 			}
-			nslot++;
 		}
+		nslot++;
 	}
 	BUG_ON(nslot != ndoms);
 
-rebuild:
-	/* Have scheduler rebuild sched domains */
+done:
+	kfree(csa);
+
+	*domains    = doms;
+	*attributes = dattr;
+	return ndoms;
+}
+
+/*
+ * Rebuild scheduler domains.
+ *
+ * Call with neither cgroup_mutex held nor within get_online_cpus().
+ * Takes both cgroup_mutex and get_online_cpus().
+ *
+ * Cannot be directly called from cpuset code handling changes
+ * to the cpuset pseudo-filesystem, because it cannot be called
+ * from code that already holds cgroup_mutex.
+ */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+	struct sched_domain_attr *attr;
+	cpumask_t *doms;
+	int ndoms;
+
 	get_online_cpus();
-	partition_sched_domains(ndoms, doms, dattr);
+
+	/* Generate domain masks and attrs */
+	cgroup_lock();
+	ndoms = generate_sched_domains(&doms, &attr);
+	cgroup_unlock();
+
+	/* Have scheduler rebuild the domains */
+	partition_sched_domains(ndoms, doms, attr);
+
 	put_online_cpus();
+}
 
-done:
-	kfree(csa);
-	/* Don't kfree(doms) -- partition_sched_domains() does that. */
-	/* Don't kfree(dattr) -- partition_sched_domains() does that. */
+static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
+
+/*
+ * Rebuild scheduler domains, asynchronously via workqueue.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * The rebuild_sched_domains() and partition_sched_domains()
+ * routines must nest cgroup_lock() inside get_online_cpus(),
+ * but such cpuset changes as these must nest that locking the
+ * other way, holding cgroup_lock() for much of the code.
+ *
+ * So in order to avoid an ABBA deadlock, the cpuset code handling
+ * these user changes delegates the actual sched domain rebuilding
+ * to a separate workqueue thread, which ends up processing the
+ * above do_rebuild_sched_domains() function.
+ */
+static void async_rebuild_sched_domains(void)
+{
+	schedule_work(&rebuild_sched_domains_work);
+}
+
+/*
+ * Accomplishes the same scheduler domain rebuild as the above
+ * async_rebuild_sched_domains(), however it directly calls the
+ * rebuild routine synchronously rather than calling it via an
+ * asynchronous work thread.
+ *
+ * This can only be called from code that is not holding
+ * cgroup_mutex (not nested in a cgroup_lock() call.)
+ */
+void rebuild_sched_domains(void)
+{
+	do_rebuild_sched_domains(NULL);
 }
 
 /**
@@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 		return retval;
 
 	if (is_load_balanced)
-		rebuild_sched_domains();
+		async_rebuild_sched_domains();
 	return 0;
 }
 
@@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
 		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
-			rebuild_sched_domains();
+			async_rebuild_sched_domains();
 	}
 
 	return 0;
@@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	mutex_unlock(&callback_mutex);
 
 	if (cpus_nonempty && balance_flag_changed)
-		rebuild_sched_domains();
+		async_rebuild_sched_domains();
 
 	return 0;
 }
@@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
 	default:
 		BUG();
 	}
+
+	/* Unreachable but makes gcc happy */
+	return 0;
 }
 
 static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
 	default:
 		BUG();
 	}
+
+	/* Unrechable but makes gcc happy */
+	return 0;
 }
 
 
@@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create(
 }
 
 /*
- * Locking note on the strange update_flag() call below:
- *
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains().  The get_online_cpus()
- * call in rebuild_sched_domains() must not be made while holding
- * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
- * get_online_cpus() calls.  So the reverse nesting would risk an
- * ABBA deadlock.
+ * will call async_rebuild_sched_domains().
  */
 
 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 struct cgroup_subsys cpuset_subsys = {
 	.name = "cpuset",
 	.create = cpuset_create,
-	.destroy  = cpuset_destroy,
+	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
@@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 }
 
 /*
- * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets.  If this removes the
  * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1902,35 +1971,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 	}
 }
 
-/*
- * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug event.
- *
- * Since there are two callers of this routine, one for CPU hotplug
- * events and one for memory node hotplug events, we could have coded
- * two separate routines here.  We code it as a single common routine
- * in order to minimize text size.
- */
-
-static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
-{
-	cgroup_lock();
-
-	top_cpuset.cpus_allowed = cpu_online_map;
-	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-	scan_for_empty_cpusets(&top_cpuset);
-
-	/*
-	 * Scheduler destroys domains on hotplug events.
-	 * Rebuild them based on the current settings.
-	 */
-	if (rebuild_sd)
-		rebuild_sched_domains();
-
-	cgroup_unlock();
-}
-
 /*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
@@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_online_map on each CPU hotplug (cpuhp) event.
+ *
+ * Called within get_online_cpus().  Needs to call cgroup_lock()
+ * before calling generate_sched_domains().
  */
-
-static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
+	struct sched_domain_attr *attr;
+	cpumask_t *doms;
+	int ndoms;
+
 	switch (phase) {
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		common_cpu_mem_hotplug_unplug(1);
 		break;
+
 	default:
 		return NOTIFY_DONE;
 	}
 
+	cgroup_lock();
+	top_cpuset.cpus_allowed = cpu_online_map;
+	scan_for_empty_cpusets(&top_cpuset);
+	ndoms = generate_sched_domains(&doms, &attr);
+	cgroup_unlock();
+
+	/* Have scheduler rebuild the domains */
+	partition_sched_domains(ndoms, doms, attr);
+
 	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after you change
- * node_states[N_HIGH_MEMORY].
- * See also the previous routine cpuset_handle_cpuhp().
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
+ * See also the previous routine cpuset_track_online_cpus().
  */
-
 void cpuset_track_online_nodes(void)
 {
-	common_cpu_mem_hotplug_unplug(0);
+	cgroup_lock();
+	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+	scan_for_empty_cpusets(&top_cpuset);
+	cgroup_unlock();
 }
 #endif
 
@@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void)
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_handle_cpuhp, 0);
+	hotcpu_notifier(cpuset_track_online_cpus, 0);
 }
 
 /**
-- 
cgit v1.2.3


From 5cd9c58fbe9ec92b45b27e131719af4f2bd9eb40 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 14 Aug 2008 11:37:28 +0100
Subject: security: Fix setting of PF_SUPERPRIV by __capable()

Fix the setting of PF_SUPERPRIV by __capable() as it could corrupt the flags
the target process if that is not the current process and it is trying to
change its own flags in a different way at the same time.

__capable() is using neither atomic ops nor locking to protect t->flags.  This
patch removes __capable() and introduces has_capability() that doesn't set
PF_SUPERPRIV on the process being queried.

This patch further splits security_ptrace() in two:

 (1) security_ptrace_may_access().  This passes judgement on whether one
     process may access another only (PTRACE_MODE_ATTACH for ptrace() and
     PTRACE_MODE_READ for /proc), and takes a pointer to the child process.
     current is the parent.

 (2) security_ptrace_traceme().  This passes judgement on PTRACE_TRACEME only,
     and takes only a pointer to the parent process.  current is the child.

     In Smack and commoncap, this uses has_capability() to determine whether
     the parent will be permitted to use PTRACE_ATTACH if normal checks fail.
     This does not set PF_SUPERPRIV.

Two of the instances of __capable() actually only act on current, and so have
been changed to calls to capable().

Of the places that were using __capable():

 (1) The OOM killer calls __capable() thrice when weighing the killability of a
     process.  All of these now use has_capability().

 (2) cap_ptrace() and smack_ptrace() were using __capable() to check to see
     whether the parent was allowed to trace any process.  As mentioned above,
     these have been split.  For PTRACE_ATTACH and /proc, capable() is now
     used, and for PTRACE_TRACEME, has_capability() is used.

 (3) cap_safe_nice() only ever saw current, so now uses capable().

 (4) smack_setprocattr() rejected accesses to tasks other than current just
     after calling __capable(), so the order of these two tests have been
     switched and capable() is used instead.

 (5) In smack_file_send_sigiotask(), we need to allow privileged processes to
     receive SIGIO on files they're manipulating.

 (6) In smack_task_wait(), we let a process wait for a privileged process,
     whether or not the process doing the waiting is privileged.

I've tested this with the LTP SELinux and syscalls testscripts.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/capability.c | 21 +++++++++++++--------
 kernel/ptrace.c     |  5 ++---
 2 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/capability.c b/kernel/capability.c
index 0101e847603e..33e51e78c2d8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 	return ret;
 }
 
-int __capable(struct task_struct *t, int cap)
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int capable(int cap)
 {
-	if (security_capable(t, cap) == 0) {
-		t->flags |= PF_SUPERPRIV;
+	if (has_capability(current, cap)) {
+		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
 	return 0;
 }
-
-int capable(int cap)
-{
-	return __capable(current, cap);
-}
 EXPORT_SYMBOL(capable);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082b3fcb32a0..356699a96d56 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	if (!dumpable && !capable(CAP_SYS_PTRACE))
 		return -EPERM;
 
-	return security_ptrace(current, task, mode);
+	return security_ptrace_may_access(task, mode);
 }
 
 bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -499,8 +499,7 @@ repeat:
 			goto repeat;
 		}
 
-		ret = security_ptrace(current->parent, current,
-				      PTRACE_MODE_ATTACH);
+		ret = security_ptrace_traceme(current->parent);
 
 		/*
 		 * Set the ptrace bit in the process ptrace flags.
-- 
cgit v1.2.3


From f1679d08480008e06fd619c71635ed33274e2595 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 Aug 2008 15:49:00 +0200
Subject: sched: fix rt-bandwidth hotplug race

When we hot-unplug a cpu and rebuild the sched-domain, all cpus will be
detatched. Alex observed the case where a runqueue was stealing bandwidth
from an already disabled runqueue to satisfy its own needs.

Stop this by skipping over already disabled runqueues.

Reported-by: Alex Nixon <alex.nixon@citrix.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Alex Nixon <alex.nixon@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 6163e4cf885b..998ba54b4543 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -298,7 +298,7 @@ static void __disable_runtime(struct rq *rq)
 			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 			s64 diff;
 
-			if (iter == rt_rq)
+			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 				continue;
 
 			spin_lock(&iter->rt_runtime_lock);
-- 
cgit v1.2.3


From 293a17ebc944c958e24e6ffbd1d5a49abdbf489e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 12 Aug 2008 17:25:03 -0700
Subject: rcu: prevent console flood when one CPU sees another AWOL via RCU

One small change needed to keep from flooding the console when one
CPU notices that another is AWOL.  Unless I am missing something subtle.
Otherwise the cleanups look good!

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 56b8712fe5aa..dab2676d9d72 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -308,6 +308,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 		spin_unlock(&rcp->lock);
 		return;
 	}
+	rcp->gp_check = get_seconds() + 30;
 	spin_unlock(&rcp->lock);
 
 	/* OK, time to rat on our buddy... */
-- 
cgit v1.2.3


From 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 5 Aug 2008 09:21:44 -0700
Subject: rcu: classic RCU locking and memory-barrier cleanups

This patch simplifies the locking and memory-barrier usage in the Classic
RCU grace-period-detection mechanism, incorporating Lai Jiangshan's
feedback from the earlier version (http://lkml.org/lkml/2008/8/1/400
and http://lkml.org/lkml/2008/8/3/43).  Passed 10 hours of
rcutorture concurrent with CPUs being put online and taken offline on
a 128-hardware-thread Power machine.  My apologies to whoever in the
Eastern Hemisphere was planning to use this machine over the Western
Hemisphere night, but it was sitting idle and...

So this is ready for tip/core/rcu.

This patch is in preparation for moving to a hierarchical
algorithm to allow the very large SMP machines -- requested by some
people at OLS, and there seem to have been a few recent patches in the
4096-CPU direction as well.  The general idea is to move to a much more
conservative concurrency design, then apply a hierarchy to reduce
contention on the global lock by a few orders of magnitude (larger
machines would see greater reductions).  The reason for taking a
conservative approach is that this code isn't on any fast path.

Prototype in progress.

This patch is against the linux-tip git tree (tip/core/rcu).  If you
wish to test this against 2.6.26, use the following set of patches:

http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimp-1.patch
http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimpfix-3.patch

The first patch combines commits 5127bed588a2f8f3a1f732de2a8a190b7df5dce3
and 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 from Lai Jiangshan
<laijs@cn.fujitsu.com>, and the second patch contains my changes.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 51 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index dab2676d9d72..5de126630b10 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -87,6 +87,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 	int cpu;
 	cpumask_t cpumask;
 	set_need_resched();
+	spin_lock(&rcp->lock);
 	if (unlikely(!rcp->signaled)) {
 		rcp->signaled = 1;
 		/*
@@ -112,6 +113,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		for_each_cpu_mask_nr(cpu, cpumask)
 			smp_send_reschedule(cpu);
 	}
+	spin_unlock(&rcp->lock);
 }
 #else
 static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -125,7 +127,9 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
 		struct rcu_data *rdp)
 {
 	long batch;
-	smp_mb(); /* reads the most recently updated value of rcu->cur. */
+
+	head->next = NULL;
+	smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
 
 	/*
 	 * Determine the batch number of this callback.
@@ -175,7 +179,6 @@ void call_rcu(struct rcu_head *head,
 	unsigned long flags;
 
 	head->func = func;
-	head->next = NULL;
 	local_irq_save(flags);
 	__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
 	local_irq_restore(flags);
@@ -204,7 +207,6 @@ void call_rcu_bh(struct rcu_head *head,
 	unsigned long flags;
 
 	head->func = func;
-	head->next = NULL;
 	local_irq_save(flags);
 	__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
 	local_irq_restore(flags);
@@ -467,17 +469,17 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
 				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
-	/* if the cpu going offline owns the grace period
+	/*
+	 * if the cpu going offline owns the grace period
 	 * we can block indefinitely waiting for it, so flush
 	 * it here
 	 */
 	spin_lock_bh(&rcp->lock);
 	if (rcp->cur != rcp->completed)
 		cpu_quiet(rdp->cpu, rcp);
-	spin_unlock_bh(&rcp->lock);
-	/* spin_lock implies smp_mb() */
 	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
 	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
+	spin_unlock_bh(&rcp->lock);
 
 	local_irq_disable();
 	this_rdp->qlen += rdp->qlen;
@@ -511,16 +513,19 @@ static void rcu_offline_cpu(int cpu)
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
+	long completed_snap;
+
 	if (rdp->nxtlist) {
 		local_irq_disable();
+		completed_snap = ACCESS_ONCE(rcp->completed);
 
 		/*
 		 * move the other grace-period-completed entries to
 		 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
 		 */
-		if (!rcu_batch_before(rcp->completed, rdp->batch))
+		if (!rcu_batch_before(completed_snap, rdp->batch))
 			rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
-		else if (!rcu_batch_before(rcp->completed, rdp->batch - 1))
+		else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
 			rdp->nxttail[0] = rdp->nxttail[1];
 
 		/*
@@ -561,8 +566,24 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
+	/*
+	 * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be see before any RCU
+	 * grace-period manupulations below.
+	 */
+
+	smp_mb(); /* See above block comment. */
+
 	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
 	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+
+	/*
+	 * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be see after any RCU
+	 * grace-period manupulations above.
+	 */
+
+	smp_mb(); /* See above block comment. */
 }
 
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
@@ -571,13 +592,15 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 	check_cpu_stall(rcp, rdp);
 
 	if (rdp->nxtlist) {
+		long completed_snap = ACCESS_ONCE(rcp->completed);
+
 		/*
 		 * This cpu has pending rcu entries and the grace period
 		 * for them has completed.
 		 */
-		if (!rcu_batch_before(rcp->completed, rdp->batch))
+		if (!rcu_batch_before(completed_snap, rdp->batch))
 			return 1;
-		if (!rcu_batch_before(rcp->completed, rdp->batch - 1) &&
+		if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
 				rdp->nxttail[0] != rdp->nxttail[1])
 			return 1;
 		if (rdp->nxttail[0] != &rdp->nxtlist)
@@ -628,6 +651,12 @@ int rcu_needs_cpu(int cpu)
 	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
 }
 
+/*
+ * Top-level function driving RCU grace-period detection, normally
+ * invoked from the scheduler-clock interrupt.  This function simply
+ * increments counters that are read only from softirq by this same
+ * CPU, so there are no memory barriers required.
+ */
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
@@ -671,6 +700,7 @@ void rcu_check_callbacks(int cpu, int user)
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
+	spin_lock(&rcp->lock);
 	memset(rdp, 0, sizeof(*rdp));
 	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
 	rdp->donetail = &rdp->donelist;
@@ -678,6 +708,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
 	rdp->blimit = blimit;
+	spin_unlock(&rcp->lock);
 }
 
 static void __cpuinit rcu_online_cpu(int cpu)
-- 
cgit v1.2.3


From 4cd69b986ebf0f8da93f82ffbb89c032ee09c2e1 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 15 Aug 2008 00:40:20 -0700
Subject: kexec: fix compilation warning on xchg(&kexec_lock, 0) in
 kernel_kexec()

kernel/kexec.c: In function 'kernel_kexec':
kernel/kexec.c:1506: warning: value computed is not used

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8a4370e2a34..cf3797b76786 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1503,7 +1503,8 @@ int kernel_kexec(void)
 	}
 
  Unlock:
-	xchg(&kexec_lock, 0);
+	if (!xchg(&kexec_lock, 0))
+		BUG();
 
 	return error;
 }
-- 
cgit v1.2.3


From 7ade3fcc1fe2801336112027a884070c9ca451af Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 15 Aug 2008 00:40:21 -0700
Subject: kexec jump: clean up #ifdef and comments

Move if (kexec_image->preserve_context) { ...  } into #ifdef
CONFIG_KEXEC_JUMP to make code looks cleaner.

Fix no longer correct comments of kernel_kexec().

Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index cf3797b76786..bfbbd120623c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1426,11 +1426,9 @@ static int __init crash_save_vmcoreinfo_init(void)
 
 module_init(crash_save_vmcoreinfo_init)
 
-/**
- *	kernel_kexec - reboot the system
- *
- *	Move into place and start executing a preloaded standalone
- *	executable.  If nothing was preloaded return an error.
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable.  If nothing was preloaded return an error.
  */
 int kernel_kexec(void)
 {
@@ -1443,8 +1441,8 @@ int kernel_kexec(void)
 		goto Unlock;
 	}
 
-	if (kexec_image->preserve_context) {
 #ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
 		mutex_lock(&pm_mutex);
 		pm_prepare_console();
 		error = freeze_processes();
@@ -1471,8 +1469,9 @@ int kernel_kexec(void)
 		if (error)
 			goto Enable_irqs;
 		save_processor_state();
+	} else
 #endif
-	} else {
+	{
 		blocking_notifier_call_chain(&reboot_notifier_list,
 					     SYS_RESTART, NULL);
 		system_state = SYSTEM_RESTART;
@@ -1484,8 +1483,8 @@ int kernel_kexec(void)
 
 	machine_kexec(kexec_image);
 
-	if (kexec_image->preserve_context) {
 #ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
 		restore_processor_state();
 		device_power_up(PMSG_RESTORE);
  Enable_irqs:
@@ -1499,8 +1498,8 @@ int kernel_kexec(void)
  Restore_console:
 		pm_restore_console();
 		mutex_unlock(&pm_mutex);
-#endif
 	}
+#endif
 
  Unlock:
 	if (!xchg(&kexec_lock, 0))
-- 
cgit v1.2.3


From 163f6876f5c3ff8215e900b93779e960a56b3694 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 15 Aug 2008 00:40:22 -0700
Subject: kexec jump: rename KEXEC_CONTROL_CODE_SIZE to KEXEC_CONTROL_PAGE_SIZE

Rename KEXEC_CONTROL_CODE_SIZE to KEXEC_CONTROL_PAGE_SIZE, because control
page is used for not only code on some platform.  For example in kexec
jump, it is used for data and stack too.

[akpm@linux-foundation.org: unbreak powerpc and arm, finish conversion]
Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index bfbbd120623c..2810558802b6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -77,7 +77,7 @@ int kexec_should_crash(struct task_struct *p)
  *
  * The code for the transition from the current kernel to the
  * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
+ * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
  * page of memory is necessary, but some architectures require more.
  * Because this memory must be identity mapped in the transition from
  * virtual to physical addresses it must live in the range
@@ -242,7 +242,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 	 */
 	result = -ENOMEM;
 	image->control_code_page = kimage_alloc_control_pages(image,
-					   get_order(KEXEC_CONTROL_CODE_SIZE));
+					   get_order(KEXEC_CONTROL_PAGE_SIZE));
 	if (!image->control_code_page) {
 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
 		goto out;
@@ -317,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 	 */
 	result = -ENOMEM;
 	image->control_code_page = kimage_alloc_control_pages(image,
-					   get_order(KEXEC_CONTROL_CODE_SIZE));
+					   get_order(KEXEC_CONTROL_PAGE_SIZE));
 	if (!image->control_code_page) {
 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
 		goto out;
-- 
cgit v1.2.3


From ca195b7f6da3d5dde0bb85a7c322d7de73352653 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 15 Aug 2008 00:40:24 -0700
Subject: kexec jump: remove duplication of kexec_restart_prepare()

Call kernel_restart_prepare() in kernel_kexec() instead of duplicating the
code.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Pavel Machek <pavel@suse.cz>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 6 +-----
 kernel/sys.c   | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2810558802b6..b81682312dc4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1472,11 +1472,7 @@ int kernel_kexec(void)
 	} else
 #endif
 	{
-		blocking_notifier_call_chain(&reboot_notifier_list,
-					     SYS_RESTART, NULL);
-		system_state = SYSTEM_RESTART;
-		device_shutdown();
-		sysdev_shutdown();
+		kernel_restart_prepare(NULL);
 		printk(KERN_EMERG "Starting new kernel\n");
 		machine_shutdown();
 	}
diff --git a/kernel/sys.c b/kernel/sys.c
index c01858090a98..3dacb00a7f76 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -274,7 +274,7 @@ void emergency_restart(void)
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
 
-static void kernel_restart_prepare(char *cmd)
+void kernel_restart_prepare(char *cmd)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
-- 
cgit v1.2.3


From 73bd9c72a29be1e8de008186eea55d333a938804 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 15 Aug 2008 00:40:24 -0700
Subject: kexec jump: in sync with hibernation implementation

Add device_pm_lock() and device_pm_unlock() in kernel_kexec() in sync with
current hibernation implementation.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index b81682312dc4..17c80fdc453b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1457,6 +1457,7 @@ int kernel_kexec(void)
 		error = disable_nonboot_cpus();
 		if (error)
 			goto Resume_devices;
+		device_pm_lock();
 		local_irq_disable();
 		/* At this point, device_suspend() has been called,
 		 * but *not* device_power_down(). We *must*
@@ -1485,6 +1486,7 @@ int kernel_kexec(void)
 		device_power_up(PMSG_RESTORE);
  Enable_irqs:
 		local_irq_enable();
+		device_pm_unlock();
 		enable_nonboot_cpus();
  Resume_devices:
 		device_resume(PMSG_RESTORE);
-- 
cgit v1.2.3


From 3122c331190e9d1622bf1c8cf6ce3b17cca67c9e Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 15 Aug 2008 00:40:26 -0700
Subject: kexec jump: fix for ftrace

Ftrace depends on some processor state that we destroyed during kexec and
restored by restore_processor_state().  So save_processor_state() and
restore_processor_state() are moved into machine_kexec() and ftrace is
restored after restore_processor_state().

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 17c80fdc453b..9fc6f7cbd8a8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1469,7 +1469,6 @@ int kernel_kexec(void)
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
 			goto Enable_irqs;
-		save_processor_state();
 	} else
 #endif
 	{
@@ -1482,7 +1481,6 @@ int kernel_kexec(void)
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
-		restore_processor_state();
 		device_power_up(PMSG_RESTORE);
  Enable_irqs:
 		local_irq_enable();
-- 
cgit v1.2.3


From 8c5a1cf0ad3ac5fcdf51314a63b16a440870f6a2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 15 Aug 2008 00:40:27 -0700
Subject: kexec: use a mutex for locking rather than xchg()

Functionally the same, but more conventional.

Cc: Huang Ying <ying.huang@intel.com>
Tested-by: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9fc6f7cbd8a8..59f3f0df35d4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -12,7 +12,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/kexec.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
@@ -924,19 +924,14 @@ static int kimage_load_segment(struct kimage *image,
  */
 struct kimage *kexec_image;
 struct kimage *kexec_crash_image;
-/*
- * A home grown binary mutex.
- * Nothing can wait so this mutex is safe to use
- * in interrupt context :)
- */
-static int kexec_lock;
+
+static DEFINE_MUTEX(kexec_mutex);
 
 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 				struct kexec_segment __user *segments,
 				unsigned long flags)
 {
 	struct kimage **dest_image, *image;
-	int locked;
 	int result;
 
 	/* We only trust the superuser with rebooting the system. */
@@ -972,8 +967,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 	 *
 	 * KISS: always take the mutex.
 	 */
-	locked = xchg(&kexec_lock, 1);
-	if (locked)
+	if (!mutex_trylock(&kexec_mutex))
 		return -EBUSY;
 
 	dest_image = &kexec_image;
@@ -1015,8 +1009,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 	image = xchg(dest_image, image);
 
 out:
-	locked = xchg(&kexec_lock, 0); /* Release the mutex */
-	BUG_ON(!locked);
+	mutex_unlock(&kexec_mutex);
 	kimage_free(image);
 
 	return result;
@@ -1063,10 +1056,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 
 void crash_kexec(struct pt_regs *regs)
 {
-	int locked;
-
-
-	/* Take the kexec_lock here to prevent sys_kexec_load
+	/* Take the kexec_mutex here to prevent sys_kexec_load
 	 * running on one cpu from replacing the crash kernel
 	 * we are using after a panic on a different cpu.
 	 *
@@ -1074,8 +1064,7 @@ void crash_kexec(struct pt_regs *regs)
 	 * of memory the xchg(&kexec_crash_image) would be
 	 * sufficient.  But since I reuse the memory...
 	 */
-	locked = xchg(&kexec_lock, 1);
-	if (!locked) {
+	if (mutex_trylock(&kexec_mutex)) {
 		if (kexec_crash_image) {
 			struct pt_regs fixed_regs;
 			crash_setup_regs(&fixed_regs, regs);
@@ -1083,8 +1072,7 @@ void crash_kexec(struct pt_regs *regs)
 			machine_crash_shutdown(&fixed_regs);
 			machine_kexec(kexec_crash_image);
 		}
-		locked = xchg(&kexec_lock, 0);
-		BUG_ON(!locked);
+		mutex_unlock(&kexec_mutex);
 	}
 }
 
@@ -1434,7 +1422,7 @@ int kernel_kexec(void)
 {
 	int error = 0;
 
-	if (xchg(&kexec_lock, 1))
+	if (!mutex_trylock(&kexec_mutex))
 		return -EBUSY;
 	if (!kexec_image) {
 		error = -EINVAL;
@@ -1498,8 +1486,6 @@ int kernel_kexec(void)
 #endif
 
  Unlock:
-	if (!xchg(&kexec_lock, 0))
-		BUG();
-
+	mutex_unlock(&kexec_mutex);
 	return error;
 }
-- 
cgit v1.2.3


From be4de35263f59ca1f4740edfffbfb02cc3f2189e Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Fri, 15 Aug 2008 00:40:44 -0700
Subject: completions: uninline try_wait_for_completion and completion_done

m68k fails to build with these functions inlined in completion.h.  Move
them out of line into sched.c and export them to avoid this problem.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..95e6ad3c231d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4669,6 +4669,52 @@ int __sched wait_for_completion_killable(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_killable);
 
+/**
+ *	try_wait_for_completion - try to decrement a completion without blocking
+ *	@x:	completion structure
+ *
+ *	Returns: 0 if a decrement cannot be done without blocking
+ *		 1 if a decrement succeeded.
+ *
+ *	If a completion is being used as a counting completion,
+ *	attempt to decrement the counter without blocking. This
+ *	enables us to avoid waiting if the resource the completion
+ *	is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+	int ret = 1;
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done)
+		ret = 0;
+	else
+		x->done--;
+	spin_unlock_irq(&x->wait.lock);
+	return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ *	completion_done - Test to see if a completion has any waiters
+ *	@x:	completion structure
+ *
+ *	Returns: 0 if there are waiters (wait_for_completion() in progress)
+ *		 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+	int ret = 1;
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done)
+		ret = 0;
+	spin_unlock_irq(&x->wait.lock);
+	return ret;
+}
+EXPORT_SYMBOL(completion_done);
+
 static long __sched
 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
 {
-- 
cgit v1.2.3


From 5802294f1b1895ee19a3d0ae72805da453afb9de Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 30 Jul 2008 14:20:55 -0400
Subject: rcu: trace fix possible mem-leak

In the initialization of the RCU trace module, if
rcupreempt_debugfs_init() fails, we never free the the trace buffer.

This patch frees the trace buffer in case the debugfs fails.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Reviewed-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt_trace.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bbc..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
 
 static int __init rcupreempt_trace_init(void)
 {
+	int ret;
+
 	mutex_init(&rcupreempt_trace_mutex);
 	rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
 	if (!rcupreempt_trace_buf)
 		return 1;
-	return rcupreempt_debugfs_init();
+	ret = rcupreempt_debugfs_init();
+	if (ret)
+		kfree(rcupreempt_trace_buf);
+	return ret;
 }
 
 static void __exit rcupreempt_trace_cleanup(void)
-- 
cgit v1.2.3


From 55cd53404c5cc5fd94708232e3b4aa4a9388917b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 4 Aug 2008 08:54:26 +0200
Subject: sched: scale sysctl_sched_shares_ratelimit with nr_cpus

David reported that his Niagra spend a little too much time in
tg_shares_up(), which considering he has a large cpu count makes sense.

So scale the ratelimit value with the number of cpus like we do for
other controls as well.

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8bf8a5528bc7..040807196b33 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -808,9 +808,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
  * ratelimit for updating the group shares.
- * default: 0.5ms
+ * default: 0.25ms
  */
-const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+unsigned int sysctl_sched_shares_ratelimit = 250000;
 
 /*
  * period over which we measure -rt task cpu usage in us.
@@ -5740,6 +5740,8 @@ static inline void sched_init_granularity(void)
 		sysctl_sched_latency = limit;
 
 	sysctl_sched_wakeup_granularity *= factor;
+
+	sysctl_sched_shares_ratelimit *= factor;
 }
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From df60a8441866153d691ae69b77934904c2de5e0d Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 15 Aug 2008 09:33:05 -0700
Subject: lockdep: fix build if CONFIG_PROVE_LOCKING not defined

If CONFIG_PROVE_LOCKING not defined, then no dependency information
is available.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index fa19aee604c2..4b194d34d77f 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -82,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class)
 
 static int l_show(struct seq_file *m, void *v)
 {
-	unsigned long nr_forward_deps, nr_backward_deps;
 	struct lock_class *class = v;
 	struct lock_list *entry;
 	char c1, c2, c3, c4;
@@ -96,11 +95,10 @@ static int l_show(struct seq_file *m, void *v)
 #ifdef CONFIG_DEBUG_LOCKDEP
 	seq_printf(m, " OPS:%8ld", class->ops);
 #endif
-	nr_forward_deps = lockdep_count_forward_deps(class);
-	seq_printf(m, " FD:%5ld", nr_forward_deps);
-
-	nr_backward_deps = lockdep_count_backward_deps(class);
-	seq_printf(m, " BD:%5ld", nr_backward_deps);
+#ifdef CONFIG_PROVE_LOCKING
+	seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+	seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
+#endif
 
 	get_usage_chars(class, &c1, &c2, &c3, &c4);
 	seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
@@ -325,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 		if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
 			nr_hardirq_read_unsafe++;
 
+#ifdef CONFIG_PROVE_LOCKING
 		sum_forward_deps += lockdep_count_forward_deps(class);
+#endif
 	}
 #ifdef CONFIG_DEBUG_LOCKDEP
 	DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
-- 
cgit v1.2.3


From cd95851785bcfe95fdf73689e8ecb5a1c5959231 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 17 Aug 2008 07:37:15 -0700
Subject: rcu: fix classic RCU locking cleanup lockdep problem

On Fri, Aug 15, 2008 at 04:24:30PM +0200, Ingo Molnar wrote:
>
> Paul,
>
> one of your two recent RCU patches caused this lockdep splat in -tip
> testing:
>
> ------------------->
> Brought up 2 CPUs
> Total of 2 processors activated (6850.87 BogoMIPS).
> PM: Adding info for No Bus:platform
> khelper used greatest stack depth: 3124 bytes left
>
> =================================
> [ INFO: inconsistent lock state ]
> 2.6.27-rc3-tip #1
> ---------------------------------
> inconsistent {softirq-on-W} -> {in-softirq-W} usage.
> ksoftirqd/0/4 [HC0[0]:SC1[1]:HE1:SE0] takes:
>  (&rcu_ctrlblk.lock){-+..}, at: [<c016d91c>] __rcu_process_callbacks+0x1ac/0x1f0
> {softirq-on-W} state was registered at:
>   [<c01528e4>] __lock_acquire+0x3f4/0x5b0
>   [<c0152b29>] lock_acquire+0x89/0xc0
>   [<c076142b>] _spin_lock+0x3b/0x70
>   [<c016d649>] rcu_init_percpu_data+0x29/0x80
>   [<c075e43f>] rcu_cpu_notify+0xaf/0xd0
>   [<c076458d>] notifier_call_chain+0x2d/0x60
>   [<c0145ede>] __raw_notifier_call_chain+0x1e/0x30
>   [<c075db29>] _cpu_up+0x79/0x110
>   [<c075dc0d>] cpu_up+0x4d/0x70
>   [<c0a769e1>] kernel_init+0xb1/0x200
>   [<c01048a3>] kernel_thread_helper+0x7/0x10
>   [<ffffffff>] 0xffffffff
> irq event stamp: 14
> hardirqs last  enabled at (14): [<c01534db>] trace_hardirqs_on+0xb/0x10
> hardirqs last disabled at (13): [<c014dbeb>] trace_hardirqs_off+0xb/0x10
> softirqs last  enabled at (0): [<c012b186>] copy_process+0x276/0x1190
> softirqs last disabled at (11): [<c0105c0a>] call_on_stack+0x1a/0x30
>
> other info that might help us debug this:
> no locks held by ksoftirqd/0/4.
>
> stack backtrace:
> Pid: 4, comm: ksoftirqd/0 Not tainted 2.6.27-rc3-tip #1
>  [<c01504dc>] print_usage_bug+0x16c/0x1b0
>  [<c0152455>] mark_lock+0xa75/0xb10
>  [<c0108b75>] ? sched_clock+0x15/0x30
>  [<c015289d>] __lock_acquire+0x3ad/0x5b0
>  [<c0152b29>] lock_acquire+0x89/0xc0
>  [<c016d91c>] ? __rcu_process_callbacks+0x1ac/0x1f0
>  [<c076142b>] _spin_lock+0x3b/0x70
>  [<c016d91c>] ? __rcu_process_callbacks+0x1ac/0x1f0
>  [<c016d91c>] __rcu_process_callbacks+0x1ac/0x1f0
>  [<c016d986>] rcu_process_callbacks+0x26/0x50
>  [<c0132305>] __do_softirq+0x95/0x120
>  [<c0132270>] ? __do_softirq+0x0/0x120
>  [<c0105c0a>] call_on_stack+0x1a/0x30
>  [<c0132426>] ? ksoftirqd+0x96/0x110
>  [<c0132390>] ? ksoftirqd+0x0/0x110
>  [<c01411f7>] ? kthread+0x47/0x80
>  [<c01411b0>] ? kthread+0x0/0x80
>  [<c01048a3>] ? kernel_thread_helper+0x7/0x10
>  =======================
> calling  init_cpufreq_transition_notifier_list+0x0/0x20
> initcall init_cpufreq_transition_notifier_list+0x0/0x20 returned 0 after 0 msecs
> calling  net_ns_init+0x0/0x190
> net_namespace: 676 bytes
> initcall net_ns_init+0x0/0x190 returned 0 after 0 msecs
> calling  cpufreq_tsc+0x0/0x20
> initcall cpufreq_tsc+0x0/0x20 returned 0 after 0 msecs
> calling  reboot_init+0x0/0x20
> initcall reboot_init+0x0/0x20 returned 0 after 0 msecs
> calling  print_banner+0x0/0x10
> Booting paravirtualized kernel on bare hardware
>
> <-----------------------
>
> my guess is on:
>
>  commit 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0
>  Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>  Date:   Tue Aug 5 09:21:44 2008 -0700
>
>     rcu: classic RCU locking and memory-barrier cleanups
>
> 	Ingo

Fixes a problem detected by lockdep in which rcu->lock was acquired
both in irq context and in process context, but without disabling from
process context.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 5de126630b10..fb1f1cc45142 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -700,7 +700,9 @@ void rcu_check_callbacks(int cpu, int user)
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
-	spin_lock(&rcp->lock);
+	long flags;
+
+	spin_lock_irqsave(&rcp->lock, flags);
 	memset(rdp, 0, sizeof(*rdp));
 	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
 	rdp->donetail = &rdp->donelist;
@@ -708,7 +710,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
 	rdp->blimit = blimit;
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 
 static void __cpuinit rcu_online_cpu(int cpu)
-- 
cgit v1.2.3


From 6951b12a0fe7fc64789f2c4d62d2a304e93836a8 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dbaryshkov@gmail.com>
Date: Mon, 18 Aug 2008 04:26:37 +0400
Subject: lockdep: fix spurious 'inconsistent lock state' warning

Since f82b217e3513fe3af342c0f3ee1494e86250c21c lockdep can output spurious
warnings related to hwirqs due to hardirq_off shrinkage from int to bit-sized
flag. Guard it with double negation to fix the warning.

Signed-off-by: Dmitry Baryshkov <dbaryshkov@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 77fa776a2da8..3bfb1877a003 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2582,7 +2582,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->trylock = trylock;
 	hlock->read = read;
 	hlock->check = check;
-	hlock->hardirqs_off = hardirqs_off;
+	hlock->hardirqs_off = !!hardirqs_off;
 #ifdef CONFIG_LOCK_STAT
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = sched_clock();
-- 
cgit v1.2.3


From ded00a56e99555c3f4000ef3eebfd5fe0d574565 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 17 Aug 2008 12:50:36 -0700
Subject: rcu: remove redundant ACCESS_ONCE definition from rcupreempt.c

Remove the redundant definition of ACCESS_ONCE() from rcupreempt.c in
favor of the one in compiler.h.  Also merge the comment header from
rcupreempt.c's definition into that in compiler.h.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27827931ca0d..ca4bbbe04aa4 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -58,14 +58,6 @@
 #include <linux/cpumask.h>
 #include <linux/rcupreempt_trace.h>
 
-/*
- * Macro that prevents the compiler from reordering accesses, but does
- * absolutely -nothing- to prevent CPUs from reordering.  This is used
- * only to mediate communication between mainline code and hardware
- * interrupt and NMI handlers.
- */
-#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
-
 /*
  * PREEMPT_RCU data structures.
  */
-- 
cgit v1.2.3


From eff9b713ee3540ddab862095aaf4b1511a6758bc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 18 Aug 2008 17:51:08 -0700
Subject: rcu: fix locking cleanup fallout

Given that the rcp->lock is now acquired from call_rcu(), which can be
invoked from irq-disable regions, all acquisitions need to disable irqs.
The following patch fixes this.

Although I don't have any reason to believe that this is the cause of
Yinghai's oops, it does need to be fixed.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index fb1f1cc45142..c6b6cf55f3e2 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -86,8 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
 {
 	int cpu;
 	cpumask_t cpumask;
+	unsigned long flags;
+
 	set_need_resched();
-	spin_lock(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	if (unlikely(!rcp->signaled)) {
 		rcp->signaled = 1;
 		/*
@@ -113,7 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		for_each_cpu_mask_nr(cpu, cpumask)
 			smp_send_reschedule(cpu);
 	}
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 #else
 static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -301,17 +303,18 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 	int cpu;
 	long delta;
+	unsigned long flags;
 
 	/* Only let one CPU complain about others per time interval. */
 
-	spin_lock(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	delta = get_seconds() - rcp->gp_check;
 	if (delta < 2L || cpus_empty(rcp->cpumask)) {
 		spin_unlock(&rcp->lock);
 		return;
 	}
 	rcp->gp_check = get_seconds() + 30;
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 
 	/* OK, time to rat on our buddy... */
 
@@ -324,13 +327,15 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 
 static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 {
+	unsigned long flags;
+
 	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
 			smp_processor_id(), get_seconds(), rcp->gp_check);
 	dump_stack();
-	spin_lock(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
 		rcp->gp_check = get_seconds() + 30;
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 
 static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
@@ -413,6 +418,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
 static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
+	unsigned long flags;
+
 	if (rdp->quiescbatch != rcp->cur) {
 		/* start new grace period: */
 		rdp->qs_pending = 1;
@@ -436,7 +443,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 		return;
 	rdp->qs_pending = 0;
 
-	spin_lock(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	/*
 	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
 	 * during cpu startup. Ignore the quiescent state.
@@ -444,7 +451,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 	if (likely(rdp->quiescbatch == rcp->cur))
 		cpu_quiet(rdp->cpu, rcp);
 
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 
 
@@ -469,21 +476,22 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
 				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
+	unsigned long flags;
+
 	/*
 	 * if the cpu going offline owns the grace period
 	 * we can block indefinitely waiting for it, so flush
 	 * it here
 	 */
-	spin_lock_bh(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	if (rcp->cur != rcp->completed)
 		cpu_quiet(rdp->cpu, rcp);
 	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
 	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
-	spin_unlock_bh(&rcp->lock);
+	spin_unlock(&rcp->lock);
 
-	local_irq_disable();
 	this_rdp->qlen += rdp->qlen;
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 static void rcu_offline_cpu(int cpu)
@@ -550,12 +558,12 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
 			/* and start it/schedule start if it's a new batch */
-			spin_lock(&rcp->lock);
+			spin_lock_irqsave(&rcp->lock, flags);
 			if (rcu_batch_after(rdp->batch, rcp->pending)) {
 				rcp->pending = rdp->batch;
 				rcu_start_batch(rcp);
 			}
-			spin_unlock(&rcp->lock);
+			spin_unlock_irqrestore(&rcp->lock, flags);
 		}
 	}
 
-- 
cgit v1.2.3


From 0c925d79234fe77589d8ff3861f9f8bb9e7fc3f6 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Mon, 18 Aug 2008 21:49:51 -0700
Subject: rcuclassic: fix compilation NG

fix:

  CC      kernel/rcuclassic.o
kernel/rcuclassic.c: In function '__rcu_process_callbacks':
kernel/rcuclassic.c:561: error: 'flags' undeclared (first use in this function)
kernel/rcuclassic.c:561: error: (Each undeclared identifier is reported only once
kernel/rcuclassic.c:561: error: for each function it appears in.)

Declare missing variable flags.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index c6b6cf55f3e2..01e761a6b38c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -557,6 +557,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 		local_irq_enable();
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
+			unsigned long flags;
+
 			/* and start it/schedule start if it's a new batch */
 			spin_lock_irqsave(&rcp->lock, flags);
 			if (rcu_batch_after(rdp->batch, rcp->pending)) {
-- 
cgit v1.2.3


From af4491e51632d01fbc2b856ffa9ebcd4b38db68c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:02 +0200
Subject: sched: rt-bandwidth for user grouping interface

rt_runtime is a signed value

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/user.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
 {
 	struct user_struct *up = container_of(kobj, struct user_struct, kobj);
 
-	return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
+	return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
 }
 
 static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
 	unsigned long rt_runtime;
 	int rc;
 
-	sscanf(buf, "%lu", &rt_runtime);
+	sscanf(buf, "%ld", &rt_runtime);
 
 	rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
 
-- 
cgit v1.2.3


From 6f0d5c390e4206dcb3804a5072a048fdb7d2b428 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:03 +0200
Subject: sched: rt-bandwidth accounting fix

It fixes an accounting bug where we would continue accumulating runtime
even though the bandwidth control is disabled. This would lead to very long
throttle periods once bandwidth control gets turned on again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 998ba54b4543..77340b04a538 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -438,9 +438,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 {
 	u64 runtime = sched_rt_runtime(rt_rq);
 
-	if (runtime == RUNTIME_INF)
-		return 0;
-
 	if (rt_rq->rt_throttled)
 		return rt_rq_throttled(rt_rq);
 
@@ -491,9 +488,11 @@ static void update_curr_rt(struct rq *rq)
 		rt_rq = rt_rq_of_se(rt_se);
 
 		spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_time += delta_exec;
-		if (sched_rt_runtime_exceeded(rt_rq))
-			resched_task(curr);
+		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+			rt_rq->rt_time += delta_exec;
+			if (sched_rt_runtime_exceeded(rt_rq))
+				resched_task(curr);
+		}
 		spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
-- 
cgit v1.2.3


From 0b148fa04852859972abbf848177b92daeef138a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:04 +0200
Subject: sched: rt-bandwidth group disable fixes

More extensive disable of bandwidth control. It allows sysctl_sched_rt_runtime
to disable full group bandwidth control.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 9 ++++++++-
 kernel/sched_rt.c | 5 ++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9a1ddb84e26d..c1bee5fb8154 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 
+static inline int rt_bandwidth_enabled(void);
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 77340b04a538..94daace5ee15 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 	int i, idle = 1;
 	cpumask_t span;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return 1;
 
 	span = sched_rt_period_mask();
@@ -484,6 +484,9 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
+	if (!rt_bandwidth_enabled())
+		return;
+
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 
-- 
cgit v1.2.3


From eb755805f21bd5ded84026e167b7a90887ac42e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:05 +0200
Subject: sched: extract walk_tg_tree()

Extract walk_tg_tree() and make it a little more generic so we can use it
in the schedulablity test.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c1bee5fb8154..8c019a19d052 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1387,38 +1387,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (rq->nr_running)
-		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-	return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED))
+typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
+	int ret;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, cpu, sd);
+	ret = (*down)(parent, data);
+	if (ret)
+		goto out_unlock;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1426,14 +1412,42 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, cpu, sd);
+	ret = (*up)(parent, data);
+	if (ret)
+		goto out_unlock;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
+out_unlock:
 	rcu_read_unlock();
+
+	return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+	return 0;
 }
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -1493,11 +1507,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long rq_weight = 0;
 	unsigned long shares = 0;
+	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1522,6 +1536,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+	return 0;
 }
 
 /*
@@ -1529,10 +1545,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
 	unsigned long load;
+	long cpu = (long)data;
 
 	if (!tg->parent) {
 		load = cpu_rq(cpu)->load.weight;
@@ -1543,11 +1559,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 	}
 
 	tg->cfs_rq[cpu]->h_load = load;
-}
 
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+	return 0;
 }
 
 static void update_shares(struct sched_domain *sd)
@@ -1557,7 +1570,7 @@ static void update_shares(struct sched_domain *sd)
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+		walk_tg_tree(tg_nop, tg_shares_up, sd);
 	}
 }
 
@@ -1568,9 +1581,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 	spin_lock(&rq->lock);
 }
 
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
 #else
-- 
cgit v1.2.3


From 9a7e0b180da21885988d47558671cf580279f9d6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:06 +0200
Subject: sched: rt-bandwidth fixes

The last patch allows sysctl_sched_rt_runtime to disable bandwidth accounting
for the group scheduler - however it doesn't deal with sched_setscheduler(),
which will keep tasks out of groups that have no assigned runtime.

If we relax this, we get into the situation where RT tasks can get into a group
when we disable bandwidth control, and then starve them by enabling it again.

Rework the schedulability code to check for this condition and fail to turn
on bandwidth control with -EBUSY when this situation is found.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 125 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 63 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8c019a19d052..e41bdae2778d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -300,9 +300,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED))
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED)
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
@@ -5082,7 +5082,8 @@ recheck:
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
 		 */
-		if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+		if (rt_bandwidth_enabled() && rt_policy(policy) &&
+				task_group(p)->rt_bandwidth.rt_runtime == 0)
 			return -EPERM;
 #endif
 
@@ -8707,73 +8708,77 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
-		return 1ULL << 16;
+		return 1ULL << 20;
 
-	return div64_u64(runtime << 16, period);
+	return div64_u64(runtime << 20, period);
 }
 
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_group *tgi, *parent = tg->parent;
-	unsigned long total = 0;
+	struct task_struct *g, *p;
 
-	if (!parent) {
-		if (global_rt_period() < period)
-			return 0;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
 
-		return to_ratio(period, runtime) <
-			to_ratio(global_rt_period(), global_rt_runtime());
-	}
+	return 0;
+}
 
-	if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-		return 0;
+struct rt_schedulable_data {
+	struct task_group *tg;
+	u64 rt_period;
+	u64 rt_runtime;
+};
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-		if (tgi == tg)
-			continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+	struct rt_schedulable_data *d = data;
+	struct task_group *child;
+	unsigned long total, sum = 0;
+	u64 period, runtime;
+
+	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	runtime = tg->rt_bandwidth.rt_runtime;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	if (tg == d->tg) {
+		period = d->rt_period;
+		runtime = d->rt_runtime;
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <=
-		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-				parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-	struct task_group *tgi;
-	unsigned long total = 0;
-	unsigned long global_ratio =
-		to_ratio(global_rt_period(), global_rt_runtime());
+	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+		return -EBUSY;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list) {
-		if (tgi == tg)
-			continue;
+	total = to_ratio(period, runtime);
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		period = ktime_to_ns(child->rt_bandwidth.rt_period);
+		runtime = child->rt_bandwidth.rt_runtime;
+
+		if (child == d->tg) {
+			period = d->rt_period;
+			runtime = d->rt_runtime;
+		}
+
+		sum += to_ratio(period, runtime);
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) < global_ratio;
+	if (sum > total)
+		return -EINVAL;
+
+	return 0;
 }
-#endif
 
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_struct *g, *p;
-	do_each_thread(g, p) {
-		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-			return 1;
-	} while_each_thread(g, p);
-	return 0;
+	struct rt_schedulable_data data = {
+		.tg = tg,
+		.rt_period = period,
+		.rt_runtime = runtime,
+	};
+
+	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
 static int tg_set_bandwidth(struct task_group *tg,
@@ -8783,14 +8788,9 @@ static int tg_set_bandwidth(struct task_group *tg,
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-		err = -EBUSY;
+	err = __rt_schedulable(tg, rt_period, rt_runtime);
+	if (err)
 		goto unlock;
-	}
-	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-		err = -EINVAL;
-		goto unlock;
-	}
 
 	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8867,8 +8867,9 @@ static int sched_rt_global_constraints(void)
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(tg, rt_period, rt_runtime))
-		ret = -EINVAL;
+	read_lock(&tasklist_lock);
+	ret = __rt_schedulable(tg, rt_period, rt_runtime);
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;
-- 
cgit v1.2.3


From 1b04624f93bb1c4f9495b8476d1dd0200af019e2 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 19 Aug 2008 20:37:07 -0700
Subject: tracehook: fix SA_NOCLDWAIT

I outwitted myself again in commit 2b2a1ff64afbadac842bbc58c5166962cf4f7664,
and broke the SA_NOCLDWAIT behavior so it leaks zombies.  This fixes it.

Reported-by: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/signal.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index c539f60c6f41..e661b01d340f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	struct siginfo info;
 	unsigned long flags;
 	struct sighand_struct *psig;
+	int ret = sig;
 
 	BUG_ON(sig == -1);
 
@@ -1402,7 +1403,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 		 * is implementation-defined: we do (if you don't want
 		 * it, just use SIG_IGN instead).
 		 */
-		tsk->exit_signal = -1;
+		ret = tsk->exit_signal = -1;
 		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
 			sig = -1;
 	}
@@ -1411,7 +1412,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	__wake_up_parent(tsk, tsk->parent);
 	spin_unlock_irqrestore(&psig->siglock, flags);
 
-	return sig;
+	return ret;
 }
 
 static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
-- 
cgit v1.2.3


From 2d70b68d42b5196a48ccb639e3797f097ef5bea3 Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Wed, 20 Aug 2008 14:09:17 -0700
Subject: fix setpriority(PRIO_PGRP) thread iterator breakage

When user calls sys_setpriority(PRIO_PGRP ...) on a NPTL style multi-LWP
process, only the task leader of the process is affected, all other
sibling LWP threads didn't receive the setting.  The problem was that the
iterator used in sys_setpriority() only iteartes over one task for each
process, ignoring all other sibling thread.

Introduce a new macro do_each_pid_thread / while_each_pid_thread to walk
each thread of a process.  Convert 4 call sites in {set/get}priority and
ioprio_{set/get}.

Signed-off-by: Ken Chen <kenchen@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 3dacb00a7f76..038a7bc0901d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
 				pgrp = find_vpid(who);
 			else
 				pgrp = task_pgrp(current);
-			do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
 				error = set_one_prio(p, niceval, error);
-			} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
 			user = current->user;
@@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who)
 				pgrp = find_vpid(who);
 			else
 				pgrp = task_pgrp(current);
-			do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
 				niceval = 20 - task_nice(p);
 				if (niceval > retval)
 					retval = niceval;
-			} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
 			break;
 		case PRIO_USER:
 			user = current->user;
-- 
cgit v1.2.3


From efc2dead2c82cae31943828f6d977c483942b0eb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 Aug 2008 12:44:55 +0200
Subject: sched: enable LB_BIAS by default

Yanmin reported a significant regression on his 16-core machine due to:

  commit 93b75217df39e6d75889cc6f8050343286aff4a5
  Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
  Date:   Fri Jun 27 13:41:33 2008 +0200

Flip back to the old behaviour.

Reported-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 862b06bd560a..9353ca78154e 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,6 +8,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
-SCHED_FEAT(LB_BIAS, 0)
+SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
-- 
cgit v1.2.3


From 01dcb0443ed89eccf26c2b43f1ea13b368ae740d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 20 Aug 2008 16:35:19 -0700
Subject: rcu: fix synchronize_rcu() so that kernel-doc works

Fix RCU's synchronize_rcu() so that it looks like a C function, enabling
it to be recognized as a function with kernel-doc annotation.

Warning(linux-2.6.26-git11//kernel/rcupdate.c:81): No description found for parameter 'synchronize_rcu'
Warning(linux-2.6.26-git11//kernel/rcupdate.c:81): No description found for parameter 'call_rcu'

[akpm@linux-foundation.org: fix comment]
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f14f372cf6f5..467d5940f624 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head  *head)
  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
  * and may be nested.
  */
+void synchronize_rcu(void);	/* Makes kernel-doc tools happy */
 synchronize_rcu_xxx(synchronize_rcu, call_rcu)
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
-- 
cgit v1.2.3


From 3c4fbe5e01d7e5309be5045e7ae0db20a049e6dc Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 20 Aug 2008 16:37:38 -0700
Subject: nohz: fix wrong event handler after online an offlined cpu

On the tickless system(CONFIG_NO_HZ=y and CONFIG_HIGH_RES_TIMERS=n), after
I made an offlined cpu online, I found this cpu's event handler was
tick_handle_periodic, not tick_nohz_handler.

After debuging, I found this bug was caused by the wrong tick mode.  the
tick mode is not changed to NOHZ_MODE_INACTIVE when the cpu is offline.

This patch fixes this bug.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f5da526424a9..7a46bde78c66 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -643,17 +643,21 @@ void tick_setup_sched_timer(void)
 		ts->nohz_mode = NOHZ_MODE_HIGHRES;
 #endif
 }
+#endif /* HIGH_RES_TIMERS */
 
+#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
 void tick_cancel_sched_timer(int cpu)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 
+# ifdef CONFIG_HIGH_RES_TIMERS
 	if (ts->sched_timer.base)
 		hrtimer_cancel(&ts->sched_timer);
+# endif
 
 	ts->nohz_mode = NOHZ_MODE_INACTIVE;
 }
-#endif /* HIGH_RES_TIMERS */
+#endif
 
 /**
  * Async notification about clocksource changes
-- 
cgit v1.2.3


From 275a89bdd3868af3008852594d2e169eaf69441b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 21 Aug 2008 06:14:55 -0700
Subject: rcu: use irq-safe locks

Some earlier tip/core/rcu patches caused RCU to incorrectly enable irqs
too early in boot.  This caused Yinghai's repeated-kexec testing to
hit oopses, presumably due to so that device interrupts left over from
the prior kernel instance (which would oops the newly booting kernel
before it got a chance to reset said devices).  This patch therefore
converts all the local_irq_disable()s in rcuclassic.c to local_irq_save().

Besides, I never did like local_irq_disable() anyway.  ;-)

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 01e761a6b38c..3f6918966bda 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -247,6 +247,7 @@ static inline void raise_rcu_softirq(void)
  */
 static void rcu_do_batch(struct rcu_data *rdp)
 {
+	unsigned long flags;
 	struct rcu_head *next, *list;
 	int count = 0;
 
@@ -261,9 +262,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	}
 	rdp->donelist = list;
 
-	local_irq_disable();
+	local_irq_save(flags);
 	rdp->qlen -= count;
-	local_irq_enable();
+	local_irq_restore(flags);
 	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
 		rdp->blimit = blimit;
 
@@ -464,12 +465,14 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 				struct rcu_head **tail, long batch)
 {
+	unsigned long flags;
+
 	if (list) {
-		local_irq_disable();
+		local_irq_save(flags);
 		this_rdp->batch = batch;
 		*this_rdp->nxttail[2] = list;
 		this_rdp->nxttail[2] = tail;
-		local_irq_enable();
+		local_irq_restore(flags);
 	}
 }
 
@@ -521,10 +524,11 @@ static void rcu_offline_cpu(int cpu)
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
+	unsigned long flags;
 	long completed_snap;
 
 	if (rdp->nxtlist) {
-		local_irq_disable();
+		local_irq_save(flags);
 		completed_snap = ACCESS_ONCE(rcp->completed);
 
 		/*
@@ -554,7 +558,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 			rdp->nxttail[0] = &rdp->nxtlist;
 		}
 
-		local_irq_enable();
+		local_irq_restore(flags);
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
 			unsigned long flags;
-- 
cgit v1.2.3


From a38409fbb5181324fb73b359189a72e02b6c7030 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 Aug 2008 12:16:09 +0200
Subject: dma-coherent: export dma_[alloc|release]_from_coherent methods

fixes modular builds:

  ERROR: "dma_alloc_from_coherent" [sound/core/snd-page-alloc.ko] undefined!
  ERROR: "dma_release_from_coherent" [sound/core/snd-page-alloc.ko] undefined!

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/dma-coherent.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index c1d4d5b4c61c..f013a0c2e111 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 	}
 	return (mem != NULL);
 }
+EXPORT_SYMBOL(dma_alloc_from_coherent);
 
 /**
  * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
@@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(dma_release_from_coherent);
-- 
cgit v1.2.3


From 94d3d8247de22c5b0624aa00616ceca459498e55 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 20 Aug 2008 16:54:41 -0700
Subject: sched: do_wait_for_common: use signal_pending_state()

Change do_wait_for_common() to use signal_pending_state() instead of open
coding.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..da7c5d23cc03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4599,10 +4599,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 		wait.flags |= WQ_FLAG_EXCLUSIVE;
 		__add_wait_queue_tail(&x->wait, &wait);
 		do {
-			if ((state == TASK_INTERRUPTIBLE &&
-			     signal_pending(current)) ||
-			    (state == TASK_KILLABLE &&
-			     fatal_signal_pending(current))) {
+			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
-- 
cgit v1.2.3


From f31e11d87a5d7601636710195891ba462ad99f11 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 20 Aug 2008 16:54:44 -0700
Subject: wait_task_inactive(): don't consider task->nivcsw

If wait_task_inactive() returns success the task was deactivated.  In that
case schedule() always increments ->nvcsw which alone can be used as a
"generation counter".

If the next call returns the same number, we can be sure that the task was
unscheduled.  Otherwise, because we know that .on_rq == 0 again, ->nvcsw
should have been changed in between.

Q: perhaps it is better to do "ncsw = (p->nvcsw << 1) | 1" ?  This
decreases the possibility of "was it unscheduled" false positive when
->nvcsw == 0.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index da7c5d23cc03..908670aa215a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1921,11 +1921,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
 		ncsw = 0;
-		if (!match_state || p->state == match_state) {
-			ncsw = p->nivcsw + p->nvcsw;
-			if (unlikely(!ncsw))
-				ncsw = 1;
-		}
+		if (!match_state || p->state == match_state)
+			ncsw = p->nvcsw ?: 1;
 		task_rq_unlock(rq, &flags);
 
 		/*
-- 
cgit v1.2.3


From 93dcf55f828b035fc93fc19eb03c1390e1e6d570 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 20 Aug 2008 16:54:44 -0700
Subject: wait_task_inactive: "improve" the returned value for ->nvcsw == 0

wait_task_inactive() returns 1 when p->nvcsw == 0 || p->nvcsw == 1.  This
means that two subsequent calls can return the same number while the task
was scheduled in between.

Change the code to return "nvcsw | LONG_MIN" instead of "nvcsw ?: 1", now
the overlap always needs LONG_MAX schedules.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 908670aa215a..6a43c8942b05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1922,7 +1922,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		on_rq = p->se.on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
-			ncsw = p->nvcsw ?: 1;
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, &flags);
 
 		/*
-- 
cgit v1.2.3


From 7a8fc9b248e77a4eab0613acf30a6811799786b3 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@kernel.org>
Date: Sun, 17 Aug 2008 17:36:59 +0300
Subject: removed unused #include <linux/version.h>'s

This patch lets the files using linux/version.h match the files that
#include it.

Signed-off-by: Adrian Bunk <bunk@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/nsproxy.c        | 1 -
 kernel/power/swap.c     | 1 -
 kernel/user_namespace.c | 1 -
 kernel/utsname.c        | 1 -
 kernel/utsname_sysctl.c | 1 -
 5 files changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 21575fc46d05..1d3ef29a2583 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/version.h>
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
 #include <linux/mnt_namespace.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0abf9a463f9..80ccac849e46 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -14,7 +14,6 @@
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/delay.h>
 #include <linux/bitops.h>
 #include <linux/genhd.h>
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9ab0596de44..532858fa5b88 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -6,7 +6,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/version.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 64d398f12444..815237a55af8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,7 +12,6 @@
 #include <linux/module.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index fe3a56c2256d..4ab9659d269e 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -12,7 +12,6 @@
 #include <linux/module.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/sysctl.h>
 
 static void *get_uts(ctl_table *table, int write)
-- 
cgit v1.2.3


From 354879bb977e06695993435745f06a0f6d39ce2b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Aug 2008 17:15:34 +0200
Subject: sched_clock: fix cpu_clock()

This patch fixes 3 issues:

a) it removes the dependency on jiffies, because jiffies are incremented
   by a single CPU, and the tick is not synchronized between CPUs. Therefore
   relying on it to calculate a window to clip whacky TSC values doesn't work
   as it can drift around.

   So instead use [GTOD, GTOD+TICK_NSEC) as the window.

b) __update_sched_clock() did (roughly speaking):

   delta = sched_clock() - scd->tick_raw;
   clock += delta;

   Which gives exponential growth, instead of linear.

c) allows the sched_clock_cpu() value to warp the u64 without breaking.

the results are more reliable sched_clock() deltas:

           before       after   sched_clock

cpu_clock: 15750        51312   51488
cpu_clock: 59719        51052   50947
cpu_clock: 15879        51249   51061
cpu_clock: 1            50933   51198
cpu_clock: 1            50931   51039
cpu_clock: 1            51093   50981
cpu_clock: 1            51043   51040
cpu_clock: 1            50959   50938
cpu_clock: 1            50981   51011
cpu_clock: 1            51364   51212
cpu_clock: 1            51219   51273
cpu_clock: 1            51389   51048
cpu_clock: 1            51285   51611
cpu_clock: 1            50964   51137
cpu_clock: 1            50973   50968
cpu_clock: 1            50967   50972
cpu_clock: 1            58910   58485
cpu_clock: 1            51082   51025
cpu_clock: 1            50957   50958
cpu_clock: 1            50958   50957
cpu_clock: 1006128      51128   50971
cpu_clock: 1            51107   51155
cpu_clock: 1            51371   51081
cpu_clock: 1            51104   51365
cpu_clock: 1            51363   51309
cpu_clock: 1            51107   51160
cpu_clock: 1            51139   51100
cpu_clock: 1            51216   51136
cpu_clock: 1            51207   51215
cpu_clock: 1            51087   51263
cpu_clock: 1            51249   51177
cpu_clock: 1            51519   51412
cpu_clock: 1            51416   51255
cpu_clock: 1            51591   51594
cpu_clock: 1            50966   51374
cpu_clock: 1            50966   50966
cpu_clock: 1            51291   50948
cpu_clock: 1            50973   50867
cpu_clock: 1            50970   50970
cpu_clock: 998306       50970   50971
cpu_clock: 1            50971   50970
cpu_clock: 1            50970   50970
cpu_clock: 1            50971   50971
cpu_clock: 1            50970   50970
cpu_clock: 1            51351   50970
cpu_clock: 1            50970   51352
cpu_clock: 1            50971   50970
cpu_clock: 1            50970   50970
cpu_clock: 1            51321   50971
cpu_clock: 1            50974   51324

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 84 +++++++++++++++++++++-------------------------------
 1 file changed, 34 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 204991a0bfa7..e8ab096ddfe3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -12,19 +12,17 @@
  *
  * Create a semi stable clock from a mixture of other events, including:
  *  - gtod
- *  - jiffies
  *  - sched_clock()
  *  - explicit idle events
  *
  * We use gtod as base and the unstable clock deltas. The deltas are filtered,
- * making it monotonic and keeping it within an expected window.  This window
- * is set up using jiffies.
+ * making it monotonic and keeping it within an expected window.
  *
  * Furthermore, explicit sleep and wakeup hooks allow us to account for time
  * that is otherwise invisible (TSC gets stopped).
  *
  * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
- * consistent between cpus (never more than 1 jiffies difference).
+ * consistent between cpus (never more than 2 jiffies difference).
  */
 #include <linux/sched.h>
 #include <linux/percpu.h>
@@ -54,7 +52,6 @@ struct sched_clock_data {
 	 */
 	raw_spinlock_t		lock;
 
-	unsigned long		tick_jiffies;
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
@@ -75,14 +72,12 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
 void sched_clock_init(void)
 {
 	u64 ktime_now = ktime_to_ns(ktime_get());
-	unsigned long now_jiffies = jiffies;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
 		struct sched_clock_data *scd = cpu_sdc(cpu);
 
 		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-		scd->tick_jiffies = now_jiffies;
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
@@ -91,47 +86,52 @@ void sched_clock_init(void)
 	sched_clock_running = 1;
 }
 
+/*
+ * min,max except they take wrapping into account
+ */
+
+static inline u64 wrap_min(u64 x, u64 y)
+{
+	return (s64)(x - y) < 0 ? x : y;
+}
+
+static inline u64 wrap_max(u64 x, u64 y)
+{
+	return (s64)(x - y) > 0 ? x : y;
+}
+
 /*
  * update the percpu scd from the raw @now value
  *
  *  - filter out backward motion
- *  - use jiffies to generate a min,max window to clip the raw values
+ *  - use the GTOD tick value to create a window to filter crazy TSC values
  */
 static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 {
-	unsigned long now_jiffies = jiffies;
-	long delta_jiffies = now_jiffies - scd->tick_jiffies;
-	u64 clock = scd->clock;
-	u64 min_clock, max_clock;
 	s64 delta = now - scd->tick_raw;
+	u64 clock, min_clock, max_clock;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
 
-	if (unlikely(delta < 0)) {
-		clock++;
-		goto out;
-	}
+	if (unlikely(delta < 0))
+		delta = 0;
 
-	max_clock = min_clock + TICK_NSEC;
+	/*
+	 * scd->clock = clamp(scd->tick_gtod + delta,
+	 * 		      max(scd->tick_gtod, scd->clock),
+	 * 		      scd->tick_gtod + TICK_NSEC);
+	 */
 
-	if (unlikely(clock + delta > max_clock)) {
-		if (clock < max_clock)
-			clock = max_clock;
-		else
-			clock++;
-	} else {
-		clock += delta;
-	}
+	clock = scd->tick_gtod + delta;
+	min_clock = wrap_max(scd->tick_gtod, scd->clock);
+	max_clock = scd->tick_gtod + TICK_NSEC;
 
- out:
-	if (unlikely(clock < min_clock))
-		clock = min_clock;
+	clock = wrap_max(clock, min_clock);
+	clock = wrap_min(clock, max_clock);
 
-	scd->tick_jiffies = now_jiffies;
 	scd->clock = clock;
 
-	return clock;
+	return scd->clock;
 }
 
 static void lock_double_clock(struct sched_clock_data *data1,
@@ -171,7 +171,7 @@ u64 sched_clock_cpu(int cpu)
 		 * larger time as the latest time for both
 		 * runqueues. (this creates monotonic movement)
 		 */
-		if (likely(remote_clock < this_clock)) {
+		if (likely((s64)(remote_clock - this_clock) < 0)) {
 			clock = this_clock;
 			scd->clock = clock;
 		} else {
@@ -207,14 +207,9 @@ void sched_clock_tick(void)
 	now = sched_clock();
 
 	__raw_spin_lock(&scd->lock);
-	__update_sched_clock(scd, now);
-	/*
-	 * update tick_gtod after __update_sched_clock() because that will
-	 * already observe 1 new jiffy; adding a new tick_gtod to that would
-	 * increase the clock 2 jiffies.
-	 */
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
+	__update_sched_clock(scd, now);
 	__raw_spin_unlock(&scd->lock);
 }
 
@@ -232,18 +227,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
  */
 void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
-	struct sched_clock_data *scd = this_scd();
-
-	/*
-	 * Override the previous timestamp and ignore all
-	 * sched_clock() deltas that occured while we idled,
-	 * and use the PM-provided delta_ns to advance the
-	 * rq clock:
-	 */
-	__raw_spin_lock(&scd->lock);
-	scd->clock += delta_ns;
-	__raw_spin_unlock(&scd->lock);
-
+	sched_clock_tick();
 	touch_softlockup_watchdog();
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-- 
cgit v1.2.3


From ffb4ba76a25ab6c9deeec33e4f58395586ca747c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 25 Aug 2008 11:10:26 -0700
Subject: [module] Don't let gcc inline load_module()

'load_module()' is a complex function that contains all the ELF section
logic, and inlining it is utterly insane.  But gcc will do it, simply
because there is only one call-site.  As a result, all the stack space
that is allocated for all the work to load the module will still be
active when we actually call the module init sequence, and the deep call
chain makes stack overflows happen.

And stack overflows are really hard to debug, because they not only
corrupt random pages below the stack, but also corrupt the thread_info
structure that is allocated under the stack.

In this case, Alan Brunelle reported some crazy oopses at bootup, after
loading the processor module that ends up doing complex ACPI stuff and
has quite a deep callchain.  This should fix it, and is the sane thing
to do regardless.

Cc: Alan D. Brunelle <Alan.Brunelle@hp.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 08864d257eb0..9db11911e04b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1799,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size)
 
 /* Allocate and load the module: note that size of section 0 is always
    zero, and we rely on this for optional sections. */
-static struct module *load_module(void __user *umod,
+static noinline struct module *load_module(void __user *umod,
 				  unsigned long len,
 				  const char __user *uargs)
 {
-- 
cgit v1.2.3


From f73be6dedf4fa058ce80846dae604b08fa805ca1 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 25 Aug 2008 17:07:14 -0700
Subject: smp: have smp_call_function_single() detect invalid CPUs

Have smp_call_function_single() return invalid CPU indicies and return
-ENXIO.  This function is already executed inside a
get_cpu()..put_cpu() which locks out CPU removal, so rather than
having the higher layers doing another layer of locking to guard
against unplugged CPUs do the test here.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/smp.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 782e2b93e465..f362a8553777 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -210,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 {
 	struct call_single_data d;
 	unsigned long flags;
-	/* prevent preemption and reschedule on another processor */
+	/* prevent preemption and reschedule on another processor,
+	   as well as CPU removal */
 	int me = get_cpu();
+	int err = 0;
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
@@ -220,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		local_irq_save(flags);
 		func(info);
 		local_irq_restore(flags);
-	} else {
+	} else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
 		struct call_single_data *data = NULL;
 
 		if (!wait) {
@@ -236,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		data->func = func;
 		data->info = info;
 		generic_exec_single(cpu, data);
+	} else {
+		err = -ENXIO;	/* CPU not online */
 	}
 
 	put_cpu();
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
-- 
cgit v1.2.3


From 65eb3dc609dec17deea48dcd4de2e549d29a9824 Mon Sep 17 00:00:00 2001
From: Kevin Diggs <kevdig@hypersurf.com>
Date: Tue, 26 Aug 2008 10:26:54 +0200
Subject: sched: add kernel doc for the completion, fix
 kernel-doc-nano-HOWTO.txt

This patch adds kernel doc for the completion feature.

An error in the split-man.pl PERL snippet in kernel-doc-nano-HOWTO.txt is
also fixed.

Signed-off-by: Kevin Diggs <kevdig@hypersurf.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 29e2ec0bd831..93f5ea08be97 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4565,6 +4565,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
 	unsigned long flags;
@@ -4576,6 +4585,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
 	unsigned long flags;
@@ -4624,12 +4639,31 @@ wait_for_common(struct completion *x, long timeout, int state)
 	return timeout;
 }
 
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4637,6 +4671,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4646,6 +4687,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
@@ -4654,6 +4703,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
 
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
-- 
cgit v1.2.3


From 2189459d25a47401c69a17794c9d390c890351f9 Mon Sep 17 00:00:00 2001
From: Joe Korty <joe.korty@ccur.com>
Date: Mon, 25 Aug 2008 17:15:33 -0400
Subject: lockstat: fix numerical output rounding error

Fix rounding error in /proc/lock_stat numerical output.

On occasion the two digit fractional part contains the three
digit value '100'.  This is due to a bug in the rounding algorithm
which pushes values in the range '95..99' to '100' rather than
to '00' + an increment to the integer part.  For example,

	- 123456.100      old display
	+ 123457.00	  new display

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep_proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 4b194d34d77f..20dbcbf9c7dd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -472,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
 {
 	unsigned long rem;
 
+	nr += 5; /* for display rounding */
 	rem = do_div(nr, 1000); /* XXX: do_div_signed */
-	snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10);
+	snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
 }
 
 static void seq_time(struct seq_file *m, s64 time)
-- 
cgit v1.2.3


From 04148b73b89d49fe0fe201bcee395e51f7d637ce Mon Sep 17 00:00:00 2001
From: Joe Korty <joe.korty@ccur.com>
Date: Mon, 25 Aug 2008 17:16:23 -0400
Subject: lockstat: repair erronous contention statistics

Fix bad contention counting in /proc/lock_stat.

/proc/lockstat tries to gather per-ip contention
statistics per-lock.  This was failing due to
a garbage per-ip index selector being used.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3bfb1877a003..b5db51d2803d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3029,7 +3029,7 @@ found_it:
 
 	stats = get_lock_stats(hlock_class(hlock));
 	if (point < ARRAY_SIZE(stats->contention_point))
-		stats->contention_point[i]++;
+		stats->contention_point[point]++;
 	if (lock->cpu != smp_processor_id())
 		stats->bounces[bounce_contended + !!hlock->read]++;
 	put_lock_stats(stats);
-- 
cgit v1.2.3


From 74870172824a78640ec4f03058d9bd35dfa08618 Mon Sep 17 00:00:00 2001
From: Zhu Yi <yi.zhu@intel.com>
Date: Wed, 27 Aug 2008 14:33:00 +0800
Subject: lockdep: fix invalid list_del_rcu in zap_class

The problem is found during iwlagn driver testing on
v2.6.27-rc4-176-gb8e6c91 kernel, but it turns out to be a lockdep bug.
In our testing, we frequently load and unload the iwlagn driver
(>50 times). Then the MAX_STACK_TRACE_ENTRIES is reached (expected
behaviour?). The error message with the call trace is as below.

BUG: MAX_STACK_TRACE_ENTRIES too low!
turning off the locking correctness validator.
Pid: 4895, comm: iwlagn Not tainted 2.6.27-rc4 #13

Call Trace:
 [<ffffffff81014aa1>] save_stack_trace+0x22/0x3e
 [<ffffffff8105390a>] save_trace+0x8b/0x91
 [<ffffffff81054e60>] mark_lock+0x1b0/0x8fa
 [<ffffffff81056f71>] __lock_acquire+0x5b9/0x716
 [<ffffffffa00d818a>] ieee80211_sta_work+0x0/0x6ea [mac80211]
 [<ffffffff81057120>] lock_acquire+0x52/0x6b
 [<ffffffff81045f0e>] run_workqueue+0x97/0x1ed
 [<ffffffff81045f5e>] run_workqueue+0xe7/0x1ed
 [<ffffffff81045f0e>] run_workqueue+0x97/0x1ed
 [<ffffffff81046ae4>] worker_thread+0xd8/0xe3
 [<ffffffff81049503>] autoremove_wake_function+0x0/0x2e
 [<ffffffff81046a0c>] worker_thread+0x0/0xe3
 [<ffffffff810493ec>] kthread+0x47/0x73
 [<ffffffff8128e3ab>] trace_hardirqs_on_thunk+0x3a/0x3f
 [<ffffffff8100cea9>] child_rip+0xa/0x11
 [<ffffffff8100c4df>] restore_args+0x0/0x30
 [<ffffffff810316e1>] finish_task_switch+0x0/0xcc
 [<ffffffff810493a5>] kthread+0x0/0x73
 [<ffffffff8100ce9f>] child_rip+0x0/0x11

Although the above is harmless, when the ilwagn module is removed
later lockdep will trigger a kernel oops as below.

BUG: unable to handle kernel NULL pointer dereference at
0000000000000008
IP: [<ffffffff810531e1>] zap_class+0x24/0x82
PGD 73128067 PUD 7448c067 PMD 0
Oops: 0002 [1] SMP
CPU 0
Modules linked in: rfcomm l2cap bluetooth autofs4 sunrpc
nf_conntrack_ipv6 xt_state nf_conntrack xt_tcpudp ip6t_ipv6header
ip6t_REJECT ip6table_filter ip6_tables x_tables ipv6 cpufreq_ondemand
acpi_cpufreq dm_mirror dm_log dm_multipath dm_mod snd_hda_intel sr_mod
snd_seq_dummy snd_seq_oss snd_seq_midi_event battery snd_seq
snd_seq_device cdrom button snd_pcm_oss snd_mixer_oss snd_pcm
snd_timer snd_page_alloc e1000e snd_hwdep sg iTCO_wdt
iTCO_vendor_support ac pcspkr i2c_i801 i2c_core snd soundcore video
output ata_piix ata_generic libata sd_mod scsi_mod ext3 jbd mbcache
uhci_hcd ohci_hcd ehci_hcd [last unloaded: mac80211]
Pid: 4941, comm: modprobe Not tainted 2.6.27-rc4 #10
RIP: 0010:[<ffffffff810531e1>]  [<ffffffff810531e1>]
zap_class+0x24/0x82
RSP: 0000:ffff88007bcb3eb0  EFLAGS: 00010046
RAX: 0000000000068ee8 RBX: ffffffff8192a0a0 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000001dfb RDI: ffffffff816e70b0
RBP: ffffffffa00cd000 R08: ffffffff816818f8 R09: ffff88007c923558
R10: ffffe20002ad2408 R11: ffffffff811028ec R12: ffffffff8192a0a0
R13: 000000000002bd90 R14: 0000000000000000 R15: 0000000000000296
FS:  00007f9d1cee56f0(0000) GS:ffffffff814a58c0(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000008 CR3: 0000000073047000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process modprobe (pid: 4941, threadinfo ffff88007bcb2000, task
ffff8800758d1fc0)
Stack:  ffffffff81057376 0000000000000000 ffffffffa00f7b00
0000000000000000
 0000000000000080 0000000000618278 00007fff24f16720 0000000000000000
 ffffffff8105d37a ffffffffa00f7b00 ffffffff8105d591 313132303863616d
Call Trace:
 [<ffffffff81057376>] ? lockdep_free_key_range+0x61/0xf5
 [<ffffffff8105d37a>] ? free_module+0xd4/0xe4
 [<ffffffff8105d591>] ? sys_delete_module+0x1de/0x1f9
 [<ffffffff8106dbfa>] ? audit_syscall_entry+0x12d/0x160
 [<ffffffff8100be2b>] ? system_call_fastpath+0x16/0x1b

Code: b2 00 01 00 00 00 c3 31 f6 49 c7 c0 10 8a 61 81 eb 32 49 39 38
75 26 48 98 48 6b c0 38 48 8b 90 08 8a 61 81 48 8b 88 00 8a 61 81 <48>
89 51 08 48 89 0a 48 c7 80 08 8a 61 81 00 02 20 00 48 ff c6
RIP  [<ffffffff810531e1>] zap_class+0x24/0x82
 RSP <ffff88007bcb3eb0>
CR2: 0000000000000008
---[ end trace a1297e0c4abb0f2e ]---

The root cause for this oops is in add_lock_to_list() when
save_trace() fails due to MAX_STACK_TRACE_ENTRIES is reached,
entry->class is assigned but entry is never added into any lock list.
This makes the list_del_rcu() in zap_class() oops later when the
module is unloaded. This patch fixes the problem by assigning
entry->class after save_trace() returns success.

Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b5db51d2803d..dbda475b13bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -875,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	if (!entry)
 		return 0;
 
-	entry->class = this;
-	entry->distance = distance;
 	if (!save_trace(&entry->trace))
 		return 0;
 
+	entry->class = this;
+	entry->distance = distance;
 	/*
 	 * Since we never remove from the dependency list, the list can
 	 * be walked lockless by other CPUs, it's only allocation
-- 
cgit v1.2.3


From 2633f0e57b1127f4060d70bf460140dc9bb19386 Mon Sep 17 00:00:00 2001
From: Steve VanDeBogart <vandebo-lkml@NerdBox.Net>
Date: Tue, 26 Aug 2008 15:14:36 -0700
Subject: exit signals: use of uninitialized field notify_count

task->signal->notify_count is only initialized if
task->signal->group_exit_task is not NULL.  Reorder a conditional so
that uninitialised memory is not used.  Found by Valgrind.

Signed-off-by: Steve VanDeBogart <vandebo-lkml@nerdbox.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 38ec40630149..75c647387639 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -918,8 +918,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 
 	/* mt-exec, de_thread() is waiting for us */
 	if (thread_group_leader(tsk) &&
-	    tsk->signal->notify_count < 0 &&
-	    tsk->signal->group_exit_task)
+	    tsk->signal->group_exit_task &&
+	    tsk->signal->notify_count < 0)
 		wake_up_process(tsk->signal->group_exit_task);
 
 	write_unlock_irq(&tasklist_lock);
-- 
cgit v1.2.3


From 0cd418ddb1ee88df7d16d5df06cb2da68eceb9e4 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Mon, 18 Aug 2008 14:39:21 -0700
Subject: rcuclassic: fix compiler warning

CC      kernel/rcuclassic.o
kernel/rcuclassic.c: In function 'rcu_init_percpu_data':
kernel/rcuclassic.c:705: warning: comparison of distinct pointer types lacks a cast
kernel/rcuclassic.c:713: warning: comparison of distinct pointer types lacks a cast

flags should be unsigned long.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 3f6918966bda..743cf0550ff4 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -714,7 +714,7 @@ void rcu_check_callbacks(int cpu, int user)
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
-	long flags;
+	unsigned long flags;
 
 	spin_lock_irqsave(&rcp->lock, flags);
 	memset(rdp, 0, sizeof(*rdp));
-- 
cgit v1.2.3


From f42ac38c59e0a03d6da0c24a63fb211393f484b0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 27 Aug 2008 09:14:40 -0400
Subject: ftrace: disable tracing for suspend to ram

I've been painstakingly debugging the issue with suspend to ram and
ftraced. The 2.6.28 code does not have this issue, but since the mcount
recording is not going to be in 27, this must be solved for the ftrace
daemon version.

The resume from suspend to ram would reboot because it was triple
faulting. Debugging further, I found that calling the mcount function
itself was not an issue, but it would fault when it incremented
preempt_count. preempt_count is on the tasks info structure that is on the
low memory address of the task's stack.  For some reason, it could not
write to it. Resuming out of suspend to ram does quite a lot of funny
tricks to get to work, so it is not surprising at all that simply doing a
preempt_disable() would cause a fault.

Thanks to Rafael for suggesting to add a "while (1);" to find the place in
resuming that is causing the fault. I would place the loop somewhere in
the code, compile and reboot and see if it would either reboot (hit the
fault) or simply hang (hit the loop).  Doing this over and over again, I
narrowed it down that it was happening in enable_nonboot_cpus.

At this point, I found that it is easier to simply disable tracing around
the suspend code, instead of searching for the particular function that
can not handle doing a preempt_disable.

This patch disables the tracer as it suspends and reenables it on resume.

I tested this patch on my Laptop, and it can resume fine with the patch.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0b7476f5d2a6..540b16b68565 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,7 @@
 #include <linux/freezer.h>
 #include <linux/vmstat.h>
 #include <linux/syscalls.h>
+#include <linux/ftrace.h>
 
 #include "power.h"
 
@@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state)
  */
 int suspend_devices_and_enter(suspend_state_t state)
 {
-	int error;
+	int error, ftrace_save;
 
 	if (!suspend_ops)
 		return -ENOSYS;
@@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 			goto Close;
 	}
 	suspend_console();
+	ftrace_save = __ftrace_enabled_save();
 	suspend_test_start();
 	error = device_suspend(PMSG_SUSPEND);
 	if (error) {
@@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 	suspend_test_start();
 	device_resume(PMSG_RESUME);
 	suspend_test_finish("resume devices");
+	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
  Close:
 	if (suspend_ops->end)
-- 
cgit v1.2.3


From f3ade837808121ff8bab9c56725f4fe40ec85a56 Mon Sep 17 00:00:00 2001
From: John Blackwood <john.blackwood@ccur.com>
Date: Tue, 26 Aug 2008 15:09:43 -0400
Subject: sched: fix sched_rt_rq_enqueue() resched idle

When sysctl_sched_rt_runtime is set to something other than -1 and the
CONFIG_RT_GROUP_SCHED kernel parameter is NOT enabled, we get into a state
where we see one or more CPUs idling forvever even though there are
real-time
tasks in their rt runqueue that are able to run (no longer throttled).

The sequence is:

- A real-time task is running when the timer sets the rt runqueue
    to throttled, and the rt task is resched_task()ed and switched
    out, and idle is switched in since there are no non-rt tasks to
    run on that cpu.

- Eventually the do_sched_rt_period_timer() runs and un-throttles
    the rt runqueue, but we just exit the timer interrupt and go back
    to executing the idle task in the idle loop forever.

If we change the sched_rt_rq_enqueue() routine to use some of the code
from the CONFIG_RT_GROUP_SCHED enabled version of this same routine and
resched_task() the currently executing task (idle in our case) if it is
a lower priority task than the higher rt task in the now un-throttled
runqueue, the problem is no longer observed.

Signed-off-by: John Blackwood <john.blackwood@ccur.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 998ba54b4543..07d9b3307907 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 
 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
+	if (rt_rq->rt_nr_running)
+		resched_task(rq_of_rt_rq(rt_rq)->curr);
 }
 
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-- 
cgit v1.2.3


From aec0a5142cb52aaa152d962d84a838e25d520742 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Thu, 28 Aug 2008 14:42:49 +0530
Subject: sched: call resched_task() conditionally from new task wake up path

- During wake up of a new task, task_new_fair() can do a resched_task()
  on the current task. Later in the code path, check_preempt_curr() also ends
  up doing the same, which can be avoided. Check if TIF_NEED_RESCHED is
  already set for the current task.

- task_new_fair() does a resched_task() on the current task unconditionally.
  This can be done only in case when child runs before the parent.

So this is a small speedup.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..8264bb5dbd51 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1348,6 +1348,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(se == pse))
 		return;
 
+	/*
+	 * We can come here with TIF_NEED_RESCHED already set from new task
+	 * wake up path.
+	 */
+	if (test_tsk_need_resched(curr))
+		return;
+
 	cfs_rq_of(pse)->next = pse;
 
 	/*
@@ -1620,10 +1627,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
+		resched_task(rq->curr);
 	}
 
 	enqueue_task_fair(rq, p, 0);
-	resched_task(rq->curr);
 }
 
 /*
-- 
cgit v1.2.3


From 29cbef4869bf288256ab76c7dc674cb132b35de2 Mon Sep 17 00:00:00 2001
From: Joe Korty <joe.korty@ccur.com>
Date: Wed, 27 Aug 2008 11:21:39 -0400
Subject: make might_sleep() display the oopsing process

Expand might_sleep's printk to indicate the oopsing process.

Signed-off-by: Joe Korty <joe.korty@ccur.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 93f5ea08be97..6e283dc7679a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8233,8 +8233,8 @@ void __might_sleep(char *file, int line)
 		prev_jiffy = jiffies;
 		printk(KERN_ERR "BUG: sleeping function called from invalid"
 				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
+		printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(), current->pid, current->comm);
 		debug_show_held_locks(current);
 		if (irqs_disabled())
 			print_irqtrace_events(current);
-- 
cgit v1.2.3


From aef745fca016aea45adae5c98e8698904dd8ad51 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 Aug 2008 11:34:43 +0200
Subject: sched: clean up __might_sleep()

add KERN_ to the printout and clean up the flow a bit.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6e283dc7679a..b112caaa400a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8226,20 +8226,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) &&
-	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
-		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-			return;
-		prev_jiffy = jiffies;
-		printk(KERN_ERR "BUG: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n",
-			in_atomic(), irqs_disabled(), current->pid, current->comm);
-		debug_show_held_locks(current);
-		if (irqs_disabled())
-			print_irqtrace_events(current);
-		dump_stack();
-	}
+	if ((!in_atomic() && !irqs_disabled()) ||
+		    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
-- 
cgit v1.2.3


From 7940ca3605b77f20cc6e9852e4ca6f2d725b5653 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 19 Aug 2008 13:40:47 +0200
Subject: sched: extract walk_tg_tree(), fix

fix:

 kernel/sched.c: In function '__rt_schedulable':
 kernel/sched.c:8771: error: implicit declaration of function 'walk_tg_tree'
 kernel/sched.c:8771: error: 'tg_nop' undeclared (first use in this function)
 kernel/sched.c:8771: error: (Each undeclared identifier is reported only once
 kernel/sched.c:8771: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e41bdae2778d..703f56d5db5f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED)
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
-- 
cgit v1.2.3


From cc2991cf15ae92fa30b3ea9f56a8a5a337bd33c7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:03 +0200
Subject: sched: rt-bandwidth accounting fix

It fixes an accounting bug where we would continue accumulating runtime
even though the bandwidth control is disabled. This would lead to very long
throttle periods once bandwidth control gets turned on again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 07d9b3307907..552310798dad 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -440,9 +440,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 {
 	u64 runtime = sched_rt_runtime(rt_rq);
 
-	if (runtime == RUNTIME_INF)
-		return 0;
-
 	if (rt_rq->rt_throttled)
 		return rt_rq_throttled(rt_rq);
 
@@ -493,9 +490,11 @@ static void update_curr_rt(struct rq *rq)
 		rt_rq = rt_rq_of_se(rt_se);
 
 		spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_time += delta_exec;
-		if (sched_rt_runtime_exceeded(rt_rq))
-			resched_task(curr);
+		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+			rt_rq->rt_time += delta_exec;
+			if (sched_rt_runtime_exceeded(rt_rq))
+				resched_task(curr);
+		}
 		spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
-- 
cgit v1.2.3


From 41108eb10142e0552f2de1e4c0675b108c5f018f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 28 Aug 2008 14:39:12 +0200
Subject: ftrace: disable tracing for hibernation

In accordance with commit f42ac38c59e0a03d6da0c24a63fb211393f484b0
("ftrace: disable tracing for suspend to ram"), disable tracing
around the suspend code in hibernation code paths.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/disk.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f011e0870b52..bbd85c60f741 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -21,6 +21,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <linux/ftrace.h>
 
 #include "power.h"
 
@@ -255,7 +256,7 @@ static int create_image(int platform_mode)
 
 int hibernation_snapshot(int platform_mode)
 {
-	int error;
+	int error, ftrace_save;
 
 	/* Free memory before shutting down devices. */
 	error = swsusp_shrink_memory();
@@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode)
 		goto Close;
 
 	suspend_console();
+	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_FREEZE);
 	if (error)
 		goto Recover_platform;
@@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode)
  Resume_devices:
 	device_resume(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
  Close:
 	platform_end(platform_mode);
@@ -366,10 +369,11 @@ static int resume_target_kernel(void)
 
 int hibernation_restore(int platform_mode)
 {
-	int error;
+	int error, ftrace_save;
 
 	pm_prepare_console();
 	suspend_console();
+	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_QUIESCE);
 	if (error)
 		goto Finish;
@@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode)
 	platform_restore_cleanup(platform_mode);
 	device_resume(PMSG_RECOVER);
  Finish:
+	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
 	pm_restore_console();
 	return error;
@@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode)
 
 int hibernation_platform_enter(void)
 {
-	int error;
+	int error, ftrace_save;
 
 	if (!hibernation_ops)
 		return -ENOSYS;
@@ -411,6 +416,7 @@ int hibernation_platform_enter(void)
 		goto Close;
 
 	suspend_console();
+	ftrace_save = __ftrace_enabled_save();
 	error = device_suspend(PMSG_HIBERNATE);
 	if (error) {
 		if (hibernation_ops->recover)
@@ -445,6 +451,7 @@ int hibernation_platform_enter(void)
 	hibernation_ops->finish();
  Resume_devices:
 	device_resume(PMSG_RESTORE);
+	__ftrace_enabled_restore(ftrace_save);
 	resume_console();
  Close:
 	hibernation_ops->end();
-- 
cgit v1.2.3


From 316d9679f33caf7e683471647d1472bfe133d858 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Fri, 29 Aug 2008 20:06:23 +0200
Subject: Don't trigger softlockup detector on network fs blocked tasks

Pulling the ethernet cable on a 2.6.27-rc system with NFS mounts
currently leads to an ongoing flood of soft lockup detector backtraces
for all tasks blocked on the NFS mounts when the hickup takes
longer than 120s.

I don't think NFS problems should be all that noisy.

Luckily there's a reasonably easy way to distingush this case.

Don't report task softlockup warnings for tasks in TASK_KILLABLE
state, which is used by the network file systems.

I believe this patch is a 2.6.27 candidate.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/softlockup.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b75b492fbfcf..1a07f8ca4b92 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -180,6 +180,10 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
 	if (t->flags & PF_FROZEN)
 		return;
 
+	/* Don't check for tasks waiting on network file systems like NFS */
+	if (t->state & TASK_KILLABLE)
+		return;
+
 	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
 		t->last_switch_count = switch_count;
 		t->last_switch_timestamp = now;
-- 
cgit v1.2.3


From bef69ea0dcce574a425feb0a5aa4c63dd108b9a6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 29 Aug 2008 20:18:31 -0700
Subject: Resource handling: add 'insert_resource_expand_to_fit()' function

Not used anywhere yet, but this complements the existing plain
'insert_resource()' functionality with a version that can expand the
resource we are adding in order to fix up any conflicts it has with
existing resources.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 88 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index f5b518eabefe..cf0a178c7513 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
 
 EXPORT_SYMBOL(allocate_resource);
 
-/**
- * insert_resource - Inserts a resource in the resource tree
- * @parent: parent of the new resource
- * @new: new resource to insert
- *
- * Returns 0 on success, -EBUSY if the resource can't be inserted.
- *
- * This function is equivalent to request_resource when no conflict
- * happens. If a conflict happens, and the conflicting resources
- * entirely fit within the range of the new resource, then the new
- * resource is inserted and the conflicting resources become children of
- * the new resource.
+/*
+ * Insert a resource into the resource tree. If successful, return NULL,
+ * otherwise return the conflicting resource (compare to __request_resource())
  */
-int insert_resource(struct resource *parent, struct resource *new)
+static struct resource * __insert_resource(struct resource *parent, struct resource *new)
 {
-	int result;
 	struct resource *first, *next;
 
-	write_lock(&resource_lock);
-
 	for (;; parent = first) {
-	 	result = 0;
 		first = __request_resource(parent, new);
 		if (!first)
-			goto out;
+			return first;
 
-		result = -EBUSY;
 		if (first == parent)
-			goto out;
+			return first;
 
 		if ((first->start > new->start) || (first->end < new->end))
 			break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
 	for (next = first; ; next = next->sibling) {
 		/* Partial overlap? Bad, and unfixable */
 		if (next->start < new->start || next->end > new->end)
-			goto out;
+			return next;
 		if (!next->sibling)
 			break;
 		if (next->sibling->start > new->end)
 			break;
 	}
 
-	result = 0;
-
 	new->parent = parent;
 	new->sibling = next->sibling;
 	new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
 			next = next->sibling;
 		next->sibling = new;
 	}
+	return NULL;
+}
 
- out:
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is equivalent to request_resource when no conflict
+ * happens. If a conflict happens, and the conflicting resources
+ * entirely fit within the range of the new resource, then the new
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+	struct resource *conflict;
+
+	write_lock(&resource_lock);
+	conflict = __insert_resource(parent, new);
+	write_unlock(&resource_lock);
+	return conflict ? -EBUSY : 0;
+}
+
+/**
+ * insert_resource_expand_to_fit - Insert a resource into the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Insert a resource into the resource tree, possibly expanding it in order
+ * to make it encompass any conflicting resources.
+ */
+void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
+{
+	if (new->parent)
+		return;
+
+	write_lock(&resource_lock);
+	for (;;) {
+		struct resource *conflict;
+
+		conflict = __insert_resource(root, new);
+		if (!conflict)
+			break;
+		if (conflict == root)
+			break;
+
+		/* Ok, expand resource to cover the conflict, then try again .. */
+		if (conflict->start < new->start)
+			new->start = conflict->start;
+		if (conflict->end > new->end)
+			new->end = conflict->end;
+
+		printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+	}
 	write_unlock(&resource_lock);
-	return result;
 }
 
 /**
-- 
cgit v1.2.3


From c4bacefb7aaf49da11a695f29d85d40909f17693 Mon Sep 17 00:00:00 2001
From: Cordelia <cordsam@linux.vnet.ibm.com>
Date: Mon, 18 Aug 2008 09:45:51 -0700
Subject: [PATCH] audit: Moved variable declaration to beginning of function

got rid of compilation warning:
ISO C90 forbids mixed declarations and code

Signed-off-by: Cordelia Sam <cordesam@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 972f8e61d36a..59cedfb040e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,10 +243,11 @@ static inline int open_arg(int flags, int mask)
 
 static int audit_match_perm(struct audit_context *ctx, int mask)
 {
+	unsigned n;
 	if (unlikely(!ctx))
 		return 0;
 
-	unsigned n = ctx->major;
+	n = ctx->major;
 	switch (audit_classify_syscall(ctx->arch, n)) {
 	case 0:	/* native */
 		if ((mask & AUDIT_PERM_WRITE) &&
-- 
cgit v1.2.3


From 6781f4ae30bbb8ebf31187b3c9304be16966f5a0 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 31 Aug 2008 20:31:55 -0700
Subject: kernel/resource.c: fix new kernel-doc warning

Fix kernel-doc warning for new function:

Warning(linux-2.6.27-rc5-git2//kernel/resource.c:448): No description found for parameter 'root'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index cf0a178c7513..03d796c1b2e9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -438,7 +438,7 @@ int insert_resource(struct resource *parent, struct resource *new)
 
 /**
  * insert_resource_expand_to_fit - Insert a resource into the resource tree
- * @parent: parent of the new resource
+ * @root: root resource descriptor
  * @new: new resource to insert
  *
  * Insert a resource into the resource tree, possibly expanding it in order
-- 
cgit v1.2.3


From cbaed698f37494b30b2449b51c728ae48630cb2b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 30 Aug 2008 21:08:40 +0400
Subject: softlockup: minor cleanup, don't check task->state twice

The recent commit 16d9679f33caf7e683471647d1472bfe133d858 changed
check_hung_task() to filter out the TASK_KILLABLE tasks. We can
move this check to the caller which has to test t->state anyway.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/softlockup.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 1a07f8ca4b92..cb838ee93a82 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -180,10 +180,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
 	if (t->flags & PF_FROZEN)
 		return;
 
-	/* Don't check for tasks waiting on network file systems like NFS */
-	if (t->state & TASK_KILLABLE)
-		return;
-
 	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
 		t->last_switch_count = switch_count;
 		t->last_switch_timestamp = now;
@@ -237,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
-		if (t->state & TASK_UNINTERRUPTIBLE)
+		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now);
 	} while_each_thread(g, t);
  unlock:
-- 
cgit v1.2.3


From add0d4dfd660e9e4fd0af3eac3cad23583c9558f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 2 Sep 2008 14:35:48 -0700
Subject: pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing

zap_pid_ns_processes() sets pid_ns->child_reaper = NULL, this is wrong.

Yes, we have already killed all tasks in this namespace, and sys_wait4()
doesn't see any child.  But this doesn't mean ->children list is empty, we
may have EXIT_DEAD tasks which are not visible to do_wait().  In that case
the subsequent forget_original_parent() will crash the kernel because it
will try to re-parent these tasks to the NULL reaper.

Even if there are no childs, it is not good that forget_original_parent()
uses reaper == NULL.

Change the code to set ->child_reaper = init_pid_ns.child_reaper instead.
We could use pid_ns->parent->child_reaper as well, I think this does not
really matter.  These EXIT_DEAD tasks are not visible to the new ->parent
after re-parenting, they will silently do release_task() eventually.

Note that we must change ->child_reaper, otherwise
forget_original_parent() will use reaper == father, and in that case we
will hit the (correct) BUG_ON(!list_empty(&father->children)).

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid_namespace.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ea567b78d1aa..598f1eec9826 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,9 +179,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 		rc = sys_wait4(-1, NULL, __WALL, NULL);
 	} while (rc != -ECHILD);
 
-
-	/* Child reaper for the pid namespace is going away */
-	pid_ns->child_reaper = NULL;
+	/*
+	 * We can not clear ->child_reaper or leave it alone.
+	 * There may by stealth EXIT_DEAD tasks on ->children,
+	 * forget_original_parent() must move them somewhere.
+	 */
+	pid_ns->child_reaper = init_pid_ns.child_reaper;
 	acct_exit_ns(pid_ns);
 	return;
 }
-- 
cgit v1.2.3


From 950bbabb5a804690a0201190de5c22837f72f83f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Tue, 2 Sep 2008 14:35:49 -0700
Subject: pid_ns: (BUG 11391) change ->child_reaper when init->group_leader
 exits

We don't change pid_ns->child_reaper when the main thread of the
subnamespace init exits.  As Robert Rex <robert.rex@exasol.com> pointed
out this is wrong.

Yes, the re-parenting itself works correctly, but if the reparented task
exits it needs ->parent->nsproxy->pid_ns in do_notify_parent(), and if the
main thread is zombie its ->nsproxy was already cleared by
exit_task_namespaces().

Introduce the new function, find_new_reaper(), which finds the new
->parent for the re-parenting and changes ->child_reaper if needed.  Kill
the now unneeded exit_child_reaper().

Also move the changing of ->child_reaper from zap_pid_ns_processes() to
find_new_reaper(), this consolidates the games with ->child_reaper and
makes it stable under tasklist_lock.

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=11391

Reported-by: Robert Rex <robert.rex@exasol.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c          | 78 ++++++++++++++++++++++----------------------------
 kernel/pid_namespace.c |  6 ----
 2 files changed, 34 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 75c647387639..25ed2ad986df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -831,26 +831,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
  * the child reaper process (ie "init") in our pid
  * space.
  */
+static struct task_struct *find_new_reaper(struct task_struct *father)
+{
+	struct pid_namespace *pid_ns = task_active_pid_ns(father);
+	struct task_struct *thread;
+
+	thread = father;
+	while_each_thread(father, thread) {
+		if (thread->flags & PF_EXITING)
+			continue;
+		if (unlikely(pid_ns->child_reaper == father))
+			pid_ns->child_reaper = thread;
+		return thread;
+	}
+
+	if (unlikely(pid_ns->child_reaper == father)) {
+		write_unlock_irq(&tasklist_lock);
+		if (unlikely(pid_ns == &init_pid_ns))
+			panic("Attempted to kill init!");
+
+		zap_pid_ns_processes(pid_ns);
+		write_lock_irq(&tasklist_lock);
+		/*
+		 * We can not clear ->child_reaper or leave it alone.
+		 * There may by stealth EXIT_DEAD tasks on ->children,
+		 * forget_original_parent() must move them somewhere.
+		 */
+		pid_ns->child_reaper = init_pid_ns.child_reaper;
+	}
+
+	return pid_ns->child_reaper;
+}
+
 static void forget_original_parent(struct task_struct *father)
 {
-	struct task_struct *p, *n, *reaper = father;
+	struct task_struct *p, *n, *reaper;
 	LIST_HEAD(ptrace_dead);
 
 	write_lock_irq(&tasklist_lock);
-
+	reaper = find_new_reaper(father);
 	/*
 	 * First clean up ptrace if we were using it.
 	 */
 	ptrace_exit(father, &ptrace_dead);
 
-	do {
-		reaper = next_thread(reaper);
-		if (reaper == father) {
-			reaper = task_child_reaper(father);
-			break;
-		}
-	} while (reaper->flags & PF_EXITING);
-
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
 		p->real_parent = reaper;
 		if (p->parent == father) {
@@ -959,39 +983,6 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
 
-static inline void exit_child_reaper(struct task_struct *tsk)
-{
-	if (likely(tsk->group_leader != task_child_reaper(tsk)))
-		return;
-
-	if (tsk->nsproxy->pid_ns == &init_pid_ns)
-		panic("Attempted to kill init!");
-
-	/*
-	 * @tsk is the last thread in the 'cgroup-init' and is exiting.
-	 * Terminate all remaining processes in the namespace and reap them
-	 * before exiting @tsk.
-	 *
-	 * Note that @tsk (last thread of cgroup-init) may not necessarily
-	 * be the child-reaper (i.e main thread of cgroup-init) of the
-	 * namespace i.e the child_reaper may have already exited.
-	 *
-	 * Even after a child_reaper exits, we let it inherit orphaned children,
-	 * because, pid_ns->child_reaper remains valid as long as there is
-	 * at least one living sub-thread in the cgroup init.
-
-	 * This living sub-thread of the cgroup-init will be notified when
-	 * a child inherited by the 'child-reaper' exits (do_notify_parent()
-	 * uses __group_send_sig_info()). Further, when reaping child processes,
-	 * do_wait() iterates over children of all living sub threads.
-
-	 * i.e even though 'child_reaper' thread is listed as the parent of the
-	 * orphaned children, any living sub-thread in the cgroup-init can
-	 * perform the role of the child_reaper.
-	 */
-	zap_pid_ns_processes(tsk->nsproxy->pid_ns);
-}
-
 NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
@@ -1051,7 +1042,6 @@ NORET_TYPE void do_exit(long code)
 	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
-		exit_child_reaper(tsk);
 		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 	}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 598f1eec9826..fab8ea86fac3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,12 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 		rc = sys_wait4(-1, NULL, __WALL, NULL);
 	} while (rc != -ECHILD);
 
-	/*
-	 * We can not clear ->child_reaper or leave it alone.
-	 * There may by stealth EXIT_DEAD tasks on ->children,
-	 * forget_original_parent() must move them somewhere.
-	 */
-	pid_ns->child_reaper = init_pid_ns.child_reaper;
 	acct_exit_ns(pid_ns);
 	return;
 }
-- 
cgit v1.2.3


From 9d3593574702ae1899e23a1535da1ac71f928042 Mon Sep 17 00:00:00 2001
From: John Kacur <jkacur@gmail.com>
Date: Tue, 2 Sep 2008 14:36:13 -0700
Subject: pm_qos_requirement might sleep

Make PM_QOS and CPU_IDLE play nicer when run with the RT-Preempt kernel.

The purpose of the patch is to remove the spin_lock around the read in the
function pm_qos_requirement - since spinlocks can sleep in -rt and this
function is called from idle.

CPU_IDLE polls the target_value's of some of the pm_qos parameters from
the idle loop causing sleeping locking warnings.  Changing the
target_value to an atomic avoids this issue.

Remove the spinlock in pm_qos_requirement by making target_value an atomic
type.

Signed-off-by: mark gross <mgross@linux.intel.com>
Signed-off-by: John Kacur <jkacur@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pm_qos_params.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index da9c2dda6a4e..dfdec524d1b7 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -43,7 +43,7 @@
 #include <linux/uaccess.h>
 
 /*
- * locking rule: all changes to target_value or requirements or notifiers lists
+ * locking rule: all changes to requirements or notifiers lists
  * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
  * held, taken with _irqsave.  One lock to rule them all
  */
@@ -66,7 +66,7 @@ struct pm_qos_object {
 	struct miscdevice pm_qos_power_miscdev;
 	char *name;
 	s32 default_value;
-	s32 target_value;
+	atomic_t target_value;
 	s32 (*comparitor)(s32, s32);
 };
 
@@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
 	.notifiers = &cpu_dma_lat_notifier,
 	.name = "cpu_dma_latency",
 	.default_value = 2000 * USEC_PER_SEC,
-	.target_value = 2000 * USEC_PER_SEC,
+	.target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
 	.comparitor = min_compare
 };
 
@@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
 	.notifiers = &network_lat_notifier,
 	.name = "network_latency",
 	.default_value = 2000 * USEC_PER_SEC,
-	.target_value = 2000 * USEC_PER_SEC,
+	.target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
 	.comparitor = min_compare
 };
 
@@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
 	.notifiers = &network_throughput_notifier,
 	.name = "network_throughput",
 	.default_value = 0,
-	.target_value = 0,
+	.target_value = ATOMIC_INIT(0),
 	.comparitor = max_compare
 };
 
@@ -150,11 +150,11 @@ static void update_target(int target)
 		extreme_value = pm_qos_array[target]->comparitor(
 				extreme_value, node->value);
 	}
-	if (pm_qos_array[target]->target_value != extreme_value) {
+	if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
 		call_notifier = 1;
-		pm_qos_array[target]->target_value = extreme_value;
+		atomic_set(&pm_qos_array[target]->target_value, extreme_value);
 		pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
-			pm_qos_array[target]->target_value);
+			atomic_read(&pm_qos_array[target]->target_value));
 	}
 	spin_unlock_irqrestore(&pm_qos_lock, flags);
 
@@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
  */
 int pm_qos_requirement(int pm_qos_class)
 {
-	int ret_val;
-	unsigned long flags;
-
-	spin_lock_irqsave(&pm_qos_lock, flags);
-	ret_val = pm_qos_array[pm_qos_class]->target_value;
-	spin_unlock_irqrestore(&pm_qos_lock, flags);
-
-	return ret_val;
+	return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
 }
 EXPORT_SYMBOL_GPL(pm_qos_requirement);
 
-- 
cgit v1.2.3


From b380b0d4f7dffcc235c0facefa537d4655619101 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Thu, 4 Sep 2008 17:05:57 +0100
Subject: forgotten refcount on sysctl root table

We should've set refcount on the root sysctl table; otherwise we'll blow
up the first time we get down to zero dynamically registered sysctl
tables.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fe4713347275..50ec0886fa3d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -159,6 +159,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
 static struct ctl_table root_table[];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
+	.count = 1,
 	.ctl_table = root_table,
 	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
 	.root = &sysctl_table_root,
-- 
cgit v1.2.3


From 268364a0f48aee2f851f9d1ef8a6cda0f3039ef1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 4 Sep 2008 21:02:44 +0200
Subject: IO resources: add reserve_region_with_split()

add reserve_region_with_split() to not lose e820 reserved entries if
they overlap with existing IO regions:

with test case by extend 0xe0000000 - 0xeffffff to 0xdd800000 -
we get:
	e0000000-efffffff : PCI MMCONFIG 0
		 e0000000-efffffff : reserved

and in /proc/iomem we get:
	found conflict for reserved [dd800000, efffffff], try to reserve with split
	    __reserve_region_with_split: (PCI Bus #80) [dd000000, ddffffff], res: (reserved) [dd800000, efffffff]
	    __reserve_region_with_split: (PCI Bus #00) [de000000, dfffffff], res: (reserved) [de000000, efffffff]
	initcall pci_subsys_init+0x0/0x121 returned 0 after 381 msecs
in dmesg

various fixes and improvements suggested by Linus.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/resource.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 03d796c1b2e9..414d6fc9131e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -516,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
 	return result;
 }
 
+static void __init __reserve_region_with_split(struct resource *root,
+		resource_size_t start, resource_size_t end,
+		const char *name)
+{
+	struct resource *parent = root;
+	struct resource *conflict;
+	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+
+	if (!res)
+		return;
+
+	res->name = name;
+	res->start = start;
+	res->end = end;
+	res->flags = IORESOURCE_BUSY;
+
+	for (;;) {
+		conflict = __request_resource(parent, res);
+		if (!conflict)
+			break;
+		if (conflict != parent) {
+			parent = conflict;
+			if (!(conflict->flags & IORESOURCE_BUSY))
+				continue;
+		}
+
+		/* Uhhuh, that didn't work out.. */
+		kfree(res);
+		res = NULL;
+		break;
+	}
+
+	if (!res) {
+		printk(KERN_DEBUG "    __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n",
+			 conflict->name, conflict->start, conflict->end,
+			 name, start, end);
+
+		/* failed, split and try again */
+
+		/* conflict coverred whole area */
+		if (conflict->start <= start && conflict->end >= end)
+			return;
+
+		if (conflict->start > start)
+			__reserve_region_with_split(root, start, conflict->start-1, name);
+		if (!(conflict->flags & IORESOURCE_BUSY)) {
+			resource_size_t common_start, common_end;
+
+			common_start = max(conflict->start, start);
+			common_end = min(conflict->end, end);
+			if (common_start < common_end)
+				__reserve_region_with_split(root, common_start, common_end, name);
+		}
+		if (conflict->end < end)
+			__reserve_region_with_split(root, conflict->end+1, end, name);
+	}
+
+}
+
+void reserve_region_with_split(struct resource *root,
+		resource_size_t start, resource_size_t end,
+		const char *name)
+{
+	write_lock(&resource_lock);
+	__reserve_region_with_split(root, start, end, name);
+	write_unlock(&resource_lock);
+}
+
 EXPORT_SYMBOL(adjust_resource);
 
 /**
-- 
cgit v1.2.3


From 7c1e76897492d92b6a1c2d6892494d39ded9680c Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Wed, 3 Sep 2008 21:36:50 +0000
Subject: clockevents: prevent clockevent event_handler ending up handler_noop

There is a ordering related problem with clockevents code, due to which
clockevents_register_device() called after tickless/highres switch
will not work. The new clockevent ends up with clockevents_handle_noop as
event handler, resulting in no timer activity.

The problematic path seems to be

* old device already has hrtimer_interrupt as the event_handler
* new clockevent device registers with a higher rating
* tick_check_new_device() is called
  * clockevents_exchange_device() gets called
    * old->event_handler is set to clockevents_handle_noop
  * tick_setup_device() is called for the new device
    * which sets new->event_handler using the old->event_handler which is noop.

Change the ordering so that new device inherits the proper handler.

This does not have any issue in normal case as most likely all the clockevent
devices are setup before the highres switch. But, can potentially be affecting
some corner case where HPET force detect happens after the highres switch.
This was a problem with HPET in MSI mode code that we have been experimenting
with.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/clockevents.c | 3 +--
 kernel/time/tick-common.c | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..1876b526c778 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -177,7 +177,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 /*
  * Noop handler when we shut down an event device
  */
-static void clockevents_handle_noop(struct clock_event_device *dev)
+void clockevents_handle_noop(struct clock_event_device *dev)
 {
 }
 
@@ -199,7 +199,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
 	 * released list and do a notify add later.
 	 */
 	if (old) {
-		old->event_handler = clockevents_handle_noop;
 		clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
 		list_del(&old->list);
 		list_add(&old->list, &clockevents_released);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 80c4336f4188..c4777193d567 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -161,6 +161,7 @@ static void tick_setup_device(struct tick_device *td,
 	} else {
 		handler = td->evtdev->event_handler;
 		next_event = td->evtdev->next_event;
+		td->evtdev->event_handler = clockevents_handle_noop;
 	}
 
 	td->evtdev = newdev;
-- 
cgit v1.2.3


From d4496b39559c6d43f83e4c08b899984f8b8089b5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Sep 2008 21:36:57 +0000
Subject: clockevents: prevent endless loop in periodic broadcast handler

The reprogramming of the periodic broadcast handler was broken,
when the first programming returned -ETIME. The clockevents code
stores the new expiry value in the clock events device next_event field
only when the programming time has not been elapsed yet. The loop in
question calculates the new expiry value from the next_event value
and therefor never increases.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-broadcast.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 31463d370b94..3044a88357fa 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
  */
 static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 {
+	ktime_t next;
+
 	tick_do_periodic_broadcast();
 
 	/*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 
 	/*
 	 * Setup the next period for devices, which do not have
-	 * periodic mode:
+	 * periodic mode. We read dev->next_event first and add to it
+	 * when the event alrady expired. clockevents_program_event()
+	 * sets dev->next_event only when the event is really
+	 * programmed to the device.
 	 */
-	for (;;) {
-		ktime_t next = ktime_add(dev->next_event, tick_period);
+	for (next = dev->next_event; ;) {
+		next = ktime_add(next, tick_period);
 
 		if (!clockevents_program_event(dev, next, ktime_get()))
 			return;
-- 
cgit v1.2.3


From 7205656ab48da29a95d7f55e43a81db755d3cb3a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Sep 2008 21:37:03 +0000
Subject: clockevents: enforce reprogram in oneshot setup

In tick_oneshot_setup we program the device to the given next_event,
but we do not check the return value. We need to make sure that the
device is programmed enforced so the interrupt handler engine starts
working. Split out the reprogramming function from tick_program_event()
and call it with the device, which was handed in to tick_setup_oneshot().
Set the force argument, so the devices is firing an interrupt.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-oneshot.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..06595c64b0c9 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,11 +23,11 @@
 #include "tick-internal.h"
 
 /**
- * tick_program_event
+ * tick_program_event internal worker function
  */
-int tick_program_event(ktime_t expires, int force)
+static int __tick_program_event(struct clock_event_device *dev,
+				ktime_t expires, int force)
 {
-	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 	ktime_t now = ktime_get();
 
 	while (1) {
@@ -40,6 +40,16 @@ int tick_program_event(ktime_t expires, int force)
 	}
 }
 
+/**
+ * tick_program_event
+ */
+int tick_program_event(ktime_t expires, int force)
+{
+	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+
+	return __tick_program_event(dev, expires, force);
+}
+
 /**
  * tick_resume_onshot - resume oneshot mode
  */
@@ -61,7 +71,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
 {
 	newdev->event_handler = handler;
 	clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
-	clockevents_program_event(newdev, next_event, ktime_get());
+	__tick_program_event(newdev, next_event, 1);
 }
 
 /**
-- 
cgit v1.2.3


From 9c17bcda991000351cb2373f78be7e4b1c44caa3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Sep 2008 21:37:08 +0000
Subject: clockevents: prevent multiple init/shutdown

While chasing the C1E/HPET bugreports I went through the clock events
code inch by inch and found that the broadcast device can be initialized
and shutdown multiple times. Multiple shutdowns are not critical, but
useless waste of time. Multiple initializations are simply broken. Another
CPU might have the device in use already after the first initialization and
the second init could just render it unusable again.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-broadcast.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 3044a88357fa..5744f40b2697 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -210,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
 	struct clock_event_device *bc, *dev;
 	struct tick_device *td;
 	unsigned long flags, *reason = why;
-	int cpu;
+	int cpu, bc_stopped;
 
 	spin_lock_irqsave(&tick_broadcast_lock, flags);
 
@@ -228,6 +228,8 @@ static void tick_do_broadcast_on_off(void *why)
 	if (!tick_device_is_functional(dev))
 		goto out;
 
+	bc_stopped = cpus_empty(tick_broadcast_mask);
+
 	switch (*reason) {
 	case CLOCK_EVT_NOTIFY_BROADCAST_ON:
 	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
@@ -250,9 +252,10 @@ static void tick_do_broadcast_on_off(void *why)
 		break;
 	}
 
-	if (cpus_empty(tick_broadcast_mask))
-		clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
-	else {
+	if (cpus_empty(tick_broadcast_mask)) {
+		if (!bc_stopped)
+			clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+	} else if (bc_stopped) {
 		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
 			tick_broadcast_start_periodic(bc);
 		else
@@ -501,9 +504,12 @@ static void tick_broadcast_clear_oneshot(int cpu)
  */
 void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
-	bc->event_handler = tick_handle_oneshot_broadcast;
-	clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
-	bc->next_event.tv64 = KTIME_MAX;
+	/* Set it up only once ! */
+	if (bc->event_handler != tick_handle_oneshot_broadcast) {
+		bc->event_handler = tick_handle_oneshot_broadcast;
+		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+		bc->next_event.tv64 = KTIME_MAX;
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 1fb9b7d29d8e85ba3196eaa7ab871bf76fc98d36 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 3 Sep 2008 21:37:14 +0000
Subject: clockevents: prevent endless loop lockup

The C1E/HPET bug reports on AMDX2/RS690 systems where tracked down to a
too small value of the HPET minumum delta for programming an event.

The clockevents code needs to enforce an interrupt event on the clock event
device in some cases. The enforcement code was stupid and naive, as it just
added the minimum delta to the current time and tried to reprogram the device.
When the minimum delta is too small, then this loops forever.

Add a sanity check. Allow reprogramming to fail 3 times, then print a warning
and double the minimum delta value to make sure, that this does not happen again.
Use the same function for both tick-oneshot and tick-broadcast code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-broadcast.c | 12 ++----------
 kernel/time/tick-internal.h  |  2 ++
 kernel/time/tick-oneshot.c   | 36 ++++++++++++++++++++++++++++++------
 3 files changed, 34 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5744f40b2697..2bc1f046151c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -372,16 +372,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
 static int tick_broadcast_set_event(ktime_t expires, int force)
 {
 	struct clock_event_device *bc = tick_broadcast_device.evtdev;
-	ktime_t now = ktime_get();
-	int res;
-
-	for(;;) {
-		res = clockevents_program_event(bc, expires, now);
-		if (!res || !force)
-			return res;
-		now = ktime_get();
-		expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
-	}
+
+	return tick_dev_program_event(bc, expires, force);
 }
 
 int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..0ffc2918ea6f 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -17,6 +17,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
 extern void tick_setup_oneshot(struct clock_event_device *newdev,
 			       void (*handler)(struct clock_event_device *),
 			       ktime_t nextevt);
+extern int tick_dev_program_event(struct clock_event_device *dev,
+				  ktime_t expires, int force);
 extern int tick_program_event(ktime_t expires, int force);
 extern void tick_oneshot_notify(void);
 extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 06595c64b0c9..2e35501e61dd 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -25,18 +25,42 @@
 /**
  * tick_program_event internal worker function
  */
-static int __tick_program_event(struct clock_event_device *dev,
-				ktime_t expires, int force)
+int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
+			   int force)
 {
 	ktime_t now = ktime_get();
+	int i;
 
-	while (1) {
+	for (i = 0;;) {
 		int ret = clockevents_program_event(dev, expires, now);
 
 		if (!ret || !force)
 			return ret;
+
+		/*
+		 * We tried 2 times to program the device with the given
+		 * min_delta_ns. If that's not working then we double it
+		 * and emit a warning.
+		 */
+		if (++i > 2) {
+			printk(KERN_WARNING "CE: __tick_program_event of %s is "
+			       "stuck %llx %llx\n", dev->name ? dev->name : "?",
+			       now.tv64, expires.tv64);
+			printk(KERN_WARNING
+			       "CE: increasing min_delta_ns %ld to %ld nsec\n",
+			       dev->min_delta_ns, dev->min_delta_ns << 1);
+			WARN_ON(1);
+
+			/* Double the min. delta and try again */
+			if (!dev->min_delta_ns)
+				dev->min_delta_ns = 5000;
+			else
+				dev->min_delta_ns <<= 1;
+			i = 0;
+		}
+
 		now = ktime_get();
-		expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
+		expires = ktime_add_ns(now, dev->min_delta_ns);
 	}
 }
 
@@ -47,7 +71,7 @@ int tick_program_event(ktime_t expires, int force)
 {
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 
-	return __tick_program_event(dev, expires, force);
+	return tick_dev_program_event(dev, expires, force);
 }
 
 /**
@@ -71,7 +95,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
 {
 	newdev->event_handler = handler;
 	clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
-	__tick_program_event(newdev, next_event, 1);
+	tick_dev_program_event(newdev, next_event, 1);
 }
 
 /**
-- 
cgit v1.2.3


From 56c7426b3951e4f35a71d695f1c982989399d6fd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 1 Sep 2008 16:44:23 +0200
Subject: sched_clock: fix NOHZ interaction

If HLT stops the TSC, we'll fail to account idle time, thereby inflating the
actual process times. Fix this by re-calibrating the clock against GTOD when
leaving nohz mode.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Avi Kivity <avi@qumranet.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7a46bde78c66..a87b0468568b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -162,6 +162,8 @@ void tick_nohz_stop_idle(int cpu)
 		ts->idle_lastupdate = now;
 		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		ts->idle_active = 0;
+
+		sched_clock_idle_wakeup_event(0);
 	}
 }
 
@@ -177,6 +179,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 	}
 	ts->idle_entrytime = now;
 	ts->idle_active = 1;
+	sched_clock_idle_sleep_event();
 	return now;
 }
 
-- 
cgit v1.2.3


From 49048622eae698e5c4ae61f7e71200f265ccc529 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Fri, 5 Sep 2008 18:12:23 +0200
Subject: sched: fix process time monotonicity

Spencer reported a problem where utime and stime were going negative despite
the fixes in commit b27f03d4bdc145a09fb7b0c0e004b29f1ee555fa. The suspected
reason for the problem is that signal_struct maintains it's own utime and
stime (of exited tasks), these are not updated using the new task_utime()
routine, hence sig->utime can go backwards and cause the same problem
to occur (sig->utime, adds tsk->utime and not task_utime()). This patch
fixes the problem

TODO: using max(task->prev_utime, derived utime) works for now, but a more
generic solution is to implement cputime_max() and use the cputime_gt()
function for comparison.

Reported-by: spencer@bluehost.com
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c  |  6 +++---
 kernel/sched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 25ed2ad986df..16395644a98f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
-		sig->utime = cputime_add(sig->utime, tsk->utime);
-		sig->stime = cputime_add(sig->stime, tsk->stime);
-		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+		sig->utime = cputime_add(sig->utime, task_utime(tsk));
+		sig->stime = cputime_add(sig->stime, task_stime(tsk));
+		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
 		sig->nvcsw += tsk->nvcsw;
diff --git a/kernel/sched.c b/kernel/sched.c
index 9a1ddb84e26d..1a5f73c1fcdc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4178,6 +4178,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }
 
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+	return p->utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+	return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+	clock_t utime = cputime_to_clock_t(p->utime),
+		total = utime + cputime_to_clock_t(p->stime);
+	u64 temp;
+
+	/*
+	 * Use CFS's precise accounting:
+	 */
+	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+	if (total) {
+		temp *= utime;
+		do_div(temp, total);
+	}
+	utime = (clock_t)temp;
+
+	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+	return p->prev_utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+	clock_t stime;
+
+	/*
+	 * Use CFS's precise accounting. (we subtract utime from
+	 * the total, to make sure the total observed by userspace
+	 * grows monotonically - apps rely on that):
+	 */
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+			cputime_to_clock_t(task_utime(p));
+
+	if (stime >= 0)
+		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+	return p->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+	return p->gtime;
+}
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
-- 
cgit v1.2.3


From 7300711e8c6824fcfbd42a126980ff50439d8dd0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 6 Sep 2008 03:01:45 +0200
Subject: clockevents: broadcast fixup possible waiters

Until the C1E patches arrived there where no users of periodic broadcast
before switching to oneshot mode. Now we need to trigger a possible
waiter for a periodic broadcast when switching to oneshot mode.
Otherwise we can starve them for ever.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2bc1f046151c..2f5a38294bf9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -491,6 +491,18 @@ static void tick_broadcast_clear_oneshot(int cpu)
 	cpu_clear(cpu, tick_broadcast_oneshot_mask);
 }
 
+static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
+{
+	struct tick_device *td;
+	int cpu;
+
+	for_each_cpu_mask_nr(cpu, *mask) {
+		td = &per_cpu(tick_cpu_device, cpu);
+		if (td->evtdev)
+			td->evtdev->next_event = expires;
+	}
+}
+
 /**
  * tick_broadcast_setup_oneshot - setup the broadcast device
  */
@@ -498,9 +510,32 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
 	/* Set it up only once ! */
 	if (bc->event_handler != tick_handle_oneshot_broadcast) {
+		int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+		int cpu = smp_processor_id();
+		cpumask_t mask;
+
 		bc->event_handler = tick_handle_oneshot_broadcast;
 		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
-		bc->next_event.tv64 = KTIME_MAX;
+
+		/* Take the do_timer update */
+		tick_do_timer_cpu = cpu;
+
+		/*
+		 * We must be careful here. There might be other CPUs
+		 * waiting for periodic broadcast. We need to set the
+		 * oneshot_mask bits for those and program the
+		 * broadcast device to fire.
+		 */
+		mask = tick_broadcast_mask;
+		cpu_clear(cpu, mask);
+		cpus_or(tick_broadcast_oneshot_mask,
+			tick_broadcast_oneshot_mask, mask);
+
+		if (was_periodic && !cpus_empty(mask)) {
+			tick_broadcast_init_next_event(&mask, tick_next_period);
+			tick_broadcast_set_event(tick_next_period, 1);
+		} else
+			bc->next_event.tv64 = KTIME_MAX;
 	}
 }
 
-- 
cgit v1.2.3


From c8bfff6dd4d41834f4952cbc49e28e31906a6188 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@poczta.fm>
Date: Fri, 5 Sep 2008 23:46:19 +0200
Subject: sched: compilation fix with gcc 3.4.6

I found that 2.6.27-rc5-mm1 does not compile with gcc 3.4.6.
The error is:
  CC      kernel/sched.o
kernel/sched.c: In function `start_rt_bandwidth':
kernel/sched.c:208: sorry, unimplemented: inlining failed in call to 'rt_bandwidth_enabled': function body not available
kernel/sched.c:214: sorry, unimplemented: called from here
make[1]: *** [kernel/sched.o] Error 1
make: *** [kernel] Error 2

It seems that the gcc 3.4.6 requires full inline definition before first usage.
The patch below fixes the compilation problem.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl> (if needed>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 703f56d5db5f..4de2bfb28c58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -204,7 +204,10 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 
-static inline int rt_bandwidth_enabled(void);
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
@@ -841,11 +844,6 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-static inline int rt_bandwidth_enabled(void)
-{
-	return sysctl_sched_rt_runtime >= 0;
-}
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
-- 
cgit v1.2.3


From 4ff4b9e19a80b73959ebeb28d1df40176686f0a8 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Date: Fri, 5 Sep 2008 14:05:31 -0700
Subject: ntp: fix calculation of the next jiffie to trigger RTC sync

We have a bug in the calculation of the next jiffie to trigger the RTC
synchronisation.  The aim here is to run sync_cmos_clock() as close as
possible to the middle of a second.  Which means we want this function to
be called less than or equal to half a jiffie away from when now.tv_nsec
equals 5e8 (500000000).

If this is not the case for a given call to the function, for this purpose
instead of updating the RTC we calculate the offset in nanoseconds to the
next point in time where now.tv_nsec will be equal 5e8.  The calculated
offset is then converted to jiffies as these are the unit used by the
timer.

Hovewer timespec_to_jiffies() used here uses a ceil()-type rounding mode,
where the resulting value is rounded up.  As a result the range of
now.tv_nsec when the timer will trigger is from 5e8 to 5e8 + TICK_NSEC
rather than the desired 5e8 - TICK_NSEC / 2 to 5e8 + TICK_NSEC / 2.

As a result if for example sync_cmos_clock() happens to be called at the
time when now.tv_nsec is between 5e8 + TICK_NSEC / 2 and 5e8 to 5e8 +
TICK_NSEC, it will simply be rescheduled HZ jiffies later, falling in the
same range of now.tv_nsec again.  Similarly for cases offsetted by an
integer multiple of TICK_NSEC.

This change addresses the problem by subtracting TICK_NSEC / 2 from the
nanosecond offset to the next point in time where now.tv_nsec will be
equal 5e8, effectively shifting the following rounding in
timespec_to_jiffies() so that it produces a rounded-to-nearest result.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..1ad46f3df6e7 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy)
 	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
 		fail = update_persistent_clock(now);
 
-	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
+	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
 	if (next.tv_nsec <= 0)
 		next.tv_nsec += NSEC_PER_SEC;
 
-- 
cgit v1.2.3


From 38736f475071b80b66be28af7b44c854073699cc Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Sat, 6 Sep 2008 14:50:23 +0530
Subject: sched: fix __load_balance_iterator() for cfq with only one task

The __load_balance_iterator() returns a NULL when there's only one
sched_entity which is a task. It is caused by the following code-path.

	/* Skip over entities that are not tasks */
	do {
		se = list_entry(next, struct sched_entity, group_node);
		next = next->next;
	} while (next != &cfs_rq->tasks && !entity_is_task(se));

	if (next == &cfs_rq->tasks)
		return NULL;
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
      This will return NULL even when se is a task.

As a side-effect, there was a regression in sched_mc behavior since 2.6.25,
since iter_move_one_task() when it calls load_balance_start_fair(),
would not get any tasks to move!

Fix this by checking if the last entity was a task or not.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8264bb5dbd51..a10ac0bcee64 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1458,7 +1458,7 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 		next = next->next;
 	} while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-	if (next == &cfs_rq->tasks)
+	if (next == &cfs_rq->tasks && !entity_is_task(se))
 		return NULL;
 
 	cfs_rq->balance_iterator = next;
-- 
cgit v1.2.3


From 3ba35573ad9a149a3af19625b502679283382f6b Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Sun, 31 Aug 2008 19:58:49 +0200
Subject: kernel/cpu.c: Move the CPU_DYING notifiers

When a cpu is taken offline, the CPU_DYING notifiers are called on the
dying cpu. According to <linux/notifiers.h>, the cpu should be "not
running any task, not handling interrupts, soon dead".

For the current implementation, this is not true:
- __cpu_disable can fail. If it fails, then the cpu will remain alive
  and happy.
- At least on x86, __cpu_disable() briefly enables the local interrupts
  to handle any outstanding interrupts.

What about moving CPU_DYING down a few lines, behind the __cpu_disable()
line?
There are only two CPU_DYING handlers in the kernel right now: one in
kvm, one in the scheduler. Both should work with the patch applied
[and: I'm not sure if either one handles a failing __cpu_disable()]

The patch survives simple offlining a cpu. kvm untested due to lack
of a test setup.

Signed-off-By: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..9e7ebde1331c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
 	struct take_cpu_down_param *param = _param;
 	int err;
 
-	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
-				param->hcpu);
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
 		return err;
 
+	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+				param->hcpu);
+
 	/* Force idle task to run as soon as we yield: it should
 	   immediately notice cpu is offline and die quickly. */
 	sched_idle_next();
-- 
cgit v1.2.3


From dfb512ec4834116124da61d6c1ee10fd0aa32bd6 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qualcomm.com>
Date: Fri, 29 Aug 2008 13:11:41 -0700
Subject: sched: arch_reinit_sched_domains() must destroy domains to force
 rebuild

What I realized recently is that calling rebuild_sched_domains() in
arch_reinit_sched_domains() by itself is not enough when cpusets are enabled.
partition_sched_domains() code is trying to avoid unnecessary domain rebuilds
and will not actually rebuild anything if new domain masks match the old ones.

What this means is that doing
     echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
on a system with cpusets enabled will not take affect untill something changes
in the cpuset setup (ie new sets created or deleted).

This patch fixes restore correct behaviour where domains must be rebuilt in
order to enable MC powersaving flags.

Test on quad-core Core2 box with both CONFIG_CPUSETS and !CONFIG_CPUSETS.
Also tested on dual-core Core2 laptop. Lockdep is happy and things are working
as expected.

Signed-off-by: Max Krasnyansky <maxk@qualcomm.com>
Tested-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..d72ee9a0eacd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7589,24 +7589,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * and partition_sched_domains() will fallback to the single partition
  * 'fallback_doms', it also forces the domains to be rebuilt.
  *
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
  * Call with hotplug lock held
  */
 void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			     struct sched_domain_attr *dattr_new)
 {
-	int i, j;
+	int i, j, n;
 
 	mutex_lock(&sched_domains_mutex);
 
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL)
-		ndoms_new = 0;
+	n = doms_new ? ndoms_new : 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < ndoms_new; j++) {
+		for (j = 0; j < n; j++) {
 			if (cpus_equal(doms_cur[i], doms_new[j])
 			    && dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
@@ -7619,7 +7622,6 @@ match1:
 
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
-		ndoms_new = 1;
 		doms_new = &fallback_doms;
 		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
 		dattr_new = NULL;
@@ -7656,8 +7658,13 @@ match2:
 int arch_reinit_sched_domains(void)
 {
 	get_online_cpus();
+
+	/* Destroy domains first to force the rebuild */
+	partition_sched_domains(0, NULL, NULL);
+
 	rebuild_sched_domains();
 	put_online_cpus();
+
 	return 0;
 }
 
@@ -7741,7 +7748,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_ONLINE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		partition_sched_domains(0, NULL, NULL);
+		partition_sched_domains(1, NULL, NULL);
 		return NOTIFY_OK;
 
 	default:
-- 
cgit v1.2.3


From e545a6140b698b2494daf0b32107bdcc5e901390 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Sun, 7 Sep 2008 16:57:22 +0200
Subject: kernel/cpu.c: create a CPU_STARTING cpu_chain notifier

Right now, there is no notifier that is called on a new cpu, before the new
cpu begins processing interrupts/softirqs.
Various kernel function would need that notification, e.g. kvm works around
by calling smp_call_function_single(), rcu polls cpu_online_map.

The patch adds a CPU_STARTING notification. It also adds a helper function
that sends the message to all cpu_chain handlers.

Tested on x86-64.
All other archs are untested. Especially on sparc, I'm not sure if I got
it right.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..dc45f2459efb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -453,6 +453,25 @@ out:
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
 
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+	unsigned long val = CPU_STARTING;
+
+#ifdef CONFIG_PM_SLEEP_SMP
+	if (cpu_isset(cpu, frozen_cpus))
+		val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+	raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
-- 
cgit v1.2.3


From 61c22c34c6f80a8e89cff5ff717627c54cc14fd4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Sep 2008 21:38:57 +0200
Subject: clockevents: remove WARN_ON which was used to gather information

The issue of the endless reprogramming loop due to a too small
min_delta_ns was fixed with the previous updates of the clock events
code, but we had no information about the spread of this problem. I
added a WARN_ON to get automated information via kerneloops.org and to
get some direct reports, which allowed me to analyse the affected
machines.

The WARN_ON has served its purpose and would be annoying for a release
kernel. Remove it and just keep the information about the increase of
the min_delta_ns value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-oneshot.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e35501e61dd..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -43,19 +43,17 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
 		 * and emit a warning.
 		 */
 		if (++i > 2) {
-			printk(KERN_WARNING "CE: __tick_program_event of %s is "
-			       "stuck %llx %llx\n", dev->name ? dev->name : "?",
-			       now.tv64, expires.tv64);
-			printk(KERN_WARNING
-			       "CE: increasing min_delta_ns %ld to %ld nsec\n",
-			       dev->min_delta_ns, dev->min_delta_ns << 1);
-			WARN_ON(1);
-
-			/* Double the min. delta and try again */
+			/* Increase the min. delta and try again */
 			if (!dev->min_delta_ns)
 				dev->min_delta_ns = 5000;
 			else
-				dev->min_delta_ns <<= 1;
+				dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+			printk(KERN_WARNING
+			       "CE: %s increasing min_delta_ns to %lu nsec\n",
+			       dev->name ? dev->name : "?",
+			       dev->min_delta_ns << 1);
+
 			i = 0;
 		}
 
-- 
cgit v1.2.3


From baf25731e54d06eb13dc4eda78c6dc7da4ce84e8 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Tue, 9 Sep 2008 11:26:33 +0800
Subject: sched: fix 2.6.27-rc5 couldn't boot on tulsa machine randomly

On my tulsa x86-64 machine, kernel 2.6.25-rc5 couldn't boot randomly.

Basically, function __enable_runtime forgets to reset rt_rq->rt_throttled
to 0. When every cpu is up, per-cpu migration_thread is created and it runs
very fast, sometimes to mark the corresponding rt_rq->rt_throttled to 1 very
quickly. After all cpus are up, with below calling chain:

   sched_init_smp => arch_init_sched_domains => build_sched_domains => ...
=> cpu_attach_domain => rq_attach_root => set_rq_online => ...
=> _enable_runtime

_enable_runtime is called against every rt_rq again, so rt_rq->rt_time is
reset to 0, but rt_rq->rt_throttled might be still 1. Later on function
do_sched_rt_period_timer couldn't reset it, and all RT tasks couldn't be
scheduled to run on that cpu. here is RT task migration_thread which is
woken up when a task is migrated to another cpu.

Below patch fixes it against 2.6.27-rc5.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..1113157b2058 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -350,6 +350,7 @@ static void __enable_runtime(struct rq *rq)
 		spin_lock(&rt_rq->rt_runtime_lock);
 		rt_rq->rt_runtime = rt_b->rt_runtime;
 		rt_rq->rt_time = 0;
+		rt_rq->rt_throttled = 0;
 		spin_unlock(&rt_rq->rt_runtime_lock);
 		spin_unlock(&rt_b->rt_runtime_lock);
 	}
-- 
cgit v1.2.3


From ec5d498991e87c74730509508b25c3959192b7e7 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 10 Sep 2008 17:00:19 -0700
Subject: sched: fix deadlock in setting scheduler parameter to zero

Andrei Gusev wrote:

> I played witch scheduler settings. After doing something like:
> echo -n 1000000 >sched_rt_period_us
>
> command is locked. I found in kernel.log:
>
> Sep 11 00:39:34 zaratustra
> Sep 11 00:39:34 zaratustra Pid: 4495, comm: bash Tainted: G        W
> (2.6.26.3 #12)
> Sep 11 00:39:34 zaratustra EIP: 0060:[<c0213fc7>] EFLAGS: 00210246 CPU: 0
> Sep 11 00:39:34 zaratustra EIP is at div64_u64+0x57/0x80
> Sep 11 00:39:34 zaratustra EAX: 0000389f EBX: 00000000 ECX: 00000000
> EDX: 00000000
> Sep 11 00:39:34 zaratustra ESI: d9800000 EDI: d9800000 EBP: 0000389f
> ESP: ea7a6edc
> Sep 11 00:39:34 zaratustra DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
> Sep 11 00:39:34 zaratustra Process bash (pid: 4495, ti=ea7a6000
> task=ea744000 task.ti=ea7a6000)
> Sep 11 00:39:34 zaratustra Stack: 00000000 000003e8 d9800000 0000389f
> c0119042 00000000 00000000 00000001
> Sep 11 00:39:34 zaratustra 00000000 00000000 ea7a6f54 00010000 00000000
> c04d2e80 00000001 000e7ef0
> Sep 11 00:39:34 zaratustra c01191a3 00000000 00000000 ea7a6fa0 00000001
> ffffffff c04d2e80 ea5b2480
> Sep 11 00:39:34 zaratustra Call Trace:
> Sep 11 00:39:34 zaratustra [<c0119042>] __rt_schedulable+0x52/0x130
> Sep 11 00:39:34 zaratustra [<c01191a3>] sched_rt_handler+0x83/0x120
> Sep 11 00:39:34 zaratustra [<c01a76a6>] proc_sys_call_handler+0xb6/0xd0
> Sep 11 00:39:34 zaratustra [<c01a76c0>] proc_sys_write+0x0/0x20
> Sep 11 00:39:34 zaratustra [<c01a76d9>] proc_sys_write+0x19/0x20
> Sep 11 00:39:34 zaratustra [<c016cc68>] vfs_write+0xa8/0x140
> Sep 11 00:39:34 zaratustra [<c016cdd1>] sys_write+0x41/0x80
> Sep 11 00:39:34 zaratustra [<c0103051>] sysenter_past_esp+0x6a/0x91
> Sep 11 00:39:34 zaratustra =======================
> Sep 11 00:39:34 zaratustra Code: c8 41 0f ad f3 d3 ee f6 c1 20 0f 45 de
> 31 f6 0f ad ef d3 ed f6 c1 20 0f 45 fd 0f 45 ee 31 c9 39 eb 89 fe 89 ea
> 77 08 89 e8 31 d2 <f7> f3 89 c1 89 f0 8b 7c 24 08 f7 f3 8b 74 24 04 89
> ca 8b 1c 24
> Sep 11 00:39:34 zaratustra EIP: [<c0213fc7>] div64_u64+0x57/0x80 SS:ESP
> 0068:ea7a6edc
> Sep 11 00:39:34 zaratustra ---[ end trace 4eaa2a86a8e2da22 ]---

fix the boundary condition.

sysctl_sched_rt_period=0 makes exception at to_ratio().

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cc1f81b50b82..98890807375b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8909,6 +8909,9 @@ static int sched_rt_global_constraints(void)
 	u64 rt_runtime, rt_period;
 	int ret = 0;
 
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
 	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
@@ -8925,6 +8928,9 @@ static int sched_rt_global_constraints(void)
 	unsigned long flags;
 	int i;
 
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-- 
cgit v1.2.3


From 72c57ed50663dc04b0b329beaec39b557c8ac5a5 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 11 Sep 2008 23:29:54 -0700
Subject: sysctl: Use CONFIG_SPARC instead of __sparc__ for ifdef tests.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sysctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fe4713347275..eda6162c58f2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -118,7 +118,7 @@ extern char modprobe_path[];
 extern int sg_big_buff;
 #endif
 
-#ifdef __sparc__
+#ifdef CONFIG_SPARC
 extern char reboot_command [];
 extern int stop_a_enabled;
 extern int scons_pwroff;
@@ -414,7 +414,7 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#ifdef __sparc__
+#ifdef CONFIG_SPARC
 	{
 		.ctl_name	= KERN_SPARC_REBOOT,
 		.procname	= "reboot-cmd",
-- 
cgit v1.2.3


From 17f04fbb0f7153d95ec33da81189b113cc778157 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 11 Sep 2008 23:33:53 -0700
Subject: sysctl: Use header file for sysctl knob declarations on sparc.

This also takes care of a sparse warning as scons_pwroff's definition
point.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sysctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index eda6162c58f2..da5152088cc3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -119,9 +119,7 @@ extern int sg_big_buff;
 #endif
 
 #ifdef CONFIG_SPARC
-extern char reboot_command [];
-extern int stop_a_enabled;
-extern int scons_pwroff;
+#include <asm/system.h>
 #endif
 
 #ifdef __hppa__
-- 
cgit v1.2.3


From 4e74339af6a59c061cf02f1ac497766bca1de19a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Sat, 13 Sep 2008 02:33:08 -0700
Subject: cpuset: avoid changing cpuset's cpus when -errno returned

After the patch:

commit 0b2f630a28d53b5a2082a5275bc3334b10373508
Author: Miao Xie <miaox@cn.fujitsu.com>
Date:   Fri Jul 25 01:47:21 2008 -0700

    cpusets: restructure the function update_cpumask() and update_nodemask()

It might happen that 'echo 0 > /cpuset/sub/cpus' returned failure but 'cpus'
has been changed, because cpus was changed before calling heap_init() which
may return -ENOMEM.

This patch restores the orginal behavior.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 37 +++++++++++++++----------------------
 1 file changed, 15 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f227bc172690..827cd9adccb2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -843,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
  * Called with cgroup_mutex held
  *
  * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
  * calling callback functions for each.
  *
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
  */
-static int update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 {
 	struct cgroup_scanner scan;
-	struct ptr_heap heap;
-	int retval;
-
-	/*
-	 * cgroup_scan_tasks() will initialize heap->gt for us.
-	 * heap_init() is still needed here for we should not change
-	 * cs->cpus_allowed when heap_init() fails.
-	 */
-	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
-	if (retval)
-		return retval;
 
 	scan.cg = cs->css.cgroup;
 	scan.test_task = cpuset_test_cpumask;
 	scan.process_task = cpuset_change_cpumask;
-	scan.heap = &heap;
-	retval = cgroup_scan_tasks(&scan);
-
-	heap_free(&heap);
-	return retval;
+	scan.heap = heap;
+	cgroup_scan_tasks(&scan);
 }
 
 /**
@@ -883,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
  */
 static int update_cpumask(struct cpuset *cs, const char *buf)
 {
+	struct ptr_heap heap;
 	struct cpuset trialcs;
 	int retval;
 	int is_load_balanced;
@@ -917,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
 		return 0;
 
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (retval)
+		return retval;
+
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
@@ -927,9 +920,9 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	 * Scan tasks in the cpuset, and update the cpumasks of any
 	 * that need an update.
 	 */
-	retval = update_tasks_cpumask(cs);
-	if (retval < 0)
-		return retval;
+	update_tasks_cpumask(cs, &heap);
+
+	heap_free(&heap);
 
 	if (is_load_balanced)
 		async_rebuild_sched_domains();
@@ -1965,7 +1958,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 		     nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
 		else {
-			update_tasks_cpumask(cp);
+			update_tasks_cpumask(cp, NULL);
 			update_tasks_nodemask(cp, &oldmems);
 		}
 	}
-- 
cgit v1.2.3


From 2344abbcbdb82140050e8be29d3d55e4f6fe860b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 Sep 2008 11:32:50 -0700
Subject: clockevents: make device shutdown robust

The device shut down does not cleanup the next_event variable of the
clock event device. So when the device is reactivated the possible
stale next_event value can prevent the device to be reprogrammed as it
claims to wait on a event already.

This is the root cause of the resurfacing suspend/resume problem,
where systems need key press to come back to life.

Fix this by setting next_event to KTIME_MAX when the device is shut
down. Use a separate function for shutdown which takes care of that
and only keep the direct set mode call in the broadcast code, where we
can not touch the next_event value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c    | 12 +++++++++++-
 kernel/time/tick-broadcast.c |  9 ++++-----
 kernel/time/tick-common.c    |  4 ++--
 kernel/time/tick-internal.h  |  2 ++
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1876b526c778..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -71,6 +71,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
 	}
 }
 
+/**
+ * clockevents_shutdown - shutdown the device and clear next_event
+ * @dev:	device to shutdown
+ */
+void clockevents_shutdown(struct clock_event_device *dev)
+{
+	clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+	dev->next_event.tv64 = KTIME_MAX;
+}
+
 /**
  * clockevents_program_event - Reprogram the clock event device.
  * @expires:	absolute expiry time (monotonic clock)
@@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
 
 	if (new) {
 		BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
-		clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
+		clockevents_shutdown(new);
 	}
 	local_irq_restore(flags);
 }
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2f5a38294bf9..f1f3eee28113 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -236,8 +236,7 @@ static void tick_do_broadcast_on_off(void *why)
 		if (!cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_set(cpu, tick_broadcast_mask);
 			if (td->mode == TICKDEV_MODE_PERIODIC)
-				clockevents_set_mode(dev,
-						     CLOCK_EVT_MODE_SHUTDOWN);
+				clockevents_shutdown(dev);
 		}
 		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
 			tick_broadcast_force = 1;
@@ -254,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
 
 	if (cpus_empty(tick_broadcast_mask)) {
 		if (!bc_stopped)
-			clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+			clockevents_shutdown(bc);
 	} else if (bc_stopped) {
 		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
 			tick_broadcast_start_periodic(bc);
@@ -306,7 +305,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
 
 	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
 		if (bc && cpus_empty(tick_broadcast_mask))
-			clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+			clockevents_shutdown(bc);
 	}
 
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -321,7 +320,7 @@ void tick_suspend_broadcast(void)
 
 	bc = tick_broadcast_device.evtdev;
 	if (bc)
-		clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+		clockevents_shutdown(bc);
 
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index c4777193d567..019315ebf9de 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -249,7 +249,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	 * not give it back to the clockevents layer !
 	 */
 	if (tick_is_broadcast_device(curdev)) {
-		clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
+		clockevents_shutdown(curdev);
 		curdev = NULL;
 	}
 	clockevents_exchange_device(curdev, newdev);
@@ -311,7 +311,7 @@ static void tick_suspend(void)
 	unsigned long flags;
 
 	spin_lock_irqsave(&tick_device_lock, flags);
-	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+	clockevents_shutdown(td->evtdev);
 	spin_unlock_irqrestore(&tick_device_lock, flags);
 }
 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 0ffc2918ea6f..6e9db9734aa6 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -10,6 +10,8 @@ extern int tick_do_timer_cpu __read_mostly;
 extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
 extern void tick_handle_periodic(struct clock_event_device *dev);
 
+extern void clockevents_shutdown(struct clock_event_device *dev);
+
 /*
  * NO_HZ / high resolution timer shared code
  */
-- 
cgit v1.2.3


From 15afe09bf496ae10c989e1a375a6b5da7bd3e16e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 20 Sep 2008 23:38:02 +0200
Subject: sched: wakeup preempt when small overlap

Lin Ming reported a 10% OLTP regression against 2.6.27-rc4.

The difference seems to come from different preemption agressiveness,
which affects the cache footprint of the workload and its effective
cache trashing.

Aggresively preempt a task if its avg overlap is very small, this should
avoid the task going to sleep and find it still running when we schedule
back to it - saving a wakeup.

Reported-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 12 ++++++------
 kernel/sched_fair.c     | 13 ++++++++++---
 kernel/sched_features.h |  1 +
 kernel/sched_idletask.c |  6 +++---
 kernel/sched_rt.c       |  2 +-
 5 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0d8905a1b8ca..ad9d39b021f8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -604,9 +604,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p);
+	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -2282,7 +2282,7 @@ out_running:
 	trace_mark(kernel_sched_wakeup,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2417,7 +2417,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	trace_mark(kernel_sched_wakeup_new,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -2877,7 +2877,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	check_preempt_curr(this_rq, p);
+	check_preempt_curr(this_rq, p, 0);
 }
 
 /*
@@ -6007,7 +6007,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
 		activate_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p);
+		check_preempt_curr(rq_dest, p, 0);
 	}
 done:
 	ret = 1;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a10ac0bcee64..7328383690f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,7 +1331,7 @@ static inline int depth_se(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
@@ -1367,6 +1367,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	if (sched_feat(WAKEUP_OVERLAP) && sync &&
+			se->avg_overlap < sysctl_sched_migration_cost &&
+			pse->avg_overlap < sysctl_sched_migration_cost) {
+		resched_task(curr);
+		return;
+	}
+
 	/*
 	 * preemption test can be made between sibling entities who are in the
 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
@@ -1649,7 +1656,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /*
@@ -1666,7 +1673,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
 	if (running)
 		resched_task(rq->curr);
 	else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca78154e..bf027a7accf8 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
+SCHED_FEAT(WAKEUP_OVERLAP, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
 /*
  * Idle tasks are unconditionally rescheduled:
  */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
 {
 	resched_task(rq->idle);
 }
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
 	if (running)
 		resched_task(rq->curr);
 	else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..6d2d0a5d030b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -783,7 +783,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
 {
 	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
-- 
cgit v1.2.3


From f681bbd656b01439be904250a1581ca9c27505a1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 22 Sep 2008 16:29:00 +0200
Subject: sched: turn off WAKEUP_OVERLAP

WAKEUP_OVERLAP is not a winner on a 16way box, running psql+sysbench:

       .27-rc7-NO_WAKEUP_OVERLAP  .27-rc7-WAKEUP_OVERLAP
-------------------------------------------------
    1:             694              811    +14.39%
    2:            1454             1427    -1.86%
    4:            3017             3070    +1.70%
    8:            5694             5808    +1.96%
   16:           10592            10612    +0.19%
   32:            9693             9647    -0.48%
   64:            8507             8262    -2.97%
  128:            8402             7087    -18.55%
  256:            8419             5124    -64.30%
  512:            7990             3671    -117.62%
-------------------------------------------------
  SUM:           64466            55524    -16.11%

... so turn it off by default.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index bf027a7accf8..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,4 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
-SCHED_FEAT(WAKEUP_OVERLAP, 1)
+SCHED_FEAT(WAKEUP_OVERLAP, 0)
-- 
cgit v1.2.3


From caea8a03702c147e8ae90da0801e7ba8297b1d46 Mon Sep 17 00:00:00 2001
From: Chris Friesen <cfriesen@nortel.com>
Date: Mon, 22 Sep 2008 11:06:09 -0600
Subject: sched: fix list traversal to use _rcu variant

load_balance_fair() calls rcu_read_lock() but then traverses the list
 using the regular list traversal routine.  This patch converts the
list traversal to use the _rcu version.

Signed-off-by: Chris Friesen <cfriesen@nortel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7328383690f1..3b89aa6594a9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1521,7 +1521,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	rcu_read_lock();
 	update_h_load(busiest_cpu);
 
-	list_for_each_entry(tg, &task_groups, list) {
+	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
-- 
cgit v1.2.3


From fa748203175de7c08f2df80e5a0eeca40329b5e2 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Mon, 22 Sep 2008 14:55:45 -0700
Subject: sched: fix init_hrtick() section mismatch warning

LD      kernel/built-in.o
WARNING: kernel/built-in.o(.text+0x326): Section mismatch in reference
from the function init_hrtick() to the variable
.cpuinit.data:hotplug_hrtick_nb.8
The function init_hrtick() references
the variable __cpuinitdata hotplug_hrtick_nb.8.
This is often because init_hrtick lacks a __cpuinitdata
annotation or the annotation of hotplug_hrtick_nb.8 is wrong.

Signed-off-by: Md.Rakib H. Mullick <rakib.mullick@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 98890807375b..13dd2db9fb2d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_DONE;
 }
 
-static void init_hrtick(void)
+static __init void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
-- 
cgit v1.2.3


From 006c75f146e58e080d2b2725a6664f71886e112b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 22 Sep 2008 14:55:46 -0700
Subject: sched: clarify ifdef tangle

- Add some comments to try to make the ifdef puzzle a bit clearer

- Explicitly inline one of the three init_hrtick() implementations.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ad9d39b021f8..927c9307cd00 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1102,7 +1102,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
 
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1121,7 +1121,7 @@ static void init_rq_hrtick(struct rq *rq)
 	rq->hrtick_timer.function = hrtick;
 	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
-#else
+#else	/* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1133,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif	/* CONFIG_SCHED_HRTICK */
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
-- 
cgit v1.2.3


From 3a72dc8eb5a7122fff439a22bd22486a4fff505c Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 22 Sep 2008 14:55:46 -0700
Subject: rcu: fix sparse shadowed variable warning

kernel/rcuclassic.c:564:18: warning: symbol 'flags' shadows an earlier one
kernel/rcuclassic.c:527:16: originally declared here

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 743cf0550ff4..ed15128ca2c9 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -561,15 +561,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 		local_irq_restore(flags);
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
-			unsigned long flags;
+			unsigned long flags2;
 
 			/* and start it/schedule start if it's a new batch */
-			spin_lock_irqsave(&rcp->lock, flags);
+			spin_lock_irqsave(&rcp->lock, flags2);
 			if (rcu_batch_after(rdp->batch, rcp->pending)) {
 				rcp->pending = rdp->batch;
 				rcu_start_batch(rcp);
 			}
-			spin_unlock_irqrestore(&rcp->lock, flags);
+			spin_unlock_irqrestore(&rcp->lock, flags2);
 		}
 	}
 
-- 
cgit v1.2.3


From 6441402b1f173fa38e561d3cee7c01c32e5281ad Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Sep 2008 18:46:37 +0200
Subject: clockevents: prevent cpu online to interfere with nohz

Impact: rare hang which can be triggered on CPU online.

tick_do_timer_cpu keeps track of the CPU which updates jiffies
via do_timer. The value -1 is used to signal, that currently no
CPU is doing this. There are two cases, where the variable can
have this state:

 boot:
    necessary for systems where the boot cpu id can be != 0

 nohz long idle sleep:
    When the CPU which did the jiffies update last goes into
    a long idle sleep it drops the update jiffies duty so
    another CPU which is not idle can pick it up and keep
    jiffies going.

Using the same value for both situations is wrong, as the CPU online
code can see the -1 state when the timer of the newly onlined CPU is
setup. The setup for a newly onlined CPU goes through periodic mode
and can pick up the do_timer duty without being aware of the nohz /
highres mode of the already running system.

Use two separate states and make them constants to avoid magic
numbers confusion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-common.c   | 7 ++++---
 kernel/time/tick-internal.h | 4 ++++
 kernel/time/tick-sched.c    | 8 ++++----
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 019315ebf9de..b523d095decf 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
  */
 ktime_t tick_next_period;
 ktime_t tick_period;
-int tick_do_timer_cpu __read_mostly = -1;
+int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 DEFINE_SPINLOCK(tick_device_lock);
 
 /*
@@ -148,7 +148,7 @@ static void tick_setup_device(struct tick_device *td,
 		 * If no cpu took the do_timer update, assign it to
 		 * this cpu:
 		 */
-		if (tick_do_timer_cpu == -1) {
+		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
 			tick_do_timer_cpu = cpu;
 			tick_next_period = ktime_get();
 			tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -300,7 +300,8 @@ static void tick_shutdown(unsigned int *cpup)
 	if (*cpup == tick_do_timer_cpu) {
 		int cpu = first_cpu(cpu_online_map);
 
-		tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
+		tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
+			TICK_DO_TIMER_NONE;
 	}
 	spin_unlock_irqrestore(&tick_device_lock, flags);
 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 6e9db9734aa6..e18014fadf95 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
 /*
  * tick internal variable and functions used by low/high res code
  */
+
+#define TICK_DO_TIMER_NONE	-1
+#define TICK_DO_TIMER_BOOT	-2
+
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 extern spinlock_t tick_device_lock;
 extern ktime_t tick_next_period;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a87b0468568b..31a14e8caac1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -221,7 +221,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	 */
 	if (unlikely(!cpu_online(cpu))) {
 		if (cpu == tick_do_timer_cpu)
-			tick_do_timer_cpu = -1;
+			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 	}
 
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -303,7 +303,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * invoked.
 		 */
 		if (cpu == tick_do_timer_cpu)
-			tick_do_timer_cpu = -1;
+			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 
 		ts->idle_sleeps++;
 
@@ -468,7 +468,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	 * this duty, then the jiffies update is still serialized by
 	 * xtime_lock.
 	 */
-	if (unlikely(tick_do_timer_cpu == -1))
+	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
 		tick_do_timer_cpu = cpu;
 
 	/* Check, if the jiffies need an update */
@@ -570,7 +570,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	 * this duty, then the jiffies update is still serialized by
 	 * xtime_lock.
 	 */
-	if (unlikely(tick_do_timer_cpu == -1))
+	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
 		tick_do_timer_cpu = cpu;
 #endif
 
-- 
cgit v1.2.3


From 49d670fb8dd62d3ed4e3ed2513538ea65b051aed Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Sep 2008 18:56:01 +0200
Subject: clockevents: prevent stale tick_next_period for onlining CPUs

Impact: possible hang on CPU onlining in timer one shot mode.

The tick_next_period variable is only used during boot on nohz/highres
enabled systems, but for CPU onlining it needs to be maintained when
the per cpu clock events device operates in one shot mode.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31a14e8caac1..39019b3f7621 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -75,6 +75,9 @@ static void tick_do_update_jiffies64(ktime_t now)
 							   incr * ticks);
 		}
 		do_timer(++ticks);
+
+		/* Keep the tick_next_period variable up to date */
+		tick_next_period = ktime_add(last_jiffies_update, tick_period);
 	}
 	write_sequnlock(&xtime_lock);
 }
-- 
cgit v1.2.3


From 302745699c1b675b5d2a1af87271de10e4d96b6a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Sep 2008 19:02:25 +0200
Subject: clockevents: check broadcast device not tick device

Impact: Possible hang on CPU online observed on AMD C1E machines.

The broadcast setup code looks at the mode of the tick device to
determine whether it needs to be shut down or setup. This is wrong
when the broadcast mode is set to one shot already. This can happen
when a CPU is brought online as it goes through the periodic setup
first.

The problem went unnoticed as sane systems do not call into that code
before the switch to one shot for the clock event device happens.
The AMD C1E idle routine switches over immediately and thereby shuts
down the just setup device before the first interrupt happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f1f3eee28113..e2b66d1c8ca5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -235,7 +235,7 @@ static void tick_do_broadcast_on_off(void *why)
 	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
 		if (!cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_set(cpu, tick_broadcast_mask);
-			if (td->mode == TICKDEV_MODE_PERIODIC)
+			if (bc->mode == TICKDEV_MODE_PERIODIC)
 				clockevents_shutdown(dev);
 		}
 		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
@@ -245,7 +245,7 @@ static void tick_do_broadcast_on_off(void *why)
 		if (!tick_broadcast_force &&
 		    cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_clear(cpu, tick_broadcast_mask);
-			if (td->mode == TICKDEV_MODE_PERIODIC)
+			if (bc->mode == TICKDEV_MODE_PERIODIC)
 				tick_setup_periodic(dev, 0);
 		}
 		break;
-- 
cgit v1.2.3


From 27ce4cb4a0c7cf59b9a9952266883862f2e4c99f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Sep 2008 19:04:02 +0200
Subject: clockevents: prevent mode mismatch on cpu online

Impact: timer hang on CPU online observed on AMD C1E systems

When a CPU is brought online then the broadcast machinery can
be in the one shot state already. Check this and setup the timer
device of the new CPU in one shot mode so the broadcast code
can pick up the next_event value correctly.

Another AMD C1E oddity, as we switch to broadcast immediately and
not after the full bring up via the ACPI cpu idle code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 8 ++++++++
 kernel/time/tick-common.c    | 3 ++-
 kernel/time/tick-internal.h  | 2 ++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e2b66d1c8ca5..bd7034542399 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -575,4 +575,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
+/*
+ * Check, whether the broadcast device is in one shot mode
+ */
+int tick_broadcast_oneshot_active(void)
+{
+	return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
+}
+
 #endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b523d095decf..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 	if (!tick_device_is_functional(dev))
 		return;
 
-	if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
+	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+	    !tick_broadcast_oneshot_active()) {
 		clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
 	} else {
 		unsigned long seq;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index e18014fadf95..55c3f4be6077 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -35,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
 extern void tick_broadcast_switch_to_oneshot(void);
 extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
 extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern int tick_broadcast_oneshot_active(void);
 # else /* BROADCAST */
 static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
@@ -43,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
 static inline void tick_broadcast_switch_to_oneshot(void) { }
 static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
 # endif /* !BROADCAST */
 
 #else /* !ONESHOT */
-- 
cgit v1.2.3


From f8e256c687eb53850685747757c8d75e58756e15 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Sep 2008 13:00:57 +0200
Subject: timers: fix build error in !oneshot case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 kernel/time/tick-common.c: In function ‘tick_setup_periodic’:
 kernel/time/tick-common.c:113: error: implicit declaration of function ‘tick_broadcast_oneshot_active’

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-internal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 55c3f4be6077..469248782c23 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -74,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
 	return 0;
 }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
 #endif /* !TICK_ONESHOT */
 
 /*
-- 
cgit v1.2.3


From 695698500912c4479ddf4723e492de3970ff8530 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 14:54:23 +0200
Subject: sched: rework wakeup preemption

Rework the wakeup preemption to work on real runtime instead of
the virtual runtime. This greatly simplifies the code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 133 ++--------------------------------------------------
 1 file changed, 4 insertions(+), 129 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3b89aa6594a9..c20899763457 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -408,64 +408,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return __sched_period(nr_running);
 }
 
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-	struct load_weight lw = {
-		.weight = NICE_0_LOAD,
-		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-	};
-
-	for_each_sched_entity(se) {
-		struct load_weight *se_lw = &se->load;
-		unsigned long rw = cfs_rq_of(se)->load.weight;
-
-#ifdef CONFIG_FAIR_SCHED_GROUP
-		struct cfs_rq *cfs_rq = se->my_q;
-		struct task_group *tg = NULL
-
-		if (cfs_rq)
-			tg = cfs_rq->tg;
-
-		if (tg && tg->shares < NICE_0_LOAD) {
-			/*
-			 * scale shares to what it would have been had
-			 * tg->weight been NICE_0_LOAD:
-			 *
-			 *   weight = 1024 * shares / tg->weight
-			 */
-			lw.weight *= se->load.weight;
-			lw.weight /= tg->shares;
-
-			lw.inv_weight = 0;
-
-			se_lw = &lw;
-			rw += lw.weight - se->load.weight;
-		} else
-#endif
-
-		if (se->load.weight < NICE_0_LOAD) {
-			se_lw = &lw;
-			rw += NICE_0_LOAD - se->load.weight;
-		}
-
-		delta = calc_delta_mine(delta, rw, se_lw);
-	}
-
-	return delta;
-}
-
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -1281,53 +1223,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * + nice tasks.
 	 */
 	if (sched_feat(ASYM_GRAN))
-		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
-	else
-		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+		gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
 
 	return gran;
 }
 
-/*
- * Should 'se' preempt 'curr'.
- *
- *             |s1
- *        |s2
- *   |s3
- *         g
- *      |<--->|c
- *
- *  w(c, s1) = -1
- *  w(c, s2) =  0
- *  w(c, s3) =  1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
-	s64 gran, vdiff = curr->vruntime - se->vruntime;
-
-	if (vdiff < 0)
-		return -1;
-
-	gran = wakeup_gran(curr);
-	if (vdiff > gran)
-		return 1;
-
-	return 0;
-}
-
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1336,7 +1236,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	int se_depth, pse_depth;
+	s64 delta_exec;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -1374,33 +1274,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 		return;
 	}
 
-	/*
-	 * preemption test can be made between sibling entities who are in the
-	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
-	 * both tasks until we find their ancestors who are siblings of common
-	 * parent.
-	 */
-
-	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(se);
-	pse_depth = depth_se(pse);
-
-	while (se_depth > pse_depth) {
-		se_depth--;
-		se = parent_entity(se);
-	}
-
-	while (pse_depth > se_depth) {
-		pse_depth--;
-		pse = parent_entity(pse);
-	}
-
-	while (!is_same_group(se, pse)) {
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
-
-	if (wakeup_preempt_entity(se, pse) == 1)
+	delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+	if (delta_exec > wakeup_gran(pse))
 		resched_task(curr);
 }
 
-- 
cgit v1.2.3


From 940959e93949e839c14f8ddc3b9b0e34a2ab6e29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:42 +0200
Subject: sched: fixlet for group load balance

We should not only correct the increment for the initial group, but should
be consistent and do so for all the groups we encounter.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c20899763457..0c59da7e3120 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1027,7 +1027,6 @@ static long effective_load(struct task_group *tg, int cpu,
 		long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	long more_w;
 
 	if (!tg->parent)
 		return wl;
@@ -1039,18 +1038,17 @@ static long effective_load(struct task_group *tg, int cpu,
 	if (!wl && sched_feat(ASYM_EFF_LOAD))
 		return wl;
 
-	/*
-	 * Instead of using this increment, also add the difference
-	 * between when the shares were last updated and now.
-	 */
-	more_w = se->my_q->load.weight - se->my_q->rq_weight;
-	wl += more_w;
-	wg += more_w;
-
 	for_each_sched_entity(se) {
-#define D(n) (likely(n) ? (n) : 1)
-
 		long S, rw, s, a, b;
+		long more_w;
+
+		/*
+		 * Instead of using this increment, also add the difference
+		 * between when the shares were last updated and now.
+		 */
+		more_w = se->my_q->load.weight - se->my_q->rq_weight;
+		wl += more_w;
+		wg += more_w;
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
@@ -1059,7 +1057,11 @@ static long effective_load(struct task_group *tg, int cpu,
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
 
-		wl = s*(a-b)/D(b);
+		wl = s*(a-b);
+
+		if (likely(b))
+			wl /= b;
+
 		/*
 		 * Assume the group is already running and will
 		 * thus already be accounted for in the weight.
@@ -1068,7 +1070,6 @@ static long effective_load(struct task_group *tg, int cpu,
 		 * alter the group weight.
 		 */
 		wg = 0;
-#undef D
 	}
 
 	return wl;
-- 
cgit v1.2.3


From 78333cdd0e472180743d35988e576d6ecc6f6ddb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:43 +0200
Subject: sched: add some comments to the bandwidth code

Hopefully clarify some of this code a little.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2e228bd5395e..d570a8cc4fcd 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
+/*
+ * We ran out of runtime, see if we can borrow some from our neighbours.
+ */
 static int do_balance_runtime(struct rt_rq *rt_rq)
 {
 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
 			continue;
 
 		spin_lock(&iter->rt_runtime_lock);
+		/*
+		 * Either all rqs have inf runtime and there's nothing to steal
+		 * or __disable_runtime() below sets a specific rq to inf to
+		 * indicate its been disabled and disalow stealing.
+		 */
 		if (iter->rt_runtime == RUNTIME_INF)
 			goto next;
 
+		/*
+		 * From runqueues with spare time, take 1/n part of their
+		 * spare time, but no more than our period.
+		 */
 		diff = iter->rt_runtime - iter->rt_time;
 		if (diff > 0) {
 			diff = div_u64((u64)diff, weight);
@@ -274,6 +286,9 @@ next:
 	return more;
 }
 
+/*
+ * Ensure this RQ takes back all the runtime it lend to its neighbours.
+ */
 static void __disable_runtime(struct rq *rq)
 {
 	struct root_domain *rd = rq->rd;
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
 
 		spin_lock(&rt_b->rt_runtime_lock);
 		spin_lock(&rt_rq->rt_runtime_lock);
+		/*
+		 * Either we're all inf and nobody needs to borrow, or we're
+		 * already disabled and thus have nothing to do, or we have
+		 * exactly the right amount of runtime to take out.
+		 */
 		if (rt_rq->rt_runtime == RUNTIME_INF ||
 				rt_rq->rt_runtime == rt_b->rt_runtime)
 			goto balanced;
 		spin_unlock(&rt_rq->rt_runtime_lock);
 
+		/*
+		 * Calculate the difference between what we started out with
+		 * and what we current have, that's the amount of runtime
+		 * we lend and now have to reclaim.
+		 */
 		want = rt_b->rt_runtime - rt_rq->rt_runtime;
 
+		/*
+		 * Greedy reclaim, take back as much as we can.
+		 */
 		for_each_cpu_mask(i, rd->span) {
 			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 			s64 diff;
 
+			/*
+			 * Can't reclaim from ourselves or disabled runqueues.
+			 */
 			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 				continue;
 
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
 		}
 
 		spin_lock(&rt_rq->rt_runtime_lock);
+		/*
+		 * We cannot be left wanting - that would mean some runtime
+		 * leaked out of the system.
+		 */
 		BUG_ON(want);
 balanced:
+		/*
+		 * Disable all the borrow logic by pretending we have inf
+		 * runtime - in which case borrowing doesn't make sense.
+		 */
 		rt_rq->rt_runtime = RUNTIME_INF;
 		spin_unlock(&rt_rq->rt_runtime_lock);
 		spin_unlock(&rt_b->rt_runtime_lock);
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
 	if (unlikely(!scheduler_running))
 		return;
 
+	/*
+	 * Reset each runqueue's bandwidth settings
+	 */
 	for_each_leaf_rt_rq(rt_rq, rq) {
 		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 
-- 
cgit v1.2.3


From 4653f803e6e0d970ffeac0efd2c01743eb6c5228 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:44 +0200
Subject: sched: more sanity checks on the bandwidth settings

While playing around with it, I noticed we missed some sanity checks.
Also add some comments while we're there.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 669c49aa57f0..e1299de1765e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8866,11 +8866,29 @@ static int tg_schedulable(struct task_group *tg, void *data)
 		runtime = d->rt_runtime;
 	}
 
+	/*
+	 * Cannot have more runtime than the period.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
+
+	/*
+	 * Ensure we don't starve existing RT tasks.
+	 */
 	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	total = to_ratio(period, runtime);
 
+	/*
+	 * Nobody can have more than the global setting allows.
+	 */
+	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+		return -EINVAL;
+
+	/*
+	 * The sum of our children's runtime should not exceed our own.
+	 */
 	list_for_each_entry_rcu(child, &tg->children, siblings) {
 		period = ktime_to_ns(child->rt_bandwidth.rt_period);
 		runtime = child->rt_bandwidth.rt_runtime;
@@ -8978,19 +8996,24 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
-	struct task_group *tg = &root_task_group;
-	u64 rt_runtime, rt_period;
+	u64 runtime, period;
 	int ret = 0;
 
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	runtime = global_rt_runtime();
+	period = global_rt_period();
+
+	/*
+	 * Sanity check on the sysctl variables.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	ret = __rt_schedulable(tg, rt_period, rt_runtime);
+	ret = __rt_schedulable(NULL, 0, 0);
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
-- 
cgit v1.2.3


From 57fdc26d4a734a3e00c6b2fc0e1e40ff8da4dc31 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:45 +0200
Subject: sched: fixup buddy selection

We should set the buddy even though we might already have the
TIF_RESCHED flag set.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c59da7e3120..e3f3c10f7033 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1249,6 +1249,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (unlikely(se == pse))
 		return;
 
+	cfs_rq_of(pse)->next = pse;
+
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
 	 * wake up path.
@@ -1256,8 +1258,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (test_tsk_need_resched(curr))
 		return;
 
-	cfs_rq_of(pse)->next = pse;
-
 	/*
 	 * Batch tasks do not preempt (their preemption is driven by
 	 * the tick):
-- 
cgit v1.2.3


From f9092f358bc2ec5367621478811f046f82873376 Mon Sep 17 00:00:00 2001
From: Jonathan Steel <jon.steel@esentire.com>
Date: Mon, 22 Sep 2008 13:57:45 -0700
Subject: kexec: fix segmentation fault in kimage_add_entry

A segmentation fault can occur in kimage_add_entry in kexec.c when loading
a kernel image into memory.  The fault occurs because a page is requested
by calling kimage_alloc_page with gfp_mask GFP_KERNEL and the function may
actually return a page with gfp_mask GFP_HIGHUSER.  The high mem page is
returned because it was swapped with the kernel page due to the kernel
page being a page that will shortly be copied to.

This patch ensures that kimage_alloc_page returns a page that was created
with the correct gfp flags.

I have verified the change and fixed the whitespace damage of the original
patch.  Jonathan did a great job of tracking this down after he hit the
problem.  -- Eric

Signed-off-by: Jonathan Steel <jon.steel@esentire.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f3f0df35d4..aef265325cd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
 			*old = addr | (*old & ~PAGE_MASK);
 
 			/* The old page I have found cannot be a
-			 * destination page, so return it.
+			 * destination page, so return it if it's
+			 * gfp_flags honor the ones passed in.
 			 */
+			if (!(gfp_mask & __GFP_HIGHMEM) &&
+			    PageHighMem(old_page)) {
+				kimage_free_pages(old_page);
+				continue;
+			}
 			addr = old_addr;
 			page = old_page;
 			break;
-- 
cgit v1.2.3


From b87f17242da6b2ac6db2d179b2f93fb84cff2fbe Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Thu, 25 Sep 2008 09:53:54 +0530
Subject: sched: maintain only task entities in cfs_rq->tasks list

cfs_rq->tasks list is used by the load balancer to iterate
over all the tasks. Currently it holds all the entities
(both task and group entities) because of which there is
a need to check for group entities explicitly during load
balancing. This patch changes the cfs_rq->tasks list to
hold only task entities.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e3f3c10f7033..95c1295ad26d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -528,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_add(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		add_cfs_task_weight(cfs_rq, se->load.weight);
+		list_add(&se->group_node, &cfs_rq->tasks);
+	}
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
-	list_add(&se->group_node, &cfs_rq->tasks);
 }
 
 static void
@@ -541,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		add_cfs_task_weight(cfs_rq, -se->load.weight);
+		list_del_init(&se->group_node);
+	}
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
-	list_del_init(&se->group_node);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1335,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 	if (next == &cfs_rq->tasks)
 		return NULL;
 
-	/* Skip over entities that are not tasks */
-	do {
-		se = list_entry(next, struct sched_entity, group_node);
-		next = next->next;
-	} while (next != &cfs_rq->tasks && !entity_is_task(se));
-
-	if (next == &cfs_rq->tasks && !entity_is_task(se))
-		return NULL;
-
-	cfs_rq->balance_iterator = next;
-
-	if (entity_is_task(se))
-		p = task_of(se);
+	se = list_entry(next, struct sched_entity, group_node);
+	p = task_of(se);
+	cfs_rq->balance_iterator = next->next;
 
 	return p;
 }
-- 
cgit v1.2.3


From 18d6522b86d21a04c8ac1ea79747e2e434a956d9 Mon Sep 17 00:00:00 2001
From: Atsuo Igarashi <atsuo_igarashi@tripeaks.co.jp>
Date: Fri, 26 Sep 2008 10:36:41 -0500
Subject: kgdb: could not write to the last of valid memory with kgdb

On the ARM architecture, kgdb will crash the kernel if the last byte
of valid memory is written due to a flush_icache_range flushing
beyond the memory boundary.

Signed-off-by: Atsuo Igarashi <atsuo_igarashi@tripeaks.co.jp>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/kgdb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index eaa21fc9ad1d..949806ab67de 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -488,7 +488,7 @@ static int write_mem_msg(int binary)
 		if (err)
 			return err;
 		if (CACHE_FLUSH_IS_SAFE)
-			flush_icache_range(addr, addr + length + 1);
+			flush_icache_range(addr, addr + length);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From d7161a65341556bacb5e6654e133803f46f51063 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Fri, 26 Sep 2008 10:36:41 -0500
Subject: kgdb, x86, arm, mips, powerpc: ignore user space single stepping

On the x86 arch, user space single step exceptions should be ignored
if they occur in the kernel space, such as ptrace stepping through a
system call.

First check if it is kgdb that is executing a single step, then ensure
it is not an accidental traversal into the user space, while in kgdb,
any other time the TIF_SINGLESTEP is set, kgdb should ignore the
exception.

On x86, arm, mips and powerpc, the kgdb_contthread usage was
inconsistent with the way single stepping is implemented in the kgdb
core.  The arch specific stub should always set the
kgdb_cpu_doing_single_step correctly if it is single stepping.  This
allows kgdb to correctly process an instruction steps if ptrace
happens to be requesting an instruction step over a system call.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/kgdb.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 949806ab67de..25d955dbb989 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1462,7 +1462,7 @@ acquirelock:
 	 * Get the passive CPU lock which will hold all the non-primary
 	 * CPU in a spin state while the debugger is active
 	 */
-	if (!kgdb_single_step || !kgdb_contthread) {
+	if (!kgdb_single_step) {
 		for (i = 0; i < NR_CPUS; i++)
 			atomic_set(&passive_cpu_wait[i], 1);
 	}
@@ -1475,7 +1475,7 @@ acquirelock:
 
 #ifdef CONFIG_SMP
 	/* Signal the other CPUs to enter kgdb_wait() */
-	if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
+	if ((!kgdb_single_step) && kgdb_do_roundup)
 		kgdb_roundup_cpus(flags);
 #endif
 
@@ -1494,7 +1494,7 @@ acquirelock:
 	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
 	kgdb_deactivate_sw_breakpoints();
 	kgdb_single_step = 0;
-	kgdb_contthread = NULL;
+	kgdb_contthread = current;
 	exception_level = 0;
 
 	/* Talk to debugger with gdbserial protocol */
@@ -1508,7 +1508,7 @@ acquirelock:
 	kgdb_info[ks->cpu].task = NULL;
 	atomic_set(&cpu_in_kgdb[ks->cpu], 0);
 
-	if (!kgdb_single_step || !kgdb_contthread) {
+	if (!kgdb_single_step) {
 		for (i = NR_CPUS-1; i >= 0; i--)
 			atomic_set(&passive_cpu_wait[i], 0);
 		/*
-- 
cgit v1.2.3


From 7659e349672bb0d378ef8d7d62bae4c53d2bdd18 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Sep 2008 14:06:45 +0200
Subject: hrtimer: migrate pending list on cpu offline

Impact: hrtimers which are on the pending list are not migrated at cpu
	offline and can be stale forever

Add the pending list migration when CONFIG_HIGH_RES_TIMERS is enabled

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..580bc66ae136 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1610,10 +1610,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+				   struct hrtimer_cpu_base *new_base)
+{
+	struct hrtimer *timer;
+	int raise = 0;
+
+	while (!list_empty(&old_base->cb_pending)) {
+		timer = list_entry(old_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+		timer->base = &new_base->clock_base[timer->base->index];
+		list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+		raise = 1;
+	}
+	return raise;
+}
+#else
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+				   struct hrtimer_cpu_base *new_base)
+{
+	return 0;
+}
+#endif
+
 static void migrate_hrtimers(int cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
+	int i, raise = 0;
 
 	BUG_ON(cpu_online(cpu));
 	old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1630,10 +1656,16 @@ static void migrate_hrtimers(int cpu)
 				     &new_base->clock_base[i]);
 	}
 
+	if (migrate_hrtimer_pending(old_base, new_base))
+		raise = 1;
+
 	spin_unlock(&old_base->lock);
 	spin_unlock(&new_base->lock);
 	local_irq_enable();
 	put_cpu_var(hrtimer_bases);
+
+	if (raise)
+		hrtimer_raise_softirq();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-- 
cgit v1.2.3


From 41e1022eae71707f1ce6801a746f70b1e57b7567 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Sep 2008 14:09:39 +0200
Subject: hrtimer: fix migration of CB_IRQSAFE_NO_SOFTIRQ hrtimers

Impact: Stale timers after a CPU went offline.

commit 37bb6cb4097e29ffee970065b74499cbf10603a3
       hrtimer: unlock hrtimer_wakeup

changed the hrtimer sleeper callback mode to CB_IRQSAFE_NO_SOFTIRQ due
to locking problems. A result of this change is that when enqueue is
called for an already expired hrtimer the callback function is not
longer called directly from the enqueue code. The normal callers have
been fixed in the code, but the migration code which moves hrtimers
from a dead CPU to a live CPU was not made aware of this.

This can be fixed by checking the timer state after the call to
enqueue in the migration code.


Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 580bc66ae136..ac2f6d6d4868 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1591,11 +1591,12 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 				struct hrtimer_clock_base *new_base)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
+	int raise = 0;
 
 	while ((node = rb_first(&old_base->active))) {
 		timer = rb_entry(node, struct hrtimer, node);
@@ -1607,7 +1608,27 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * Enqueue the timer. Allow reprogramming of the event device
 		 */
 		enqueue_hrtimer(timer, new_base, 1);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+		/*
+		 * Happens with high res enabled when the timer was
+		 * already expired and the callback mode is
+		 * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
+		 * (hrtimer_sleeper). The enqueue code does not move
+		 * them to the soft irq pending list for
+		 * performance/latency reasons, but in the migration
+		 * state, we need to do that otherwise we end up with
+		 * a stale timer.
+		 */
+		if (timer->state == HRTIMER_STATE_INACTIVE) {
+			timer->state = HRTIMER_STATE_PENDING;
+			list_add_tail(&timer->cb_entry,
+				      &new_base->cpu_base->cb_pending);
+			raise = 1;
+		}
+#endif
 	}
+	return raise;
 }
 
 #ifdef CONFIG_HIGH_RES_TIMERS
@@ -1652,8 +1673,9 @@ static void migrate_hrtimers(int cpu)
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		migrate_hrtimer_list(&old_base->clock_base[i],
-				     &new_base->clock_base[i]);
+		if (migrate_hrtimer_list(&old_base->clock_base[i],
+					 &new_base->clock_base[i]))
+			raise = 1;
 	}
 
 	if (migrate_hrtimer_pending(old_base, new_base))
-- 
cgit v1.2.3


From b00c1a99e7758f794923c61e5cd55268d61c9469 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Sep 2008 15:44:46 +0200
Subject: hrtimer: mark migration state

Impact: during migration active hrtimers can be seen as inactive

The migration code removes the hrtimers from the queues of the dead
CPU and sets the state temporary to INACTIVE. The enqueue code sets it
to ACTIVE/PENDING again.

Prevent that the wrong state can be seen by using a separate migration
state bit.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ac2f6d6d4868..ace723dd1e52 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1602,7 +1602,13 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		timer = rb_entry(node, struct hrtimer, node);
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+
+		/*
+		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * timer could be seen as !active and just vanish away
+		 * under us on another CPU
+		 */
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timer. Allow reprogramming of the event device
@@ -1620,13 +1626,15 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * state, we need to do that otherwise we end up with
 		 * a stale timer.
 		 */
-		if (timer->state == HRTIMER_STATE_INACTIVE) {
+		if (timer->state == HRTIMER_STATE_MIGRATE) {
 			timer->state = HRTIMER_STATE_PENDING;
 			list_add_tail(&timer->cb_entry,
 				      &new_base->cpu_base->cb_pending);
 			raise = 1;
 		}
 #endif
+		/* Clear the migration state bit */
+		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 	return raise;
 }
-- 
cgit v1.2.3


From ccc7dadf736639da86f3e0c86832c11a66fc8221 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Sep 2008 15:47:42 +0200
Subject: hrtimer: prevent migration of per CPU hrtimers

Impact: per CPU hrtimers can be migrated from a dead CPU

The hrtimer code has no knowledge about per CPU timers, but we need to
prevent the migration of such timers and warn when such a timer is
active at migration time.

Explicitely mark the timers as per CPU and use a more understandable
mode descriptor for the interrupts safe unlocked callback mode, which
is used by hrtimer_sleeper and the scheduler code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c             | 37 +++++++++++++++++++++++++------------
 kernel/sched.c               |  4 ++--
 kernel/time/tick-sched.c     |  2 +-
 kernel/trace/trace_sysprof.c |  2 +-
 4 files changed, 29 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ace723dd1e52..cdec83e722fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 			 */
 			BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
 			return 1;
-		case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+		case HRTIMER_CB_IRQSAFE_PERCPU:
+		case HRTIMER_CB_IRQSAFE_UNLOCKED:
 			/*
 			 * This is solely for the sched tick emulation with
 			 * dynamic tick support to ensure that we do not
 			 * restart the tick right on the edge and end up with
 			 * the tick timer in the softirq ! The calling site
-			 * takes care of this.
+			 * takes care of this. Also used for hrtimer sleeper !
 			 */
 			debug_hrtimer_deactivate(timer);
 			return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
 	timer_stats_account_hrtimer(timer);
 
 	fn = timer->function;
-	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+	    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
 		/*
 		 * Used for scheduler timers, avoid lock inversion with
 		 * rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 #endif
 }
 
@@ -1592,7 +1594,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 #ifdef CONFIG_HOTPLUG_CPU
 
 static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				struct hrtimer_clock_base *new_base)
+				struct hrtimer_clock_base *new_base, int dcpu)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
@@ -1603,6 +1605,18 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
 
+		/*
+		 * Should not happen. Per CPU timers should be
+		 * canceled _before_ the migration code is called
+		 */
+		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+			__remove_hrtimer(timer, old_base,
+					 HRTIMER_STATE_INACTIVE, 0);
+			WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+			     timer, timer->function, dcpu);
+			continue;
+		}
+
 		/*
 		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
@@ -1619,12 +1633,11 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		/*
 		 * Happens with high res enabled when the timer was
 		 * already expired and the callback mode is
-		 * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
-		 * (hrtimer_sleeper). The enqueue code does not move
-		 * them to the soft irq pending list for
-		 * performance/latency reasons, but in the migration
-		 * state, we need to do that otherwise we end up with
-		 * a stale timer.
+		 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+		 * enqueue code does not move them to the soft irq
+		 * pending list for performance/latency reasons, but
+		 * in the migration state, we need to do that
+		 * otherwise we end up with a stale timer.
 		 */
 		if (timer->state == HRTIMER_STATE_MIGRATE) {
 			timer->state = HRTIMER_STATE_PENDING;
@@ -1682,7 +1695,7 @@ static void migrate_hrtimers(int cpu)
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		if (migrate_hrtimer_list(&old_base->clock_base[i],
-					 &new_base->clock_base[i]))
+					 &new_base->clock_base[i], cpu))
 			raise = 1;
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 13dd2db9fb2d..ad1962dc0aa2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
@@ -1119,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq)
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 39019b3f7621..cb02324bdb88 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -625,7 +625,7 @@ void tick_setup_sched_timer(void)
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	ts->sched_timer.function = tick_sched_timer;
-	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	/* Get the next period (per cpu) */
 	ts->sched_timer.expires = tick_init_jiffy_update();
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
 
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
-	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
 }
-- 
cgit v1.2.3


From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Sun, 28 Sep 2008 23:09:31 +0100
Subject: mm owner: fix race between swapoff and exit

There's a race between mm->owner assignment and swapoff, more easily
seen when task slab poisoning is turned on.  The condition occurs when
try_to_unuse() runs in parallel with an exiting task.  A similar race
can occur with callers of get_task_mm(), such as /proc/<pid>/<mmstats>
or ptrace or page migration.

CPU0                                    CPU1
                                        try_to_unuse
                                        looks at mm = task0->mm
                                        increments mm->mm_users
task 0 exits
mm->owner needs to be updated, but no
new owner is found (mm_users > 1, but
no other task has task->mm = task0->mm)
mm_update_next_owner() leaves
                                        mmput(mm) decrements mm->mm_users
task0 freed
                                        dereferencing mm->owner fails

The fix is to notify the subsystem via mm_owner_changed callback(),
if no new owner is found, by specifying the new task as NULL.

Jiri Slaby:
mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but
must be set after that, so as not to pass NULL as old owner causing oops.

Daisuke Nishimura:
mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task()
and its callers need to take account of this situation to avoid oops.

Hugh Dickins:
Lockdep warning and hang below exec_mmap() when testing these patches.
exit_mm() up_reads mmap_sem before calling mm_update_next_owner(),
so exec_mmap() now needs to do the same.  And with that repositioning,
there's now no point in mm_need_new_owner() allowing for NULL mm.

Reported-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c |  5 +++--
 kernel/exit.c   | 12 ++++++++++--
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13932abde159..a0123d75ec9a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
  */
 void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
 {
-	struct cgroup *oldcgrp, *newcgrp;
+	struct cgroup *oldcgrp, *newcgrp = NULL;
 
 	if (need_mm_owner_callback) {
 		int i;
 		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			oldcgrp = task_cgroup(old, ss->subsys_id);
-			newcgrp = task_cgroup(new, ss->subsys_id);
+			if (new)
+				newcgrp = task_cgroup(new, ss->subsys_id);
 			if (oldcgrp == newcgrp)
 				continue;
 			if (ss->mm_owner_changed)
diff --git a/kernel/exit.c b/kernel/exit.c
index 16395644a98f..85a83c831856 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
 	 * If there are other users of the mm and the owner (us) is exiting
 	 * we need to find a new owner to take on the responsibility.
 	 */
-	if (!mm)
-		return 0;
 	if (atomic_read(&mm->mm_users) <= 1)
 		return 0;
 	if (mm->owner != p)
@@ -627,6 +625,16 @@ retry:
 	} while_each_thread(g, c);
 
 	read_unlock(&tasklist_lock);
+	/*
+	 * We found no owner yet mm_users > 1: this implies that we are
+	 * most likely racing with swapoff (try_to_unuse()) or /proc or
+	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
+	 * so that subsystems can understand the callback and take action.
+	 */
+	down_write(&mm->mmap_sem);
+	cgroup_mm_owner_callbacks(mm->owner, NULL);
+	mm->owner = NULL;
+	up_write(&mm->mmap_sem);
 	return;
 
 assign_new_owner:
-- 
cgit v1.2.3


From bfcd17a6c5529bc37234cfa720a047cf9397bcfc Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Wed, 6 Aug 2008 15:12:22 +0200
Subject: Configure out file locking features

This patch adds the CONFIG_FILE_LOCKING option which allows to remove
support for advisory locks. With this patch enabled, the flock()
system call, the F_GETLK, F_SETLK and F_SETLKW operations of fcntl()
and NFS support are disabled. These features are not necessarly needed
on embedded systems. It allows to save ~11 Kb of kernel code and data:

   text          data     bss     dec     hex filename
1125436        118764  212992 1457192  163c28 vmlinux.old
1114299        118564  212992 1445855  160fdf vmlinux
 -11137    -200       0  -11337   -2C49 +/-

This patch has originally been written by Matt Mackall
<mpm@selenic.com>, and is part of the Linux Tiny project.

Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: matthew@wil.cx
Cc: linux-fsdevel@vger.kernel.org
Cc: mpm@selenic.com
Cc: akpm@linux-foundation.org
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 kernel/sys_ni.c | 1 +
 kernel/sysctl.c | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 08d6e1bb99ac..503d8d4eb80a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -125,6 +125,7 @@ cond_syscall(sys_vm86old);
 cond_syscall(sys_vm86);
 cond_syscall(compat_sys_ipc);
 cond_syscall(compat_sys_sysctl);
+cond_syscall(sys_flock);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50ec0886fa3d..4588b2cf2ecb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,7 +97,7 @@ static int sixty = 60;
 static int neg_one = -1;
 #endif
 
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
 static int two = 2;
 #endif
 
@@ -1261,6 +1261,7 @@ static struct ctl_table fs_table[] = {
 		.extra1		= &minolduid,
 		.extra2		= &maxolduid,
 	},
+#ifdef CONFIG_FILE_LOCKING
 	{
 		.ctl_name	= FS_LEASES,
 		.procname	= "leases-enable",
@@ -1269,6 +1270,7 @@ static struct ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#endif
 #ifdef CONFIG_DNOTIFY
 	{
 		.ctl_name	= FS_DIR_NOTIFY,
@@ -1280,6 +1282,7 @@ static struct ctl_table fs_table[] = {
 	},
 #endif
 #ifdef CONFIG_MMU
+#ifdef CONFIG_FILE_LOCKING
 	{
 		.ctl_name	= FS_LEASE_TIME,
 		.procname	= "lease-break-time",
@@ -1291,6 +1294,7 @@ static struct ctl_table fs_table[] = {
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
+#endif
 	{
 		.procname	= "aio-nr",
 		.data		= &aio_nr,
-- 
cgit v1.2.3


From 64b9e0294d24a4204232e13e01630b0690e48d61 Mon Sep 17 00:00:00 2001
From: "Amit K. Arora" <aarora@linux.vnet.ibm.com>
Date: Tue, 30 Sep 2008 17:15:39 +0530
Subject: sched: minor optimizations in wake_affine and select_task_rq_fair

This patch does following:
o Removes unused variable and argument "rq".
o Optimizes one of the "if" conditions in wake_affine() - i.e.  if
  "balanced" is true, we need not do rest of the calculations in the
  condition.
o If this cpu is same as the previous cpu (on which woken up task
  was running when it went to sleep), no need to call wake_affine at all.

Signed-off-by: Amit K Arora <aarora@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 95c1295ad26d..fcbe850a5a90 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1088,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 #endif
 
 static int
-wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
 	    int idx, unsigned long load, unsigned long this_load,
 	    unsigned int imbalance)
@@ -1136,8 +1136,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
-			balanced) {
+	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
+			tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
@@ -1156,16 +1156,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	struct sched_domain *sd, *this_sd = NULL;
 	int prev_cpu, this_cpu, new_cpu;
 	unsigned long load, this_load;
-	struct rq *rq, *this_rq;
+	struct rq *this_rq;
 	unsigned int imbalance;
 	int idx;
 
 	prev_cpu	= task_cpu(p);
-	rq		= task_rq(p);
 	this_cpu	= smp_processor_id();
 	this_rq		= cpu_rq(this_cpu);
 	new_cpu		= prev_cpu;
 
+	if (prev_cpu == this_cpu)
+		goto out;
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
@@ -1193,13 +1194,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	load = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
-	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
 				     load, this_load, imbalance))
 		return this_cpu;
 
-	if (prev_cpu == this_cpu)
-		goto out;
-
 	/*
 	 * Start passive balancing when half the imbalance_pct
 	 * limit is reached.
-- 
cgit v1.2.3


From aa94fbd5ccd840c8ab26d02439ec799b03a72547 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 2 Oct 2008 14:50:14 -0700
Subject: fix error-path NULL deref in alloc_posix_timer()

Found by static checker (http://repo.or.cz/w/smatch.git).

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/posix-timers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..5131e5471169 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void)
 		return tmr;
 	if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
 		kmem_cache_free(posix_timers_cache, tmr);
-		tmr = NULL;
+		return NULL;
 	}
 	memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
 	return tmr;
-- 
cgit v1.2.3


From 2133b5d7ff531bc15a923db4a6a50bf96c561be9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 2 Oct 2008 16:06:39 -0700
Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU

This patch adds stalled-CPU detection to Classic RCU.  This capability
is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which
defaults disabled.

This is a debugging feature to detect infinite loops in kernel code, not
something that non-kernel-hackers would be expected to care about.

This feature can detect looping CPUs in !PREEMPT builds and looping CPUs
with preemption disabled in PREEMPT builds.  This is essentially a port of
this functionality from the treercu patch, replacing the stall debug patch
that is already in tip/core/rcu (commit 67182ae1c4).

The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies to
avoid spurious warnings, and printing a boot message when this feature
is enabled.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 166 +++++++++++++++++++++++++++-------------------------
 1 file changed, 86 insertions(+), 80 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index ed15128ca2c9..0d07e6e51578 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
 	}
 }
 
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+	rcp->gp_start = jiffies;
+	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock_irqsave(&rcp->lock, flags);
+	delta = jiffies - rcp->jiffies_stall;
+	if (delta < 2 || rcp->cur != rcp->completed) {
+		spin_unlock_irqrestore(&rcp->lock, flags);
+		return;
+	}
+	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for_each_possible_cpu(cpu) {
+		if (cpu_isset(cpu, rcp->cpumask))
+			printk(" %d", cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+			smp_processor_id(), jiffies,
+			jiffies - rcp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rcp->lock, flags);
+	if ((long)(jiffies - rcp->jiffies_stall) >= 0)
+		rcp->jiffies_stall =
+			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	long delta;
+
+	delta = jiffies - rcp->jiffies_stall;
+	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rcp);
+
+	} else if (rcp->cur != rcp->completed && delta >= 2) {
+
+		/* They had two seconds to dump stack, so complain. */
+		print_other_cpu_stall(rcp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
  *   period (if necessary).
  */
 
-#ifdef CONFIG_DEBUG_RCU_STALL
-
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
-{
-	rcp->gp_check = get_seconds() + 3;
-}
-
-static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	int cpu;
-	long delta;
-	unsigned long flags;
-
-	/* Only let one CPU complain about others per time interval. */
-
-	spin_lock_irqsave(&rcp->lock, flags);
-	delta = get_seconds() - rcp->gp_check;
-	if (delta < 2L || cpus_empty(rcp->cpumask)) {
-		spin_unlock(&rcp->lock);
-		return;
-	}
-	rcp->gp_check = get_seconds() + 30;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-
-	/* OK, time to rat on our buddy... */
-
-	printk(KERN_ERR "RCU detected CPU stalls:");
-	for_each_cpu_mask(cpu, rcp->cpumask)
-		printk(" %d", cpu);
-	printk(" (detected by %d, t=%lu/%lu)\n",
-	       smp_processor_id(), get_seconds(), rcp->gp_check);
-}
-
-static void print_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	unsigned long flags;
-
-	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
-			smp_processor_id(), get_seconds(), rcp->gp_check);
-	dump_stack();
-	spin_lock_irqsave(&rcp->lock, flags);
-	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
-		rcp->gp_check = get_seconds() + 30;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	long delta;
-
-	delta = get_seconds() - rcp->gp_check;
-	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
-
-		/* We haven't checked in, so go dump stack. */
-
-		print_cpu_stall(rcp);
-
-	} else {
-		if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
-			/* They had two seconds to dump stack, so complain. */
-			print_other_cpu_stall(rcp);
-		}
-	}
-}
-
-#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
-
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
-{
-}
-
-static inline void
-check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
-
 /*
  * Register a new batch of callbacks, and start it up if there is currently no
  * active batch and the batch to be registered has not already occurred.
@@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 	if (rcp->cur != rcp->pending &&
 			rcp->completed == rcp->cur) {
 		rcp->cur++;
-		record_gp_check_time(rcp);
+		record_gp_stall_check_time(rcp);
 
 		/*
 		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
 	/* Check for CPU stalls, if enabled. */
-	check_cpu_stall(rcp, rdp);
+	check_cpu_stall(rcp);
 
 	if (rdp->nxtlist) {
 		long completed_snap = ACCESS_ONCE(rcp->completed);
@@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
  */
 void __init __rcu_init(void)
 {
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	/* Register notifier for non-boot CPUs */
-- 
cgit v1.2.3


From 2ec2b482b10a1ed3493c224f1893cddd3d33833b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Oct 2008 10:41:00 +0200
Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU, fix

fix the !CONFIG_RCU_CPU_STALL_DETECTOR path:

 kernel/rcuclassic.c: In function '__rcu_pending':
 kernel/rcuclassic.c:609: error: too few arguments to function 'check_cpu_stall'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 0d07e6e51578..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -239,7 +239,7 @@ static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
 {
 }
 
-static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 }
 
-- 
cgit v1.2.3


From d294eb83d8d39a29f01dad391f15fc3a29aa04f9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 3 Oct 2008 12:10:10 +0200
Subject: cpusets: scan_for_empty_cpusets(), cpuset doesn't seem to be so const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a warning on latest -tip:

 kernel/cpuset.c: Dans la fonction «scan_for_empty_cpusets» :
 kernel/cpuset.c:1932: attention : passing argument 1 of «list_add_tail» discards qualifiers from pointer target type

Actually the struct cpuset *root passed in parameter to scan_for_empty_cpusets
is not supposed to be const since an entry is added on the tail of its list.
Just correct the qualifier.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 827cd9adccb2..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * that has tasks along with an empty 'mems'.  But if we did see such
  * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
  */
-static void scan_for_empty_cpusets(const struct cpuset *root)
+static void scan_for_empty_cpusets(struct cpuset *root)
 {
 	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
-- 
cgit v1.2.3


From 07454bfff151d2465ada809bbaddf3548cc1097c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 4 Oct 2008 10:51:07 +0200
Subject: clockevents: check broadcast tick device not the clock events device

Impact: jiffies increment too fast.

Hugh Dickins noted that with NOHZ=n and HIGHRES=n jiffies get
incremented too fast. The reason is a wrong check in the broadcast
enter/exit code, which keeps the local apic timer in periodic mode
when the switch happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index bd7034542399..cb01cd8f919b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -235,7 +235,8 @@ static void tick_do_broadcast_on_off(void *why)
 	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
 		if (!cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_set(cpu, tick_broadcast_mask);
-			if (bc->mode == TICKDEV_MODE_PERIODIC)
+			if (tick_broadcast_device.mode ==
+			    TICKDEV_MODE_PERIODIC)
 				clockevents_shutdown(dev);
 		}
 		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
@@ -245,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why)
 		if (!tick_broadcast_force &&
 		    cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_clear(cpu, tick_broadcast_mask);
-			if (bc->mode == TICKDEV_MODE_PERIODIC)
+			if (tick_broadcast_device.mode ==
+			    TICKDEV_MODE_PERIODIC)
 				tick_setup_periodic(dev, 0);
 		}
 		break;
-- 
cgit v1.2.3


From f6121f4f8708195e88cbdf8dd8d171b226b3f858 Mon Sep 17 00:00:00 2001
From: Dario Faggioli <raistlin@linux.it>
Date: Fri, 3 Oct 2008 17:40:46 +0200
Subject: sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq

While working on the new version of the code for SCHED_SPORADIC I
noticed something strange in the present throttling mechanism. More
specifically in the throttling timer handler in sched_rt.c
(do_sched_rt_period_timer()) and in rt_rq_enqueue().

The problem is that, when unthrottling a runqueue, rt_rq_enqueue() only
asks for rescheduling if the runqueue has a sched_entity associated to
it (i.e., rt_rq->rt_se != NULL).
Now, if the runqueue is the root rq (which has a rt_se = NULL)
rescheduling does not take place, and it is delayed to some undefined
instant in the future.

This imply some random bandwidth usage by the RT tasks under throttling.
For instance, setting rt_runtime_us/rt_period_us = 950ms/1000ms an RT
task will get less than 95%. In our tests we got something varying
between 70% to 95%.
Using smaller time values, e.g., 95ms/100ms, things are even worse, and
I can see values also going down to 20-25%!!

The tests we performed are simply running 'yes' as a SCHED_FIFO task,
and checking the CPU usage with top, but we can investigate thoroughly
if you think it is needed.

Things go much better, for us, with the attached patch... Don't know if
it is the best approach, but it solved the issue for us.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <trimarchimichael@yahoo.it>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d570a8cc4fcd..cdf5740ab03e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
+	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
-	if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
-		struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
-
-		enqueue_rt_entity(rt_se);
+	if (rt_rq->rt_nr_running) {
+		if (rt_se && !on_rt_rq(rt_se))
+			enqueue_rt_entity(rt_se);
 		if (rt_rq->highest_prio < curr->prio)
 			resched_task(curr);
 	}
-- 
cgit v1.2.3


From 34b3ede2353604ec9861c1d900b2a835ff85de47 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 6 Oct 2008 09:27:00 +0800
Subject: sched: remove redundant code in cpu_cgroup_create()

css will be initialized by cgroup core.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2caedc47e764..9715f4ce6cfe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9088,7 +9088,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
-		init_task_group.css.cgroup = cgrp;
 		return &init_task_group.css;
 	}
 
@@ -9097,9 +9096,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
-	/* Bind the cgroup to task_group object we just created */
-	tg->css.cgroup = cgrp;
-
 	return &tg->css;
 }
 
-- 
cgit v1.2.3


From cc1e0f4f7ad95a9eb81e1904cb16068af226180d Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Mon, 6 Oct 2008 13:50:59 -0500
Subject: kgdb: call touch_softlockup_watchdog on resume

The softlockup watchdog needs to be touched when resuming the from the
kgdb stopped state to avoid the printk that a CPU is stuck if the
debugger was active for longer than the softlockup threshold.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/kgdb.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 25d955dbb989..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
 
 	/* Signal the primary CPU that we are done: */
 	atomic_set(&cpu_in_kgdb[cpu], 0);
+	touch_softlockup_watchdog();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 }
@@ -1432,6 +1433,7 @@ acquirelock:
 	    atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
 
 		atomic_set(&kgdb_active, -1);
+		touch_softlockup_watchdog();
 		clocksource_touch_watchdog();
 		local_irq_restore(flags);
 
@@ -1524,6 +1526,7 @@ acquirelock:
 kgdb_restore:
 	/* Free kgdb_active */
 	atomic_set(&kgdb_active, -1);
+	touch_softlockup_watchdog();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 
-- 
cgit v1.2.3


From 2fb7635c4cea310992a39580133099dd99ad151c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Oct 2008 09:16:04 +0200
Subject: sched: sync wakeups vs avg_overlap

While looking at the code I wondered why we always do:

  sync && avg_overlap < migration_cost

Which is a bit odd, since the overlap test was meant to detect sync wakeups
so using it to specialize sync wakeups doesn't make much sense.

Hence change the code to do:

  sync || avg_overlap < migration_cost

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fcbe850a5a90..18fd17172eb6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1103,6 +1103,11 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
+	if (!sync && sched_feat(SYNC_WAKEUPS) &&
+	    curr->se.avg_overlap < sysctl_sched_migration_cost &&
+	    p->se.avg_overlap < sysctl_sched_migration_cost)
+		sync = 1;
+
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
@@ -1127,11 +1132,8 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && balanced) {
-		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-		    p->se.avg_overlap < sysctl_sched_migration_cost)
-			return 1;
-	}
+	if (sync && balanced)
+		return 1;
 
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
@@ -1268,9 +1270,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	if (sched_feat(WAKEUP_OVERLAP) && sync &&
-			se->avg_overlap < sysctl_sched_migration_cost &&
-			pse->avg_overlap < sysctl_sched_migration_cost) {
+	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+			(se->avg_overlap < sysctl_sched_migration_cost &&
+			 pse->avg_overlap < sysctl_sched_migration_cost))) {
 		resched_task(curr);
 		return;
 	}
-- 
cgit v1.2.3


From a5d8c3483a6e19aca95ef6a2c5890e33bfa5b293 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 9 Oct 2008 11:35:51 +0200
Subject: sched debug: add name to sched_domain sysctl entries

add /proc/sys/kernel/sched_domain/cpu0/domain0/name, to make
it easier to see which specific scheduler domain remained at
that entry.

Since we process the scheduler domain tree and
simplify it, it's not always immediately clear during debugging
which domain came from where.

depends on CONFIG_SCHED_DEBUG=y.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9715f4ce6cfe..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6351,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(12);
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
 
 	if (table == NULL)
 		return NULL;
@@ -6379,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax);
-	/* &table[11] is terminator */
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
 
 	return table;
 }
@@ -7263,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)		sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)		do { } while (0)
+#endif
+
 #define	SD_INIT(sd, type)	sd_init_##type(sd)
+
 #define SD_INIT_FUNC(type)	\
 static noinline void sd_init_##type(struct sched_domain *sd)	\
 {								\
 	memset(sd, 0, sizeof(*sd));				\
 	*sd = SD_##type##_INIT;					\
 	sd->level = SD_LV_##type;				\
+	SD_INIT_NAME(sd, type);					\
 }
 
 SD_INIT_FUNC(CPU)
-- 
cgit v1.2.3


From 8083e4ad970e4eb567e31037060cdd4ba346f0c0 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Mon, 4 Aug 2008 11:59:11 -0700
Subject: [CPUFREQ][5/6] cpufreq: Changes to get_cpu_idle_time_us(), used by
 ondemand governor

export get_cpu_idle_time_us() for it to be used in ondemand governor.
Last update time can be current time when the CPU is currently non-idle,
accounting for the busy time since last idle.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 kernel/time/tick-sched.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb02324bdb88..a4d219398167 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
 #include <linux/profile.h>
 #include <linux/sched.h>
 #include <linux/tick.h>
+#include <linux/module.h>
 
 #include <asm/irq_regs.h>
 
@@ -190,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 
-	*last_update_time = ktime_to_us(ts->idle_lastupdate);
+	if (!tick_nohz_enabled)
+		return -1;
+
+	if (ts->idle_active)
+		*last_update_time = ktime_to_us(ts->idle_lastupdate);
+	else
+		*last_update_time = ktime_to_us(ktime_get());
+
 	return ktime_to_us(ts->idle_sleeptime);
 }
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
 /**
  * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
-- 
cgit v1.2.3


From a6bebbc87a8c16eabb6bd5c6fd2d994be0236fba Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 5 Oct 2008 00:51:15 +0400
Subject: [PATCH] signal, procfs: some lock_task_sighand() users do not need
 rcu_read_lock()

lock_task_sighand() make sure task->sighand is being protected,
so we do not need rcu_read_lock().
[ exec() will get task->sighand->siglock before change task->sighand! ]

But code using rcu_read_lock() _just_ to protect lock_task_sighand()
only appear in procfs. (and some code in procfs use lock_task_sighand()
without such redundant protection.)

Other subsystem may put lock_task_sighand() into rcu_read_lock()
critical region, but these rcu_read_lock() are used for protecting
"for_each_process()", "find_task_by_vpid()" etc. , not for protecting
lock_task_sighand().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
[ok from Oleg]
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 kernel/sched_debug.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index bbe6b31c3c56..ad958c1ec708 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	unsigned long flags;
 	int num_threads = 1;
 
-	rcu_read_lock();
 	if (lock_task_sighand(p, &flags)) {
 		num_threads = atomic_read(&p->signal->count);
 		unlock_task_sighand(p, &flags);
 	}
-	rcu_read_unlock();
 
 	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
 	SEQ_printf(m,
-- 
cgit v1.2.3


From 3bbfe0596746e1590888a6e1e6a07583265238b7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 10 Oct 2008 03:27:16 +0400
Subject: proc: remove kernel.maps_protect

After commit 831830b5a2b5d413407adf380ef62fe17d6fcbf2 aka
"restrict reading from /proc/<pid>/maps to those who share ->mm or can ptrace"
sysctl stopped being relevant because commit moved security checks from ->show
time to ->start time (mm_for_maps()).

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Kees Cook <kees.cook@canonical.com>
---
 kernel/sysctl.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50ec0886fa3d..cc3e0d7a5acf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
-extern int maps_protect;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifdef CONFIG_RCU_TORTURE_TEST
@@ -809,16 +808,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#endif
-#ifdef CONFIG_PROC_FS
-	{
-		.ctl_name       = CTL_UNNUMBERED,
-		.procname       = "maps_protect",
-		.data           = &maps_protect,
-		.maxlen         = sizeof(int),
-		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
-	},
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-- 
cgit v1.2.3


From 9c9f4ded90a59eee84e15f5fd38c03d60184e112 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Mon, 13 Oct 2008 10:37:26 +0100
Subject: tty: Add a kref count

Introduce a kref to the tty structure and use it to protect the tty->signal
tty references. For now we don't introduce it for anything else.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 5 ++++-
 kernel/sys.c  | 4 +---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe84796..30de644a40c4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -802,6 +802,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 	sig->leader = 0;	/* session leadership doesn't inherit */
 	sig->tty_old_pgrp = NULL;
+	sig->tty = NULL;
 
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
@@ -838,6 +839,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
+	tty_kref_put(sig->tty);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -1227,7 +1229,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 				p->nsproxy->pid_ns->child_reaper = p;
 
 			p->signal->leader_pid = pid;
-			p->signal->tty = current->signal->tty;
+			tty_kref_put(p->signal->tty);
+			p->signal->tty = tty_kref_get(current->signal->tty);
 			set_task_pgrp(p, task_pgrp_nr(current));
 			set_task_session(p, task_session_nr(current));
 			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
diff --git a/kernel/sys.c b/kernel/sys.c
index 038a7bc0901d..234d9454294e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1060,9 +1060,7 @@ asmlinkage long sys_setsid(void)
 	group_leader->signal->leader = 1;
 	__set_special_pids(sid);
 
-	spin_lock(&group_leader->sighand->siglock);
-	group_leader->signal->tty = NULL;
-	spin_unlock(&group_leader->sighand->siglock);
+	proc_clear_tty(group_leader);
 
 	err = session;
 out:
-- 
cgit v1.2.3


From 95f9bfc6b76e862265a2d70ae061eec18fe14140 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Mon, 13 Oct 2008 10:39:23 +0100
Subject: tty: Move tty_write_message out of kernel/printk

This is pure tty code so put it in the tty layer where it can be with the
locking relevant material it uses

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index b51b1567bb55..a430fd04008b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1291,22 +1291,6 @@ static int __init disable_boot_consoles(void)
 }
 late_initcall(disable_boot_consoles);
 
-/**
- * tty_write_message - write a message to a certain tty, not just the console.
- * @tty: the destination tty_struct
- * @msg: the message to write
- *
- * This is used for messages that need to be redirected to a specific tty.
- * We don't put it into the syslog queue right now maybe in the future if
- * really needed.
- */
-void tty_write_message(struct tty_struct *tty, char *msg)
-{
-	if (tty && tty->ops->write)
-		tty->ops->write(tty, msg, strlen(msg));
-	return;
-}
-
 #if defined CONFIG_PRINTK
 
 /*
-- 
cgit v1.2.3


From dbda4c0b97b18fd59b3964548361b4f92357f730 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Mon, 13 Oct 2008 10:40:53 +0100
Subject: tty: Fix abusers of current->sighand->tty

Various people outside the tty layer still stick their noses in behind the
scenes. We need to make sure they also obey the locking and referencing rules.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c    | 2 +-
 kernel/auditsc.c | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index dd68b9059418..f6006a60df5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 #endif
 
 	spin_lock_irq(&current->sighand->siglock);
-	tty = current->signal->tty;
+	tty = current->signal->tty;	/* Safe as we hold the siglock */
 	ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 59cedfb040e7..cf5bc2f5f9c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -246,8 +246,8 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
 	unsigned n;
 	if (unlikely(!ctx))
 		return 0;
-
 	n = ctx->major;
+
 	switch (audit_classify_syscall(ctx->arch, n)) {
 	case 0:	/* native */
 		if ((mask & AUDIT_PERM_WRITE) &&
@@ -1204,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 				 context->return_code);
 
-	mutex_lock(&tty_mutex);
-	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
 		tty = "(none)";
-	read_unlock(&tasklist_lock);
+	spin_unlock_irq(&tsk->sighand->siglock);
+
 	audit_log_format(ab,
 		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
 		  " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1230,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		  context->egid, context->sgid, context->fsgid, tty,
 		  tsk->sessionid);
 
-	mutex_unlock(&tty_mutex);
 
 	audit_log_task_info(ab, tsk);
 	if (context->filterkey) {
-- 
cgit v1.2.3