From d3283fb45c06dfd7b1a36da8e6ff39d313f0600c Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Mon, 9 Apr 2012 11:00:25 -0700
Subject: trace: Remove unused workqueue tracer

This tracer was temporarily removed in 6416669 (workqueue:
temporarily remove workqueue tracing, 2010-06-29) but never
reinstated after concurrency managed workqueues were completed.
For almost two years it hasn't been compilable so it seems nobody
is using it. Delete it.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/trace/Makefile          |   1 -
 kernel/trace/trace_workqueue.c | 300 -----------------------------------------
 2 files changed, 301 deletions(-)
 delete mode 100644 kernel/trace/trace_workqueue.c

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5ea..b3afe0e76f79 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
 ifeq ($(CONFIG_BLOCK),y)
 obj-$(CONFIG_EVENT_TRACING) += blktrace.o
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
deleted file mode 100644
index 209b379a4721..000000000000
--- a/kernel/trace/trace_workqueue.c
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Workqueue statistical tracer.
- *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-
-#include <trace/events/workqueue.h>
-#include <linux/list.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/kref.h>
-#include "trace_stat.h"
-#include "trace.h"
-
-
-/* A cpu workqueue thread */
-struct cpu_workqueue_stats {
-	struct list_head            list;
-	struct kref                 kref;
-	int		            cpu;
-	pid_t			    pid;
-/* Can be inserted from interrupt or user context, need to be atomic */
-	atomic_t	            inserted;
-/*
- *  Don't need to be atomic, works are serialized in a single workqueue thread
- *  on a single CPU.
- */
-	unsigned int		    executed;
-};
-
-/* List of workqueue threads on one cpu */
-struct workqueue_global_stats {
-	struct list_head	list;
-	spinlock_t		lock;
-};
-
-/* Don't need a global lock because allocated before the workqueues, and
- * never freed.
- */
-static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
-#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
-
-static void cpu_workqueue_stat_free(struct kref *kref)
-{
-	kfree(container_of(kref, struct cpu_workqueue_stats, kref));
-}
-
-/* Insertion of a work */
-static void
-probe_workqueue_insertion(void *ignore,
-			  struct task_struct *wq_thread,
-			  struct work_struct *work)
-{
-	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node;
-	unsigned long flags;
-
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
-		if (node->pid == wq_thread->pid) {
-			atomic_inc(&node->inserted);
-			goto found;
-		}
-	}
-	pr_debug("trace_workqueue: entry not found\n");
-found:
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Execution of a work */
-static void
-probe_workqueue_execution(void *ignore,
-			  struct task_struct *wq_thread,
-			  struct work_struct *work)
-{
-	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node;
-	unsigned long flags;
-
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
-		if (node->pid == wq_thread->pid) {
-			node->executed++;
-			goto found;
-		}
-	}
-	pr_debug("trace_workqueue: entry not found\n");
-found:
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(void *ignore,
-				     struct task_struct *wq_thread, int cpu)
-{
-	struct cpu_workqueue_stats *cws;
-	unsigned long flags;
-
-	WARN_ON(cpu < 0);
-
-	/* Workqueues are sometimes created in atomic context */
-	cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
-	if (!cws) {
-		pr_warning("trace_workqueue: not enough memory\n");
-		return;
-	}
-	INIT_LIST_HEAD(&cws->list);
-	kref_init(&cws->kref);
-	cws->cpu = cpu;
-	cws->pid = wq_thread->pid;
-
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-}
-
-/* Destruction of a cpu workqueue thread */
-static void
-probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
-{
-	/* Workqueue only execute on one cpu */
-	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node, *next;
-	unsigned long flags;
-
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-							list) {
-		if (node->pid == wq_thread->pid) {
-			list_del(&node->list);
-			kref_put(&node->kref, cpu_workqueue_stat_free);
-			goto found;
-		}
-	}
-
-	pr_debug("trace_workqueue: don't find workqueue to destroy\n");
-found:
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
-}
-
-static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
-{
-	unsigned long flags;
-	struct cpu_workqueue_stats *ret = NULL;
-
-
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-
-	if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
-		ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
-				 struct cpu_workqueue_stats, list);
-		kref_get(&ret->kref);
-	}
-
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
-	return ret;
-}
-
-static void *workqueue_stat_start(struct tracer_stat *trace)
-{
-	int cpu;
-	void *ret = NULL;
-
-	for_each_possible_cpu(cpu) {
-		ret = workqueue_stat_start_cpu(cpu);
-		if (ret)
-			return ret;
-	}
-	return NULL;
-}
-
-static void *workqueue_stat_next(void *prev, int idx)
-{
-	struct cpu_workqueue_stats *prev_cws = prev;
-	struct cpu_workqueue_stats *ret;
-	int cpu = prev_cws->cpu;
-	unsigned long flags;
-
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
-		spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-		do {
-			cpu = cpumask_next(cpu, cpu_possible_mask);
-			if (cpu >= nr_cpu_ids)
-				return NULL;
-		} while (!(ret = workqueue_stat_start_cpu(cpu)));
-		return ret;
-	} else {
-		ret = list_entry(prev_cws->list.next,
-				 struct cpu_workqueue_stats, list);
-		kref_get(&ret->kref);
-	}
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
-	return ret;
-}
-
-static int workqueue_stat_show(struct seq_file *s, void *p)
-{
-	struct cpu_workqueue_stats *cws = p;
-	struct pid *pid;
-	struct task_struct *tsk;
-
-	pid = find_get_pid(cws->pid);
-	if (pid) {
-		tsk = get_pid_task(pid, PIDTYPE_PID);
-		if (tsk) {
-			seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
-				   atomic_read(&cws->inserted), cws->executed,
-				   tsk->comm);
-			put_task_struct(tsk);
-		}
-		put_pid(pid);
-	}
-
-	return 0;
-}
-
-static void workqueue_stat_release(void *stat)
-{
-	struct cpu_workqueue_stats *node = stat;
-
-	kref_put(&node->kref, cpu_workqueue_stat_free);
-}
-
-static int workqueue_stat_headers(struct seq_file *s)
-{
-	seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n");
-	seq_printf(s, "# |      |         |          |\n");
-	return 0;
-}
-
-struct tracer_stat workqueue_stats __read_mostly = {
-	.name = "workqueues",
-	.stat_start = workqueue_stat_start,
-	.stat_next = workqueue_stat_next,
-	.stat_show = workqueue_stat_show,
-	.stat_release = workqueue_stat_release,
-	.stat_headers = workqueue_stat_headers
-};
-
-
-int __init stat_workqueue_init(void)
-{
-	if (register_stat_tracer(&workqueue_stats)) {
-		pr_warning("Unable to register workqueue stat tracer\n");
-		return 1;
-	}
-
-	return 0;
-}
-fs_initcall(stat_workqueue_init);
-
-/*
- * Workqueues are created very early, just after pre-smp initcalls.
- * So we must register our tracepoints at this stage.
- */
-int __init trace_workqueue_early_init(void)
-{
-	int ret, cpu;
-
-	for_each_possible_cpu(cpu) {
-		spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
-		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
-	}
-
-	ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
-	if (ret)
-		goto out;
-
-	ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
-	if (ret)
-		goto no_insertion;
-
-	ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
-	if (ret)
-		goto no_execution;
-
-	ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
-	if (ret)
-		goto no_creation;
-
-	return 0;
-
-no_creation:
-	unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
-no_execution:
-	unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
-no_insertion:
-	unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
-out:
-	pr_warning("trace_workqueue: unable to trace workqueues\n");
-
-	return 1;
-}
-early_initcall(trace_workqueue_early_init);
-- 
cgit v1.2.3


From f5b2552b4ebbeadcadde1532d7bbd3f850719046 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 13 Apr 2012 22:06:58 +0300
Subject: workqueue: change BUG_ON() to WARN_ON()

This BUG_ON() can be triggered if you call schedule_work() before
calling INIT_WORK().  It is a bug definitely, but it's nicer to just
print a stack trace and return.

Reported-by: Matt Renzelmann <mjr@cs.wisc.edu>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5abf42f63c08..66ec08de6dac 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
 	cwq = get_cwq(gcwq->cpu, wq);
 	trace_workqueue_queue_work(cpu, cwq, work);
 
-	BUG_ON(!list_empty(&work->entry));
+	if (WARN_ON(!list_empty(&work->entry))) {
+		spin_unlock_irqrestore(&gcwq->lock, flags);
+		return;
+	}
 
 	cwq->nr_in_flight[cwq->work_color]++;
 	work_flags = work_color_to_flags(cwq->work_color);
-- 
cgit v1.2.3


From 0976dfc1d0cd80a4e9dfaf87bd8744612bde475a Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri, 20 Apr 2012 17:28:50 -0700
Subject: workqueue: Catch more locking problems with flush_work()

If a workqueue is flushed with flush_work() lockdep checking can
be circumvented. For example:

 static DEFINE_MUTEX(mutex);

 static void my_work(struct work_struct *w)
 {
         mutex_lock(&mutex);
         mutex_unlock(&mutex);
 }

 static DECLARE_WORK(work, my_work);

 static int __init start_test_module(void)
 {
         schedule_work(&work);
         return 0;
 }
 module_init(start_test_module);

 static void __exit stop_test_module(void)
 {
         mutex_lock(&mutex);
         flush_work(&work);
         mutex_unlock(&mutex);
 }
 module_exit(stop_test_module);

would not always print a warning when flush_work() was called.
In this trivial example nothing could go wrong since we are
guaranteed module_init() and module_exit() don't run concurrently,
but if the work item is schedule asynchronously we could have a
scenario where the work item is running just at the time flush_work()
is called resulting in a classic ABBA locking problem.

Add a lockdep hint by acquiring and releasing the work item
lockdep_map in flush_work() so that we always catch this
potential deadlock scenario.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Reviewed-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 66ec08de6dac..211eadb23323 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2509,6 +2509,9 @@ bool flush_work(struct work_struct *work)
 {
 	struct wq_barrier barr;
 
+	lock_map_acquire(&work->lockdep_map);
+	lock_map_release(&work->lockdep_map);
+
 	if (start_flush_work(work, &barr, true)) {
 		wait_for_completion(&barr.done);
 		destroy_work_on_stack(&barr.work);
-- 
cgit v1.2.3


From 544ecf310f0e7f51fa057ac2a295fc1b3b35a9d3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 14 May 2012 15:04:50 -0700
Subject: workqueue: skip nr_running sanity check in worker_enter_idle() if
 trustee is active

worker_enter_idle() has WARN_ON_ONCE() which triggers if nr_running
isn't zero when every worker is idle.  This can trigger spuriously
while a cpu is going down due to the way trustee sets %WORKER_ROGUE
and zaps nr_running.

It first sets %WORKER_ROGUE on all workers without updating
nr_running, releases gcwq->lock, schedules, regrabs gcwq->lock and
then zaps nr_running.  If the last running worker enters idle
inbetween, it would see stale nr_running which hasn't been zapped yet
and trigger the WARN_ON_ONCE().

Fix it by performing the sanity check iff the trustee is idle.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org
---
 kernel/workqueue.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 211eadb23323..c36c86cf7900 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1213,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker)
 	} else
 		wake_up_all(&gcwq->trustee_wait);
 
-	/* sanity check nr_running */
-	WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
+	/*
+	 * Sanity check nr_running.  Because trustee releases gcwq->lock
+	 * between setting %WORKER_ROGUE and zapping nr_running, the
+	 * warning may trigger spuriously.  Check iff trustee is idle.
+	 */
+	WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+		     gcwq->nr_workers == gcwq->nr_idle &&
 		     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
 }
 
-- 
cgit v1.2.3


From 4d82a1debbffec129cc387aafa8f40b7bbab3297 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 15 May 2012 08:06:19 -0700
Subject: lockdep: fix oops in processing workqueue

Under memory load, on x86_64, with lockdep enabled, the workqueue's
process_one_work() has been seen to oops in __lock_acquire(), barfing
on a 0xffffffff00000000 pointer in the lockdep_map's class_cache[].

Because it's permissible to free a work_struct from its callout function,
the map used is an onstack copy of the map given in the work_struct: and
that copy is made without any locking.

Surprisingly, gcc (4.5.1 in Hugh's case) uses "rep movsl" rather than
"rep movsq" for that structure copy: which might race with a workqueue
user's wait_on_work() doing lock_map_acquire() on the source of the
copy, putting a pointer into the class_cache[], but only in time for
the top half of that pointer to be copied to the destination map.

Boom when process_one_work() subsequently does lock_map_acquire()
on its onstack copy of the lockdep_map.

Fix this, and a similar instance in call_timer_fn(), with a
lockdep_copy_map() function which additionally NULLs the class_cache[].

Note: this oops was actually seen on 3.4-next, where flush_work() newly
does the racing lock_map_acquire(); but Tejun points out that 3.4 and
earlier are already vulnerable to the same through wait_on_work().

* Patch orginally from Peter.  Hugh modified it a bit and wrote the
  description.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Reported-by: Hugh Dickins <hughd@google.com>
LKML-Reference: <alpine.LSU.2.00.1205070951170.1544@eggly.anvils>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/lockdep.h | 18 ++++++++++++++++++
 kernel/timer.c          |  4 +++-
 kernel/workqueue.c      |  4 +++-
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d36619ead3ba..00e46376e28f 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -157,6 +157,24 @@ struct lockdep_map {
 #endif
 };
 
+static inline void lockdep_copy_map(struct lockdep_map *to,
+				    struct lockdep_map *from)
+{
+	int i;
+
+	*to = *from;
+	/*
+	 * Since the class cache can be modified concurrently we could observe
+	 * half pointers (64bit arch using 32bit copy insns). Therefore clear
+	 * the caches and take the performance hit.
+	 *
+	 * XXX it doesn't work well with lockdep_set_class_and_subclass(), since
+	 *     that relies on cache abuse.
+	 */
+	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+		to->class_cache[i] = NULL;
+}
+
 /*
  * Every lock has a list of other locks that were taken after it.
  * We only grow the list, never remove from it:
diff --git a/kernel/timer.c b/kernel/timer.c
index a297ffcf888e..b12385244bb5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1102,7 +1102,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 	 * warnings as well as problems when looking into
 	 * timer->lockdep_map, make a copy and use that here.
 	 */
-	struct lockdep_map lockdep_map = timer->lockdep_map;
+	struct lockdep_map lockdep_map;
+
+	lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
 #endif
 	/*
 	 * Couple the lock chain with the lock chain at
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c36c86cf7900..9a3128dc67df 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1818,7 +1818,9 @@ __acquires(&gcwq->lock)
 	 * lock freed" warnings as well as problems when looking into
 	 * work->lockdep_map, make a copy and use that here.
 	 */
-	struct lockdep_map lockdep_map = work->lockdep_map;
+	struct lockdep_map lockdep_map;
+
+	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
 	/*
 	 * A single work shouldn't be executed concurrently by
-- 
cgit v1.2.3