summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2009-02-04 15:12:14 -0800
committerGreg Kroah-Hartman <gregkh@suse.de>2009-02-12 09:31:04 -0800
commit2b46f3769896dc04e1e49144d282e4655677105a (patch)
tree62059341ccc20b9ca86839b266b4fd13800f6dd3
parent6106d9f94e0cf107af83b6514b1b193233314a33 (diff)
downloadlinux-stable-2b46f3769896dc04e1e49144d282e4655677105a.tar.gz
linux-stable-2b46f3769896dc04e1e49144d282e4655677105a.tar.bz2
linux-stable-2b46f3769896dc04e1e49144d282e4655677105a.zip
wait: prevent exclusive waiter starvation
commit 777c6c5f1f6e757ae49ecca2ed72d6b1f523c007 upstream. With exclusive waiters, every process woken up through the wait queue must ensure that the next waiter down the line is woken when it has finished. Interruptible waiters don't do that when aborting due to a signal. And if an aborting waiter is concurrently woken up through the waitqueue, noone will ever wake up the next waiter. This has been observed with __wait_on_bit_lock() used by lock_page_killable(): the first contender on the queue was aborting when the actual lock holder woke it up concurrently. The aborted contender didn't acquire the lock and therefor never did an unlock followed by waking up the next waiter. Add abort_exclusive_wait() which removes the process' wait descriptor from the waitqueue, iff still queued, or wakes up the next waiter otherwise. It does so under the waitqueue lock. Racing with a wake up means the aborting process is either already woken (removed from the queue) and will wake up the next waiter, or it will remove itself from the queue and the concurrent wake up will apply to the next waiter after it. Use abort_exclusive_wait() in __wait_event_interruptible_exclusive() and __wait_on_bit_lock() when they were interrupted by other means than a wake up through the queue. [akpm@linux-foundation.org: coding-style fixes] Reported-by: Chris Mason <chris.mason@oracle.com> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Mentored-by: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Matthew Wilcox <matthew@wil.cx> Cc: Chuck Lever <cel@citi.umich.edu> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
-rw-r--r--include/linux/wait.h11
-rw-r--r--kernel/sched.c4
-rw-r--r--kernel/wait.c59
3 files changed, 63 insertions, 11 deletions
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 0081147a9fe8..3c2f411510e8 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -141,6 +141,8 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
list_del(&old->task_list);
}
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int sync, void *key);
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -342,16 +344,19 @@ do { \
for (;;) { \
prepare_to_wait_exclusive(&wq, &__wait, \
TASK_INTERRUPTIBLE); \
- if (condition) \
+ if (condition) { \
+ finish_wait(&wq, &__wait); \
break; \
+ } \
if (!signal_pending(current)) { \
schedule(); \
continue; \
} \
ret = -ERESTARTSYS; \
+ abort_exclusive_wait(&wq, &__wait, \
+ TASK_INTERRUPTIBLE, NULL); \
break; \
} \
- finish_wait(&wq, &__wait); \
} while (0)
#define wait_event_interruptible_exclusive(wq, condition) \
@@ -440,6 +445,8 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key);
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
diff --git a/kernel/sched.c b/kernel/sched.c
index 2a09e44374af..98c0cdc85043 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4556,8 +4556,8 @@ EXPORT_SYMBOL(default_wake_function);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, int sync, void *key)
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int sync, void *key)
{
wait_queue_t *curr, *next;
diff --git a/kernel/wait.c b/kernel/wait.c
index c275c56cf2d3..24688c762577 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -101,6 +101,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
+/*
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
@@ -127,6 +136,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(finish_wait);
+/*
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @state: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and noone wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&q->lock, flags);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+ else if (waitqueue_active(q))
+ __wake_up_common(q, mode, 1, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
@@ -187,17 +229,20 @@ int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
int (*action)(void *), unsigned mode)
{
- int ret = 0;
-
do {
+ int ret;
+
prepare_to_wait_exclusive(wq, &q->wait, mode);
- if (test_bit(q->key.bit_nr, q->key.flags)) {
- if ((ret = (*action)(q->key.flags)))
- break;
- }
+ if (!test_bit(q->key.bit_nr, q->key.flags))
+ continue;
+ ret = action(q->key.flags);
+ if (!ret)
+ continue;
+ abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+ return ret;
} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
finish_wait(wq, &q->wait);
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__wait_on_bit_lock);