summaryrefslogtreecommitdiffstats
path: root/kernel/locking/rtmutex_common.h
diff options
context:
space:
mode:
authorPeter Zijlstra <peterz@infradead.org>2023-07-07 16:19:09 +0200
committerPeter Zijlstra <peterz@infradead.org>2023-07-17 13:59:10 +0200
commitf7853c34241807bb97673a5e97719123be39a09e (patch)
treec4304a821a5a5a960143abe0cf860a7d163014d6 /kernel/locking/rtmutex_common.h
parentfdf0eaf11452d72945af31804e2a1048ee1b574c (diff)
downloadlinux-stable-f7853c34241807bb97673a5e97719123be39a09e.tar.gz
linux-stable-f7853c34241807bb97673a5e97719123be39a09e.tar.bz2
linux-stable-f7853c34241807bb97673a5e97719123be39a09e.zip
locking/rtmutex: Fix task->pi_waiters integrity
Henry reported that rt_mutex_adjust_prio_check() has an ordering problem and puts the lie to the comment in [7]. Sharing the sort key between lock->waiters and owner->pi_waiters *does* create problems, since unlike what the comment claims, holding [L] is insufficient. Notably, consider: A / \ M1 M2 | | B C That is, task A owns both M1 and M2, B and C block on them. In this case a concurrent chain walk (B & C) will modify their resp. sort keys in [7] while holding M1->wait_lock and M2->wait_lock. So holding [L] is meaningless, they're different Ls. This then gives rise to a race condition between [7] and [11], where the requeue of pi_waiters will observe an inconsistent tree order. B C (holds M1->wait_lock, (holds M2->wait_lock, holds B->pi_lock) holds A->pi_lock) [7] waiter_update_prio(); ... [8] raw_spin_unlock(B->pi_lock); ... [10] raw_spin_lock(A->pi_lock); [11] rt_mutex_enqueue_pi(); // observes inconsistent A->pi_waiters // tree order Fixing this means either extending the range of the owner lock from [10-13] to [6-13], with the immediate problem that this means [6-8] hold both blocked and owner locks, or duplicating the sort key. Since the locking in chain walk is horrible enough without having to consider pi_lock nesting rules, duplicate the sort key instead. By giving each tree their own sort key, the above race becomes harmless, if C sees B at the old location, then B will correct things (if they need correcting) when it walks up the chain and reaches A. Fixes: fb00aca47440 ("rtmutex: Turn the plist into an rb-tree") Reported-by: Henry Wu <triangletrap12@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Henry Wu <triangletrap12@gmail.com> Link: https://lkml.kernel.org/r/20230707161052.GF2883469%40hirez.programming.kicks-ass.net
Diffstat (limited to 'kernel/locking/rtmutex_common.h')
-rw-r--r--kernel/locking/rtmutex_common.h47
1 files changed, 34 insertions, 13 deletions
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index c47e8361bfb5..1162e07cdaea 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -17,27 +17,44 @@
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry: rbtree node to enqueue into the waiters tree
+ * @prio: Priority of the waiter
+ * @deadline: Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+ struct rb_node entry;
+ int prio;
+ u64 deadline;
+};
+
/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @tree_entry: pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @tree: node to enqueue into the mutex waiters tree
+ * @pi_tree: node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
* @lock: Pointer to the rt_mutex on which the waiter blocks
* @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
- * @prio: Priority of the waiter
- * @deadline: Deadline of the waiter if applicable
* @ww_ctx: WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
*/
struct rt_mutex_waiter {
- struct rb_node tree_entry;
- struct rb_node pi_tree_entry;
+ struct rt_waiter_node tree;
+ struct rt_waiter_node pi_tree;
struct task_struct *task;
struct rt_mutex_base *lock;
unsigned int wake_state;
- int prio;
- u64 deadline;
struct ww_acquire_ctx *ww_ctx;
};
@@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
- return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
}
static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
@@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+
if (leftmost) {
- w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
BUG_ON(w->lock != lock);
}
return w;
@@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p)
static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
{
+ lockdep_assert_held(&p->pi_lock);
+
return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
+ pi_tree.entry);
}
#define RT_MUTEX_HAS_WAITERS 1UL
@@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
{
debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ RB_CLEAR_NODE(&waiter->tree.entry);
waiter->wake_state = TASK_NORMAL;
waiter->task = NULL;
}