From 978315462d3ea3cf6cfacd34c563ec1eb02a3aa5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 17 May 2019 23:22:34 +0200 Subject: locking/lockdep: Don't complain about incorrect name for no validate class It is possible to ignore the validation for a certain lock by using: lockdep_set_novalidate_class() on it. Each invocation will assign a new name to the class it created for created __lockdep_no_validate__. That means that once lockdep_set_novalidate_class() has been used on two locks then class->name won't match lock->name for the first lock triggering the warning. So ignore changed non-matching ->name pointer for the special __lockdep_no_validate__ class. Signed-off-by: Sebastian Andrzej Siewior Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20190517212234.32611-1-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c47788fa85f9..6b283b4f87aa 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -732,7 +732,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name); + WARN_ON_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__); return class; } } -- cgit v1.2.3 From c0090c4c85c27d1fa3d785c935501b7207cd2869 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 16 May 2019 21:13:26 +0200 Subject: locking/lockdep: Remove the unused print_lock_trace() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc warns that function print_lock_trace() is unused if CONFIG_PROVE_LOCKING isn't set: ../kernel/locking/lockdep.c:2820:13: warning: ‘print_lock_trace’ defined but not used [-Wunused-function] Rework so we remove the function if CONFIG_PROVE_LOCKING isn't set. Signed-off-by: Anders Roxell Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: will.deacon@arm.com Fixes: c120bce78065 ("lockdep: Simplify stack trace handling") Link: http://lkml.kernel.org/r/20190516191326.27003-1-anders.roxell@linaro.org Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6b283b4f87aa..8d32ae7768a7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2818,10 +2818,6 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } - -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) -{ -} #endif /* -- cgit v1.2.3 From f7c1c6b36a3874d3a7987fb3af829d5b0d75bda7 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:17 +0800 Subject: locking/lockdep: Change all print_*() return type to void Since none of the print_*() function's return value is necessary, change their return type to void. No functional change. 
In cases where an invariable return value is used, this change slightly improves readability, i.e.: print_x(); return 0; is definitely better than: return print_x(); /* where print_x() always returns 0 */ Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-2-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 209 ++++++++++++++++++++++++----------------------- 1 file changed, 108 insertions(+), 101 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8d32ae7768a7..109b56267c8f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1427,16 +1427,15 @@ static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) * Print a dependency chain entry (this is only done when a deadlock * has been detected): */ -static noinline int +static noinline void print_circular_bug_entry(struct lock_list *target, int depth) { if (debug_locks_silent) - return 0; + return; printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); print_lock_trace(&target->trace, 6); - return 0; } static void @@ -1493,7 +1492,7 @@ print_circular_lock_scenario(struct held_lock *src, * When a circular dependency is detected, print the * header first: */ -static noinline int +static noinline void print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct held_lock *check_src, struct held_lock *check_tgt) @@ -1501,7 +1500,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct task_struct *curr = current; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("======================================================\n"); @@ -1519,8 +1518,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); - - return 0; } static inline int class_equal(struct lock_list *entry, void *data) @@ -1528,10 +1525,10 @@ static inline int class_equal(struct lock_list *entry, void *data) return entry->class == data; } -static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) +static noinline void print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1539,10 +1536,10 @@ static noinline int print_circular_bug(struct lock_list *this, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; if (!save_trace(&this->trace)) - return 0; + return; depth = get_lock_depth(target); @@ -1564,21 +1561,17 @@ static noinline int print_circular_bug(struct lock_list *this, printk("\nstack backtrace:\n"); dump_stack(); - - return 0; } -static noinline int print_bfs_bug(int ret) +static noinline void print_bfs_bug(int ret) { if (!debug_locks_off_graph_unlock()) - return 0; + return; /* * Breadth-first-search failed, graph got corrupted? 
*/ WARN(1, "lockdep bfs error:%d\n", ret); - - return 0; } static int noop_count(struct lock_list *entry, void *data) @@ -1767,7 +1760,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) */ static void __used print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) + struct lock_list *root) { struct lock_list *entry = leaf; int depth; @@ -1789,8 +1782,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf, entry = get_lock_parent(entry); depth--; } while (entry && (depth >= 0)); - - return; } static void @@ -1849,7 +1840,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_bad_irq_dependency(struct task_struct *curr, struct lock_list *prev_root, struct lock_list *next_root, @@ -1862,7 +1853,7 @@ print_bad_irq_dependency(struct task_struct *curr, const char *irqclass) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=====================================================\n"); @@ -1908,19 +1899,17 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) - return 0; + return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) - return 0; + return; print_shortest_lock_dependencies(forwards_entry, next_root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static const char *state_names[] = { @@ -2067,8 +2056,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } usage_mask &= LOCKF_USED_IN_IRQ_ALL; if (!usage_mask) @@ -2084,8 +2075,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, that.class = hlock_class(next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; @@ -2097,8 +2090,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, backward_mask = original_mask(target_entry1->class->usage_mask); ret = find_usage_backwards(&this, backward_mask, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (DEBUG_LOCKS_WARN_ON(ret == 1)) return 1; @@ -2112,11 +2107,13 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, if (DEBUG_LOCKS_WARN_ON(ret == -1)) return 1; - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - backward_bit, forward_bit, - state_name(backward_bit)); + print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); + + return 0; } static void inc_chains(void) @@ -2147,8 +2144,7 @@ static inline void inc_chains(void) #endif static void -print_deadlock_scenario(struct held_lock *nxt, - struct held_lock *prv) +print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) { struct lock_class *next = hlock_class(nxt); struct lock_class *prev = hlock_class(prv); @@ 
-2166,12 +2162,12 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" May be due to missing lock nesting notation\n\n"); } -static int +static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("============================================\n"); @@ -2190,8 +2186,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -2233,7 +2227,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - return print_deadlock_bug(curr, prev, next); + print_deadlock_bug(curr, prev, next); + return 0; } return 1; } @@ -2308,10 +2303,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ save_trace(trace); } - return print_circular_bug(&this, target_entry, next, prev); + print_circular_bug(&this, target_entry, next, prev); + return 0; + } + else if (unlikely(ret < 0)) { + print_bfs_bug(ret); + return 0; } - else if (unlikely(ret < 0)) - return print_bfs_bug(ret); if (!check_irq_usage(curr, prev, next)) return 0; @@ -2352,8 +2350,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, debug_atomic_inc(nr_redundant); return 2; } - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (!trace->nr_entries && !save_trace(trace)) @@ -2877,8 +2877,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) -static void -print_usage_bug_scenario(struct held_lock *lock) +static void print_usage_bug_scenario(struct held_lock *lock) { struct lock_class *class = hlock_class(lock); @@ -2895,12 +2894,12 @@ print_usage_bug_scenario(struct held_lock *lock) printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("================================\n"); @@ -2930,8 +2929,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -2941,8 +2938,10 @@ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { - if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { + print_usage_bug(curr, this, bad_bit, new_bit); + return 0; + } return 1; } @@ -2950,7 +2949,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, /* * print irq inversion bug: */ -static int +static void print_irq_inversion_bug(struct task_struct *curr, struct lock_list *root, struct lock_list *other, struct held_lock *this, int forwards, @@ -2961,7 +2960,7 @@ print_irq_inversion_bug(struct task_struct *curr, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("========================================================\n"); @@ -3002,13 +3001,11 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) - return 0; + return; 
print_shortest_lock_dependencies(other, root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -3026,13 +3023,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; - return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, irqclass); + return 0; } /* @@ -3050,13 +3050,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; - return print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, irqclass); + return 0; } void print_irqtrace_events(struct task_struct *curr) @@ -3599,15 +3602,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); -static int +static void print_lock_nested_lock_not_held(struct task_struct *curr, struct held_lock *hlock, unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("==================================\n"); @@ -3629,8 +3632,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static int __lock_is_held(const struct lockdep_map *lock, int read); @@ -3779,8 +3780,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock, -1)) - return print_lock_nested_lock_not_held(curr, hlock, ip); + if (nest_lock && !__lock_is_held(nest_lock, -1)) { + print_lock_nested_lock_not_held(curr, hlock, ip); + return 0; + } if (!debug_locks_silent) { WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); @@ -3816,14 +3819,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 1; } -static int -print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_unlock_imbalance_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=====================================\n"); @@ -3841,8 +3844,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static int match_held_lock(const struct held_lock *hlock, @@ -3961,8 +3962,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); @@ -4002,8 +4005,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) return 0; hlock = find_held_lock(curr, lock, depth, &i); - if 
(!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -4047,16 +4052,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * So we're all set to release this lock.. wait what lock? We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(depth <= 0)) - return print_unlock_imbalance_bug(curr, lock, ip); + if (DEBUG_LOCKS_WARN_ON(depth <= 0)) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } /* * Check whether the lock exists in the current stack * of held locks: */ hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -4399,14 +4408,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) EXPORT_SYMBOL_GPL(lock_unpin_lock); #ifdef CONFIG_LOCK_STAT -static int -print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_lock_contention_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=================================\n"); @@ -4424,8 +4433,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static void -- cgit v1.2.3 From c52478f4f38ace598475413a08dba9b9fd827eaf Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:19 +0800 Subject: locking/lockdep: Adjust lock usage bit character checks The lock usage bit characters are defined and determined with tricks. Add some explanation to make it a bit clearer, then adjust the logic to check the usage, which optimizes the code a bit. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-4-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 109b56267c8f..a033df00fd1d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -500,15 +500,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit) static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { + /* + * The usage character defaults to '.' (i.e., irqs disabled and not in + * irq context), which is the safest usage category. 
+ */ char c = '.'; - if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) + /* + * The order of the following usage checks matters, which will + * result in the outcome character as follows: + * + * - '+': irq is enabled and not in irq context + * - '-': in irq context and irq is disabled + * - '?': in irq context and irq is enabled + */ + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) { c = '+'; - if (class->usage_mask & lock_flag(bit)) { - c = '-'; - if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) + if (class->usage_mask & lock_flag(bit)) c = '?'; - } + } else if (class->usage_mask & lock_flag(bit)) + c = '-'; return c; } -- cgit v1.2.3 From e7a38f63ba50dc95426dd50c43383dfecaa35d7f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:20 +0800 Subject: locking/lockdep: Remove useless conditional macro Since #defined(CONFIG_PROVE_LOCKING) is used in the scope of #ifdef CONFIG_PROVE_LOCKING, it can be removed. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-5-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a033df00fd1d..3c477018e184 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1674,7 +1674,7 @@ check_redundant(struct lock_list *root, struct lock_class *target, return result; } -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_TRACE_IRQFLAGS static inline int usage_accumulate(struct lock_list *entry, void *mask) { @@ -2152,7 +2152,7 @@ static inline void inc_chains(void) nr_process_chains++; } -#endif +#endif /* CONFIG_TRACE_IRQFLAGS */ static void print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) @@ -2829,7 +2829,7 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } -#endif +#endif /* CONFIG_PROVE_LOCKING */ /* * We are building curr_chain_key incrementally, so double-check -- cgit v1.2.3 From 834494b28024b39d45aea6bcc642b0fe94fe2503 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:21 +0800 Subject: locking/lockdep: Print the right depth for chain key collision Since chains are separated by IRQ context, so when printing a chain the depth should be consistent with it. 
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-6-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3c477018e184..bc1efc12a8c5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2519,10 +2519,11 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne struct held_lock *hlock; u64 chain_key = 0; int depth = curr->lockdep_depth; - int i; + int i = get_first_held_lock(curr, hlock_next); - printk("depth: %u\n", depth + 1); - for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { + printk("depth: %u (irq_context %u)\n", depth - i + 1, + hlock_next->irq_context); + for (; i < depth; i++) { hlock = curr->held_locks + i; chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); -- cgit v1.2.3 From e196e479a3b844da6e6e71e0d2a8694040cb4e52 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:23 +0800 Subject: locking/lockdep: Use lockdep_init_task for task initiation consistently Despite that there is a lockdep_init_task() which does nothing, lockdep initiates tasks by assigning lockdep fields and does so inconsistently. Fix this by using lockdep_init_task(). Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-8-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bc1efc12a8c5..b7d9c28ecf3b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -359,6 +359,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) return k0 | (u64)k1 << 32; } +void lockdep_init_task(struct task_struct *task) +{ + task->lockdep_depth = 0; /* no locks held yet */ + task->curr_chain_key = 0; + task->lockdep_recursion = 0; +} + void lockdep_off(void) { current->lockdep_recursion++; @@ -4589,9 +4596,7 @@ void lockdep_reset(void) int i; raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; + lockdep_init_task(current); memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); nr_hardirq_chains = 0; nr_softirq_chains = 0; -- cgit v1.2.3 From f6ec8829ac9d59b637366c13038f15d6f6156fe1 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:24 +0800 Subject: locking/lockdep: Define INITIAL_CHAIN_KEY for chain keys to start with Chain keys are computed using Jenkins hash function, which needs an initial hash to start with. Dedicate a macro to make this clear and configurable. A later patch changes this initial chain key. 
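(For illustration, not part of the patch above: a minimal standalone sketch of what "needs an initial hash to start with" means for chain keys. The mixing step below is a stand-in, not the kernel's iterate_chain_key(); only the seeding pattern, every chain-key accumulation starting from the same dedicated INITIAL_CHAIN_KEY, is the point.)

#include <stdint.h>
#include <stdio.h>

#define INITIAL_CHAIN_KEY 0ULL	/* dedicated seed; a later patch in this series changes it to -1 */

/* Stand-in mixer, NOT the kernel's iterate_chain_key(); it only shows where the seed is used. */
static uint64_t mix_chain_key(uint64_t key, uint32_t class_idx)
{
	return (key ^ class_idx) * 0x9E3779B97F4A7C15ULL;
}

int main(void)
{
	uint32_t held_class_idx[] = { 3, 7, 42 };	/* hypothetical class indices of held locks */
	uint64_t chain_key = INITIAL_CHAIN_KEY;		/* every chain starts from the same seed */
	unsigned int i;

	for (i = 0; i < 3; i++)
		chain_key = mix_chain_key(chain_key, held_class_idx[i]);

	printf("chain key: 0x%llx\n", (unsigned long long)chain_key);
	return 0;
}
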
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-9-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b7d9c28ecf3b..9edf6f12b711 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -362,7 +362,7 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) void lockdep_init_task(struct task_struct *task) { task->lockdep_depth = 0; /* no locks held yet */ - task->curr_chain_key = 0; + task->curr_chain_key = INITIAL_CHAIN_KEY; task->lockdep_recursion = 0; } @@ -857,7 +857,7 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; static bool check_lock_chain_key(struct lock_chain *chain) { #ifdef CONFIG_PROVE_LOCKING - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int i; for (i = chain->base; i < chain->base + chain->depth; i++) @@ -2524,7 +2524,7 @@ static void print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) { struct held_lock *hlock; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int depth = curr->lockdep_depth; int i = get_first_held_lock(curr, hlock_next); @@ -2544,7 +2544,7 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne static void print_chain_keys_chain(struct lock_chain *chain) { int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int class_id; printk("depth: %u\n", chain->depth); @@ -2848,7 +2848,7 @@ static void check_chain_key(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; unsigned int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -2872,7 +2872,7 @@ static void check_chain_key(struct task_struct *curr) if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } @@ -3787,14 +3787,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, /* * How can we have a chain hash when we ain't got no keys?! */ - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) + if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY)) return 0; chain_head = 1; } hlock->prev_chain_key = chain_key; if (separate_irq_context(curr, hlock)) { - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } chain_key = iterate_chain_key(chain_key, class_idx); @@ -4636,7 +4636,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, return; recalc: - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; for (i = chain->base; i < chain->base + chain->depth; i++) chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); if (chain->depth && chain->chain_key == chain_key) -- cgit v1.2.3 From 01bb6f0af992a1e6b7797d92fd31a7864872e347 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:25 +0800 Subject: locking/lockdep: Change the range of class_idx in held_lock struct held_lock->class_idx is used to point to the class of the held lock. The index is shifted by 1 to make index 0 mean no class, which results in class index shifting back and forth but is not worth doing so. 
The reason is: (1) there will be no "no-class" held_lock to begin with, and (2) index 0 seems to be used for error checking, but if something wrong indeed happened, the index can't be counted on to distinguish it as that something won't set the class_idx to 0 on purpose to tell us it is wrong. Therefore, change the index to start from 0. This saves a lot of back-and-forth shifts and a class slot back to lock_classes. Since index 0 is now used for lock class, we change the initial chain key to -1 to avoid key collision, which is due to the fact that __jhash_mix(0, 0, 0) = 0. Actually, the initial chain key can be any arbitrary value other than 0. In addition, a bitmap is maintained to keep track of the used lock classes, and we check the validity of the held lock against that bitmap. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-10-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 59 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 20 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9edf6f12b711..3eecae315885 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -151,17 +151,28 @@ unsigned long nr_lock_classes; static #endif struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { - if (!hlock->class_idx) { + unsigned int class_idx = hlock->class_idx; + + /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */ + barrier(); + + if (!test_bit(class_idx, lock_classes_in_use)) { /* * Someone passed in garbage, we give up. */ DEBUG_LOCKS_WARN_ON(1); return NULL; } - return lock_classes + hlock->class_idx - 1; + + /* + * At this point, if the passed hlock->class_idx is still garbage, + * we just have to live with it + */ + return lock_classes + class_idx; } #ifdef CONFIG_LOCK_STAT @@ -590,19 +601,22 @@ static void print_lock(struct held_lock *hlock) /* * We can be called locklessly through debug_show_all_locks() so be * extra careful, the hlock might have been released and cleared. + * + * If this indeed happens, lets pretend it does not hurt to continue + * to print the lock unless the hlock class_idx does not point to a + * registered class. The rationale here is: since we don't attempt + * to distinguish whether we are in this situation, if it just + * happened we can't count on class_idx to tell either. 
*/ - unsigned int class_idx = hlock->class_idx; + struct lock_class *lock = hlock_class(hlock); - /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ - barrier(); - - if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { + if (!lock) { printk(KERN_CONT "\n"); return; } printk(KERN_CONT "%p", hlock->instance); - print_lock_name(lock_classes + class_idx - 1); + print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -861,7 +875,7 @@ static bool check_lock_chain_key(struct lock_chain *chain) int i; for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); /* * The 'unsigned long long' casts avoid that a compiler warning * is reported when building tools/lib/lockdep. @@ -1136,6 +1150,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } nr_lock_classes++; + __set_bit(class - lock_classes, lock_classes_in_use); debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; @@ -2550,7 +2565,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id + 1, chain_key); + chain_key = print_chain_key_iteration(class_id, chain_key); print_lock_name(lock_classes + class_id); printk("\n"); @@ -2601,7 +2616,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx - 1; + id = curr->held_locks[i].class_idx; if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -2684,7 +2699,7 @@ static inline int add_chain_cache(struct task_struct *curr, if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; + int lock_id = curr->held_locks[i].class_idx; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; @@ -2864,10 +2879,12 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } + /* - * Whoops ran out of static storage again? + * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is + * it registered lock class index? */ - if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use))) return; if (prev_hlock && (prev_hlock->irq_context != @@ -3715,7 +3732,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; - class_idx = class - lock_classes + 1; + class_idx = class - lock_classes; if (depth) { hlock = curr->held_locks + depth - 1; @@ -3777,9 +3794,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ /* - * Whoops, we did it again.. ran straight out of our static allocation. + * Whoops, we did it again.. class_idx is invalid. 
*/ - if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use))) return 0; chain_key = curr->curr_chain_key; @@ -3894,7 +3911,7 @@ static int match_held_lock(const struct held_lock *hlock, if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; - if (hlock->class_idx == class - lock_classes + 1) + if (hlock->class_idx == class - lock_classes) return 1; } @@ -3988,7 +4005,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name, lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class - lock_classes; curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -4638,7 +4655,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf, recalc: chain_key = INITIAL_CHAIN_KEY; for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); if (chain->depth && chain->chain_key == chain_key) return; /* Overwrite the chain key for concurrent RCU readers. */ @@ -4712,6 +4729,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); nr_lock_classes--; + __clear_bit(class - lock_classes, lock_classes_in_use); } else { WARN_ONCE(true, "%s() failed for class %s\n", __func__, class->name); @@ -5057,6 +5075,7 @@ void __init lockdep_init(void) printk(" memory used by lock dependency info: %zu kB\n", (sizeof(lock_classes) + + sizeof(lock_classes_in_use) + sizeof(classhash_table) + sizeof(list_entries) + sizeof(list_entries_in_use) + -- cgit v1.2.3 From 0b9fc8ecfa30000c8900da7adbbef23438de9ec0 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:26 +0800 Subject: locking/lockdep: Remove unused argument in validate_chain() and check_deadlock() The lockdep_map argument in them is not used, remove it. 
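(For illustration, referring back to the "Change the range of class_idx in held_lock struct" patch above: the stated reason for moving the initial chain key to -1 is that __jhash_mix(0, 0, 0) = 0. That follows from the mixer being built only from add, subtract, xor and rotate, each of which preserves an all-zero state. The stand-in below has the same algebraic shape; it is not the kernel macro itself.)

#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t w, unsigned int s)
{
	return (w << s) | (w >> (32 - s));
}

/* Same shape as a Jenkins-style mix round: only +, -, ^ and rotates, so zero stays zero. */
static void mix(uint32_t *a, uint32_t *b, uint32_t *c)
{
	*a -= *c;  *a ^= rol32(*c, 4);  *c += *b;
	*b -= *a;  *b ^= rol32(*a, 6);  *a += *c;
	*c -= *b;  *c ^= rol32(*b, 8);  *b += *a;
}

int main(void)
{
	uint32_t a = 0, b = 0, c = 0;

	mix(&a, &b, &c);
	printf("mix(0, 0, 0) -> (%u, %u, %u)\n", a, b, c);	/* prints (0, 0, 0) */
	return 0;
}

As that patch notes, any nonzero seed would do; -1 is simply the value chosen.
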
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Bart Van Assche Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-11-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3eecae315885..6cf14c84eb6d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2230,8 +2230,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read */ static int -check_deadlock(struct task_struct *curr, struct held_lock *next, - struct lockdep_map *next_instance, int read) +check_deadlock(struct task_struct *curr, struct held_lock *next, int read) { struct held_lock *prev; struct held_lock *nest = NULL; @@ -2789,8 +2788,9 @@ cache_hit: return 1; } -static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) +static int validate_chain(struct task_struct *curr, + struct held_lock *hlock, + int chain_head, u64 chain_key) { /* * Trylock needs to maintain the stack of held locks, but it @@ -2816,7 +2816,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * any of these scenarios could lead to a deadlock. If * All validations */ - int ret = check_deadlock(curr, hlock, lock, hlock->read); + int ret = check_deadlock(curr, hlock, hlock->read); if (!ret) return 0; @@ -2847,8 +2847,8 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, } #else static inline int validate_chain(struct task_struct *curr, - struct lockdep_map *lock, struct held_lock *hlock, - int chain_head, u64 chain_key) + struct held_lock *hlock, + int chain_head, u64 chain_key) { return 1; } @@ -3826,7 +3826,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, WARN_ON_ONCE(!hlock_class(hlock)->key); } - if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) + if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; curr->curr_chain_key = chain_key; -- cgit v1.2.3 From 31a490e5c54f5499aa744f8524611e2a4b19f8ba Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:27 +0800 Subject: locking/lockdep: Update comment A leftover comment is removed. While at it, add more explanatory comments. Such a trivial patch! Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-12-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6cf14c84eb6d..a9799f9ed093 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2811,10 +2811,16 @@ static int validate_chain(struct task_struct *curr, * - is softirq-safe, if this lock is hardirq-unsafe * * And check whether the new lock's dependency graph - * could lead back to the previous lock. + * could lead back to the previous lock: * - * any of these scenarios could lead to a deadlock. 
If - * All validations + * - within the current held-lock stack + * - across our accumulated lock dependency records + * + * any of these scenarios could lead to a deadlock. + */ + /* + * The simple case: does the current hold the same lock + * already? */ int ret = check_deadlock(curr, hlock, hlock->read); -- cgit v1.2.3 From aa4807719e076bfb2dee9c96adf2c648e47d472f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:28 +0800 Subject: locking/lockdep: Change type of the element field in circular_queue The element field is an array in struct circular_queue to keep track of locks in the search. Making it the same type as the locks avoids type cast. Also fix a typo and elaborate the comment above struct circular_queue. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Bart Van Assche Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-13-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a9799f9ed093..d467ba825dca 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1262,13 +1262,17 @@ static int add_lock_to_list(struct lock_class *this, #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) /* - * The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. + * The circular_queue and helpers are used to implement graph + * breadth-first search (BFS) algorithm, by which we can determine + * whether there is a path from a lock to another. In deadlock checks, + * a path from the next lock to be acquired to a previous held lock + * indicates that adding the -> lock dependency will + * produce a circle in the graph. Breadth-first search instead of + * depth-first search is used in order to find the shortest (circular) + * path. 
*/ struct circular_queue { - unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE]; unsigned int front, rear; }; @@ -1294,7 +1298,7 @@ static inline int __cq_full(struct circular_queue *cq) return ((cq->rear + 1) & CQ_MASK) == cq->front; } -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem) { if (__cq_full(cq)) return -1; @@ -1304,7 +1308,7 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) return 0; } -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +static inline int __cq_dequeue(struct circular_queue *cq, struct lock_list **elem) { if (__cq_empty(cq)) return -1; @@ -1382,12 +1386,12 @@ static int __bfs(struct lock_list *source_entry, goto exit; __cq_init(cq); - __cq_enqueue(cq, (unsigned long)source_entry); + __cq_enqueue(cq, source_entry); while (!__cq_empty(cq)) { struct lock_list *lock; - __cq_dequeue(cq, (unsigned long *)&lock); + __cq_dequeue(cq, &lock); if (!lock->class) { ret = -2; @@ -1411,7 +1415,7 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (__cq_enqueue(cq, (unsigned long)entry)) { + if (__cq_enqueue(cq, entry)) { ret = -1; goto exit; } -- cgit v1.2.3 From c1661325597f68bc9e632c4fa9c86983d56fba4f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:29 +0800 Subject: locking/lockdep: Change the return type of __cq_dequeue() With the change, we can slightly adjust the code to iterate the queue in BFS search, which simplifies the code. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-14-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d467ba825dca..d23dcb47389e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1308,14 +1308,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem return 0; } -static inline int __cq_dequeue(struct circular_queue *cq, struct lock_list **elem) +/* + * Dequeue an element from the circular_queue, return a lock_list if + * the queue is not empty, or NULL if otherwise. 
+ */ +static inline struct lock_list * __cq_dequeue(struct circular_queue *cq) { + struct lock_list * lock; + if (__cq_empty(cq)) - return -1; + return NULL; - *elem = cq->element[cq->front]; + lock = cq->element[cq->front]; cq->front = (cq->front + 1) & CQ_MASK; - return 0; + + return lock; } static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) @@ -1367,6 +1374,7 @@ static int __bfs(struct lock_list *source_entry, int forward) { struct lock_list *entry; + struct lock_list *lock; struct list_head *head; struct circular_queue *cq = &lock_cq; int ret = 1; @@ -1388,10 +1396,7 @@ static int __bfs(struct lock_list *source_entry, __cq_init(cq); __cq_enqueue(cq, source_entry); - while (!__cq_empty(cq)) { - struct lock_list *lock; - - __cq_dequeue(cq, &lock); + while ((lock = __cq_dequeue(cq))) { if (!lock->class) { ret = -2; -- cgit v1.2.3 From 77a806922cfdebcf3ae89d31a8b592a7f7fbe537 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:30 +0800 Subject: locking/lockdep: Avoid constant checks in __bfs by using offset reference In search of a dependency in the lock graph, there is contant checks for forward or backward search. Directly reference the field offset of the struct that differentiates the type of search to avoid those checks. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-15-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d23dcb47389e..2e8ef6082f72 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1367,11 +1367,25 @@ static inline int get_lock_depth(struct lock_list *child) return depth; } +/* + * Return the forward or backward dependency list. 
+ * + * @lock: the lock_list to get its class's dependency list + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + */ +static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) +{ + void *lock_class = lock->class; + + return lock_class + offset; +} + static int __bfs(struct lock_list *source_entry, void *data, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry, - int forward) + int offset) { struct lock_list *entry; struct lock_list *lock; @@ -1385,11 +1399,7 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (forward) - head = &source_entry->class->locks_after; - else - head = &source_entry->class->locks_before; - + head = get_dep_list(source_entry, offset); if (list_empty(head)) goto exit; @@ -1403,10 +1413,7 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (forward) - head = &lock->class->locks_after; - else - head = &lock->class->locks_before; + head = get_dep_list(lock, offset); DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -1439,7 +1446,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 1); + return __bfs(src_entry, data, match, target_entry, + offsetof(struct lock_class, locks_after)); } @@ -1448,7 +1456,8 @@ static inline int __bfs_backwards(struct lock_list *src_entry, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 0); + return __bfs(src_entry, data, match, target_entry, + offsetof(struct lock_class, locks_before)); } -- cgit v1.2.3 From 154f185e9c0f6c50ac8e901630e14aa5b36f9414 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:31 +0800 Subject: locking/lockdep: Update comments on dependency search The breadth-first search is implemented as flat-out non-recursive now, but the comments are still describing it as recursive, update the comments in that regard. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-16-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2e8ef6082f72..b2ca20aa69aa 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1381,6 +1381,10 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) return lock_class + offset; } +/* + * Forward- or backward-dependency search, used for both circular dependency + * checking and hardirq-unsafe/softirq-unsafe checking. + */ static int __bfs(struct lock_list *source_entry, void *data, int (*match)(struct lock_list *entry, void *data), @@ -1461,12 +1465,6 @@ static inline int __bfs_backwards(struct lock_list *src_entry, } -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. 
- */ - static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) { unsigned long *entries = stack_trace + trace->offset; @@ -2285,7 +2283,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, int read) /* * There was a chain-cache miss, and we are about to add a new dependency - * to a previous lock. We recursively validate the following rules: + * to a previous lock. We validate the following rules: * * - would the adding of the -> dependency create a * circular dependency in the graph? [== circular deadlock] @@ -2335,11 +2333,12 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, /* * Prove that the new -> dependency would not * create a circular dependency in the graph. (We do this by - * forward-recursing into the graph starting at , and - * checking whether we can reach .) + * a breadth-first search into the graph starting at , + * and check whether we can reach .) * - * We are using global variables to control the recursion, to - * keep the stackframe size of the recursive functions low: + * The search is limited by the size of the circular queue (i.e., + * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes + * in the graph whose neighbours are to be checked. */ this.class = hlock_class(next); this.parent = NULL; -- cgit v1.2.3 From 4609c4f963f353613812f999bb027aac795bcde8 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:33 +0800 Subject: locking/lockdep: Remove redundant argument in check_deadlock In check_deadlock(), the third argument read comes from the second argument hlock so that it can be removed. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-18-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b2ca20aa69aa..be4c1348ddcd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2246,7 +2246,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read */ static int -check_deadlock(struct task_struct *curr, struct held_lock *next, int read) +check_deadlock(struct task_struct *curr, struct held_lock *next) { struct held_lock *prev; struct held_lock *nest = NULL; @@ -2265,7 +2265,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, int read) * Allow read-after-read recursion of the same * lock class (i.e. read_lock(lock)+read_lock(lock)): */ - if ((read == 2) && prev->read) + if ((next->read == 2) && prev->read) return 2; /* @@ -2839,7 +2839,7 @@ static int validate_chain(struct task_struct *curr, * The simple case: does the current hold the same lock * already? */ - int ret = check_deadlock(curr, hlock, hlock->read); + int ret = check_deadlock(curr, hlock); if (!ret) return 0; -- cgit v1.2.3 From b4adfe8e05f15d7e73309c93c2c337df7eb5278f Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:34 +0800 Subject: locking/lockdep: Remove unused argument in __lock_release The @nested is not used in __release_lock so remove it despite that it is not used in lock_release in the first place. 
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-19-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index be4c1348ddcd..8169706df767 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4096,7 +4096,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +__lock_release(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock; @@ -4384,7 +4384,7 @@ void lock_release(struct lockdep_map *lock, int nested, check_flags(flags); current->lockdep_recursion = 1; trace_lock_release(lock, ip); - if (__lock_release(lock, nested, ip)) + if (__lock_release(lock, ip)) check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); -- cgit v1.2.3 From 8c2c2b449aa50463ba4cc1f33cdfc98750ed03ab Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:35 +0800 Subject: locking/lockdep: Refactorize check_noncircular and check_redundant These two functions now handle different check results themselves. A new check_path function is added to check whether there is a path in the dependency graph. No functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-20-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 118 +++++++++++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 44 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8169706df767..30a1c0e32573 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1683,33 +1683,90 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) } /* - * Prove that the dependency graph starting at can not - * lead to . Print an error and return 0 if it does. + * Check that the dependency graph starting at can lead to + * or not. Print an error and return 0 if it does. */ static noinline int -check_noncircular(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +check_path(struct lock_class *target, struct lock_list *src_entry, + struct lock_list **target_entry) { - int result; + int ret; + + ret = __bfs_forwards(src_entry, (void *)target, class_equal, + target_entry); + + if (unlikely(ret < 0)) + print_bfs_bug(ret); + + return ret; +} + +/* + * Prove that the dependency graph starting at can not + * lead to . If it can, there is a circle when adding + * -> dependency. + * + * Print an error and return 0 if it does. 
+ */ +static noinline int +check_noncircular(struct held_lock *src, struct held_lock *target, + struct lock_trace *trace) +{ + int ret; + struct lock_list *uninitialized_var(target_entry); + struct lock_list src_entry = { + .class = hlock_class(src), + .parent = NULL, + }; debug_atomic_inc(nr_cyclic_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(hlock_class(target), &src_entry, &target_entry); - return result; + if (unlikely(!ret)) { + if (!trace->nr_entries) { + /* + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. + */ + save_trace(trace); + } + + print_circular_bug(&src_entry, target_entry, src, target); + } + + return ret; } +/* + * Check that the dependency graph starting at can lead to + * or not. If it can, -> dependency is already + * in the graph. + * + * Print an error and return 2 if it does or 1 if it does not. + */ static noinline int -check_redundant(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +check_redundant(struct held_lock *src, struct held_lock *target) { - int result; + int ret; + struct lock_list *uninitialized_var(target_entry); + struct lock_list src_entry = { + .class = hlock_class(src), + .parent = NULL, + }; debug_atomic_inc(nr_redundant_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(hlock_class(target), &src_entry, &target_entry); - return result; + if (!ret) { + debug_atomic_inc(nr_redundant); + ret = 2; + } else if (ret < 0) + ret = 0; + + return ret; } #ifdef CONFIG_TRACE_IRQFLAGS @@ -2307,9 +2364,7 @@ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, int distance, struct lock_trace *trace) { - struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; - struct lock_list this; int ret; if (!hlock_class(prev)->key || !hlock_class(next)->key) { @@ -2340,25 +2395,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes * in the graph whose neighbours are to be checked. */ - this.class = hlock_class(next); - this.parent = NULL; - ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) { - if (!trace->nr_entries) { - /* - * If save_trace fails here, the printing might - * trigger a WARN but because of the !nr_entries it - * should not do bad things. - */ - save_trace(trace); - } - print_circular_bug(&this, target_entry, next, prev); + ret = check_noncircular(next, prev, trace); + if (unlikely(ret <= 0)) return 0; - } - else if (unlikely(ret < 0)) { - print_bfs_bug(ret); - return 0; - } if (!check_irq_usage(curr, prev, next)) return 0; @@ -2392,18 +2431,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, /* * Is the -> link redundant? 
*/ - this.class = hlock_class(prev); - this.parent = NULL; - ret = check_redundant(&this, hlock_class(next), &target_entry); - if (!ret) { - debug_atomic_inc(nr_redundant); - return 2; - } - if (ret < 0) { - print_bfs_bug(ret); - return 0; - } - + ret = check_redundant(prev, next); + if (ret != 1) + return ret; if (!trace->nr_entries && !save_trace(trace)) return 0; -- cgit v1.2.3 From 68e9dc29f8f42c79d2a3755223ed910ce36b4ae2 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:36 +0800 Subject: locking/lockdep: Check redundant dependency only when CONFIG_LOCKDEP_SMALL As Peter has put it all sound and complete for the cause, I simply quote: "It (check_redundant) was added for cross-release (which has since been reverted) which would generate a lot of redundant links (IIRC) but having it makes the reports more convoluted -- basically, if we had an A-B-C relation, then A-C will not be added to the graph because it is already covered. This then means any report will include B, even though a shorter cycle might have been possible." This would increase the number of direct dependencies. For a simple workload (make clean; reboot; make vmlinux -j8), the data looks like this: CONFIG_LOCKDEP_SMALL: direct dependencies: 6926 !CONFIG_LOCKDEP_SMALL: direct dependencies: 9052 (+30.7%) Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-21-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 30a1c0e32573..63b82921698d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1739,6 +1739,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, return ret; } +#ifdef CONFIG_LOCKDEP_SMALL /* * Check that the dependency graph starting at can lead to * or not. If it can, -> dependency is already @@ -1768,6 +1769,7 @@ check_redundant(struct held_lock *src, struct held_lock *target) return ret; } +#endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -2428,12 +2430,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } +#ifdef CONFIG_LOCKDEP_SMALL /* * Is the -> link redundant? */ ret = check_redundant(prev, next); if (ret != 1) return ret; +#endif if (!trace->nr_entries && !save_trace(trace)) return 0; -- cgit v1.2.3 From 091806515124b20f8cff7927b4b7ff399483b109 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:37 +0800 Subject: locking/lockdep: Consolidate lock usage bit initialization Lock usage bit initialization is consolidated into one function mark_usage(). Trivial readability improvement. No functional change. 
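A rough user-space sketch of the consolidation pattern, with hypothetical *_sketch names standing in for the real lockdep machinery: the conditional IRQ-state marking and the unconditional LOCK_USED marking, which callers previously performed as two separate steps, now live behind one helper.

#include <stdio.h>

/* Stand-ins for the two pieces of work the consolidated helper combines. */
static int mark_irq_state_sketch(void)
{
        puts("marking hardirq/softirq usage bits");
        return 1;
}

static int mark_used_bit_sketch(void)
{
        puts("marking LOCK_USED");
        return 1;
}

/* One entry point: skip the IRQ bookkeeping when @check is 0, but
 * always record that the lock has been used at least once. */
static int mark_usage_sketch(int check)
{
        if (!check)
                goto lock_used;

        if (!mark_irq_state_sketch())
                return 0;

lock_used:
        return mark_used_bit_sketch();
}

int main(void)
{
        return !(mark_usage_sketch(0) && mark_usage_sketch(1));
}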
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-22-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 63b82921698d..1123e7e6c78d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3460,8 +3460,12 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +static int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { + if (!check) + goto lock_used; + /* * If non-trylock use in a hardirq or softirq context, then * mark the lock as used in these contexts: @@ -3505,6 +3509,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } +lock_used: + /* mark it as used: */ + if (!mark_lock(curr, hlock, LOCK_USED)) + return 0; + return 1; } @@ -3546,8 +3555,8 @@ int mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 1; } -static inline int mark_irqflags(struct task_struct *curr, - struct held_lock *hlock) +static inline int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { return 1; } @@ -3833,11 +3842,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; - if (check && !mark_irqflags(curr, hlock)) - return 0; - - /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED)) + /* Initialize the lock usage bit */ + if (!mark_usage(curr, hlock, check)) return 0; /* -- cgit v1.2.3 From 4d56330df22dd9dd9a24f147014f60ee4c914fb8 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:38 +0800 Subject: locking/lockdep: Adjust new bit cases in mark_lock The new bit can be any possible lock usage except it is garbage, so the cases in switch can be made simpler. Warn early on if wrong usage bit is passed without taking locks. No functional change. 
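A compressed user-space sketch of the resulting control flow, using a hypothetical enum and *_sketch helpers rather than the kernel's definitions: the usage bit is range-checked once up front, LOCK_USED keeps its special case, and every remaining (necessarily IRQ-related) bit falls through to a single default handler.

#include <stdio.h>

enum usage_bit_sketch {
        USED_IN_HARDIRQ_SKETCH,
        ENABLED_HARDIRQ_SKETCH,
        LOCK_USED_SKETCH,
        USAGE_STATES_SKETCH,    /* first invalid value */
};

static int mark_lock_irq_sketch(int bit)
{
        printf("IRQ usage bit %d handled\n", bit);
        return 1;
}

static int mark_lock_sketch(int new_bit)
{
        /* warn early: anything past the last state is garbage */
        if (new_bit >= USAGE_STATES_SKETCH) {
                fprintf(stderr, "bogus usage bit %d\n", new_bit);
                return 0;
        }

        switch (new_bit) {
        case LOCK_USED_SKETCH:
                puts("one lock fewer counted as unused");
                break;
        default:
                /* all IRQ usage bits funnel into one handler */
                if (!mark_lock_irq_sketch(new_bit))
                        return 0;
        }
        return 1;
}

int main(void)
{
        return !(mark_lock_sketch(LOCK_USED_SKETCH) &&
                 mark_lock_sketch(USED_IN_HARDIRQ_SKETCH) &&
                 !mark_lock_sketch(42));
}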
Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-23-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1123e7e6c78d..9c4e2a7547d3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3582,6 +3582,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, { unsigned int new_mask = 1 << new_bit, ret = 1; + if (new_bit >= LOCK_USAGE_STATES) { + DEBUG_LOCKS_WARN_ON(1); + return 0; + } + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3605,25 +3610,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; switch (new_bit) { -#define LOCKDEP_STATE(__STATE) \ - case LOCK_USED_IN_##__STATE: \ - case LOCK_USED_IN_##__STATE##_READ: \ - case LOCK_ENABLED_##__STATE: \ - case LOCK_ENABLED_##__STATE##_READ: -#include "lockdep_states.h" -#undef LOCKDEP_STATE - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; - break; case LOCK_USED: debug_atomic_dec(nr_unused_locks); break; default: - if (!debug_locks_off_graph_unlock()) + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) return 0; - WARN_ON(1); - return 0; } graph_unlock(); -- cgit v1.2.3 From bf998b98f5bce4ebc97b3980016f54fabb7a4958 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:39 +0800 Subject: locking/lockdep: Remove !dir in lock irq usage check In mark_lock_irq(), the following checks are performed: ---------------------------------- | -> | unsafe | read unsafe | |----------------------------------| | safe | F B | F* B* | |----------------------------------| | read safe | F? B* | - | ---------------------------------- Where: F: check_usage_forwards B: check_usage_backwards *: check enabled by STRICT_READ_CHECKS ?: check enabled by the !dir condition From the checking point of view, the special F? case does not make sense, though it was perhaps added for performance reasons. As a later patch will address this issue, remove this exception, which makes the checks consistent going forward. With STRICT_READ_CHECKS = 1, which is the default, there is no functional change. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-24-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9c4e2a7547d3..2168e94715b9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3235,7 +3235,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, * Validate that the lock dependencies don't have conflicting usage * states.
*/ - if ((!read || !dir || STRICT_READ_CHECKS) && + if ((!read || STRICT_READ_CHECKS) && !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) return 0; -- cgit v1.2.3 From 8c8889d8eaf4501ae4aaf870b6f8f55db5d5109a Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 May 2019 23:15:08 +0300 Subject: locking/lockdep: Fix OOO unlock when hlocks need merging The sequence static DEFINE_WW_CLASS(test_ww_class); struct ww_acquire_ctx ww_ctx; struct ww_mutex ww_lock_a; struct ww_mutex ww_lock_b; struct mutex lock_c; struct mutex lock_d; ww_acquire_init(&ww_ctx, &test_ww_class); ww_mutex_init(&ww_lock_a, &test_ww_class); ww_mutex_init(&ww_lock_b, &test_ww_class); mutex_init(&lock_c); ww_mutex_lock(&ww_lock_a, &ww_ctx); mutex_lock(&lock_c); ww_mutex_lock(&ww_lock_b, &ww_ctx); mutex_unlock(&lock_c); (*) ww_mutex_unlock(&ww_lock_b); ww_mutex_unlock(&ww_lock_a); ww_acquire_fini(&ww_ctx); triggers the following WARN in __lock_release() when doing the unlock at *: DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1); The problem is that the WARN check doesn't take into account the merging of ww_lock_a and ww_lock_b which results in decreasing curr->lockdep_depth by 2 not only 1. Note that the following sequence doesn't trigger the WARN, since there won't be any hlock merging. ww_acquire_init(&ww_ctx, &test_ww_class); ww_mutex_init(&ww_lock_a, &test_ww_class); ww_mutex_init(&ww_lock_b, &test_ww_class); mutex_init(&lock_c); mutex_init(&lock_d); ww_mutex_lock(&ww_lock_a, &ww_ctx); mutex_lock(&lock_c); mutex_lock(&lock_d); ww_mutex_lock(&ww_lock_b, &ww_ctx); mutex_unlock(&lock_d); ww_mutex_unlock(&ww_lock_b); ww_mutex_unlock(&ww_lock_a); mutex_unlock(&lock_c); ww_acquire_fini(&ww_ctx); In general both of the above two sequences are valid and shouldn't trigger any lockdep warning. Fix this by taking the decrement due to the hlock merging into account during lock release and hlock class re-setting. Merging can't happen during lock downgrading since there won't be a new possibility to merge hlocks in that case, so add a WARN if merging still happens then. 
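A toy user-space model of the depth accounting the fix restores; it is heavily simplified (real held_lock merging also depends on nest_lock and lock class indices) and uses made-up names throughout. The point is that when the re-acquire pass after an out-of-order unlock merges an entry into its now-adjacent twin, the expected depth shrinks by one extra, which is what the new @merged counter captures.

#include <assert.h>
#include <stdio.h>

/* Toy held-lock stack: acquiring the same class again under a nest
 * lock folds into the previous entry instead of growing the stack. */
struct toy_hlock {
        int class_idx;
        int references;
};

static struct toy_hlock stack[8];
static int depth;

static int toy_acquire(int class_idx, int nest)
{
        if (depth && nest && stack[depth - 1].class_idx == class_idx) {
                stack[depth - 1].references++;
                return 2;               /* merged, depth unchanged */
        }
        stack[depth].class_idx = class_idx;
        stack[depth].references = 1;
        depth++;
        return 1;
}

int main(void)
{
        int old_depth, merged = 0;

        toy_acquire(1, 1);      /* ww_lock_a (class 1) */
        toy_acquire(2, 0);      /* lock_c    (class 2) */
        toy_acquire(1, 1);      /* ww_lock_b (class 1, not adjacent, new entry) */

        /* Unlock lock_c out of order: pop it and everything above it,
         * then re-acquire; ww_lock_b now merges into ww_lock_a. */
        old_depth = depth;
        depth = 1;
        if (toy_acquire(1, 1) == 2)
                merged++;

        /* corrected invariant: one entry gone for lock_c itself,
         * plus one per merge that happened while re-acquiring */
        assert(depth == old_depth - 1 - merged);
        printf("depth %d -> %d (merged=%d)\n", old_depth, depth, merged);
        return 0;
}

Checking against old_depth - 1 instead, i.e. ignoring merged, trips the assertion, which mirrors the spurious DEBUG_LOCKS_WARN_ON described above (the patch folds the first term in by starting merged at 1 in __lock_release).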
Signed-off-by: Imre Deak Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: ville.syrjala@linux.intel.com Link: https://lkml.kernel.org/r/20190524201509.9199-1-imre.deak@intel.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2168e94715b9..6c97f67ec321 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3808,7 +3808,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->references = 2; } - return 1; + return 2; } } @@ -4011,22 +4011,33 @@ out: } static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, - int idx) + int idx, unsigned int *merged) { struct held_lock *hlock; + int first_idx = idx; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { - if (!__lock_acquire(hlock->instance, + switch (__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) + hlock->references, hlock->pin_count)) { + case 0: return 1; + case 1: + break; + case 2: + *merged += (idx == first_idx); + break; + default: + WARN_ON(1); + return 0; + } } return 0; } @@ -4037,9 +4048,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; struct lock_class *class; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -4066,14 +4077,14 @@ __lock_set_class(struct lockdep_map *lock, const char *name, curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) return 0; /* * I took it apart and put it back together again, except now I have * these 'spare' parts.. where shall I put them. */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged)) return 0; return 1; } @@ -4081,8 +4092,8 @@ __lock_set_class(struct lockdep_map *lock, const char *name, static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -4109,7 +4120,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) hlock->read = 1; hlock->acquire_ip = ip; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) + return 0; + + /* Merging can't happen with unchanged classes.. 
*/ + if (DEBUG_LOCKS_WARN_ON(merged)) return 0; /* @@ -4118,6 +4133,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; + return 1; } @@ -4132,8 +4148,8 @@ static int __lock_release(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 1; struct held_lock *hlock; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -4192,14 +4208,15 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) if (i == depth-1) return 1; - if (reacquire_held_locks(curr, depth, i + 1)) + if (reacquire_held_locks(curr, depth, i + 1, &merged)) return 0; /* * We had N bottles of beer on the wall, we drank one, but now * there's not N-1 bottles of beer left on the wall... + * Pouring two of the bottles together is acceptable. */ - DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1); + DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged); /* * Since reacquire_held_locks() would have called check_chain_key() -- cgit v1.2.3 From d9349850e188b8b59e5322fda17ff389a1c0cd7d Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 May 2019 23:15:09 +0300 Subject: locking/lockdep: Fix merging of hlocks with non-zero references The sequence static DEFINE_WW_CLASS(test_ww_class); struct ww_acquire_ctx ww_ctx; struct ww_mutex ww_lock_a; struct ww_mutex ww_lock_b; struct ww_mutex ww_lock_c; struct mutex lock_c; ww_acquire_init(&ww_ctx, &test_ww_class); ww_mutex_init(&ww_lock_a, &test_ww_class); ww_mutex_init(&ww_lock_b, &test_ww_class); ww_mutex_init(&ww_lock_c, &test_ww_class); mutex_init(&lock_c); ww_mutex_lock(&ww_lock_a, &ww_ctx); mutex_lock(&lock_c); ww_mutex_lock(&ww_lock_b, &ww_ctx); ww_mutex_lock(&ww_lock_c, &ww_ctx); mutex_unlock(&lock_c); (*) ww_mutex_unlock(&ww_lock_c); ww_mutex_unlock(&ww_lock_b); ww_mutex_unlock(&ww_lock_a); ww_acquire_fini(&ww_ctx); (**) will trigger the following error in __lock_release() when calling mutex_release() at **: DEBUG_LOCKS_WARN_ON(depth <= 0) The problem is that the hlock merging happening at * updates the references for test_ww_class incorrectly to 3 whereas it should've updated it to 4 (representing all the instances for ww_ctx and ww_lock_[abc]). Fix this by updating the references during merging correctly taking into account that we can have non-zero references (both for the hlock that we merge into another hlock or for the hlock we are merging into). Signed-off-by: Imre Deak Signed-off-by: Peter Zijlstra (Intel) Cc: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/20190524201509.9199-2-imre.deak@intel.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6c97f67ec321..48a840adb281 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3796,17 +3796,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) { - /* - * Check: unsigned int references:12, overflow. 
- */ - if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) - return 0; + if (!references) + references++; + if (!hlock->references) hlock->references++; - } else { - hlock->references = 2; - } + + hlock->references += references; + + /* Overflow */ + if (DEBUG_LOCKS_WARN_ON(hlock->references < references)) + return 0; return 2; } -- cgit v1.2.3 From 24811637dbfd07c69da7e9db586d35d17e6afca3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 May 2019 10:23:26 +0200 Subject: locking/lock_events: Use raw_cpu_{add,inc}() for stats Instead of playing silly games with CONFIG_DEBUG_PREEMPT toggling between this_cpu_*() and __this_cpu_*() use raw_cpu_*(), which is exactly what we want here. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Waiman Long Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190527082326.GP2623@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lock_events.h | 45 ++++---------------------------------------- 1 file changed, 4 insertions(+), 41 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 46b71af8eef2..8c7e7d25f09c 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -31,50 +31,13 @@ enum lock_events { DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); /* - * The purpose of the lock event counting subsystem is to provide a low - * overhead way to record the number of specific locking events by using - * percpu counters. It is the percpu sum that matters, not specifically - * how many of them happens in each cpu. - * - * It is possible that the same percpu counter may be modified in both - * the process and interrupt contexts. For architectures that perform - * percpu operation with multiple instructions, it is possible to lose - * count if a process context percpu update is interrupted in the middle - * and the same counter is updated in the interrupt context. Therefore, - * the generated percpu sum may not be precise. The error, if any, should - * be small and insignificant. - * - * For those architectures that do multi-instruction percpu operation, - * preemption in the middle and moving the task to another cpu may cause - * a larger error in the count. Again, this will be few and far between. - * Given the imprecise nature of the count and the possibility of resetting - * the count and doing the measurement again, this is not really a big - * problem. - * - * To get a better picture of what is happening under the hood, it is - * suggested that a few measurements should be taken with the counts - * reset in between to stamp out outliner because of these possible - * error conditions. - * - * To minimize overhead, we use __this_cpu_*() in all cases except when - * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*() - * will be used to avoid the appearance of unwanted BUG messages. - */ -#ifdef CONFIG_DEBUG_PREEMPT -#define lockevent_percpu_inc(x) this_cpu_inc(x) -#define lockevent_percpu_add(x, v) this_cpu_add(x, v) -#else -#define lockevent_percpu_inc(x) __this_cpu_inc(x) -#define lockevent_percpu_add(x, v) __this_cpu_add(x, v) -#endif - -/* - * Increment the PV qspinlock statistical counters + * Increment the statistical counters. use raw_cpu_inc() because of lower + * overhead and we don't care if we loose the occasional update. 
*/ static inline void __lockevent_inc(enum lock_events event, bool cond) { if (cond) - lockevent_percpu_inc(lockevents[event]); + raw_cpu_inc(lockevents[event]); } #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) @@ -82,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond) static inline void __lockevent_add(enum lock_events event, int inc) { - lockevent_percpu_add(lockevents[event], inc); + raw_cpu_add(lockevents[event], inc); } #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) -- cgit v1.2.3 From dd471efe345bf6f9e1206f6c629ca3e80eb43523 Mon Sep 17 00:00:00 2001 From: Kobe Wu Date: Thu, 30 May 2019 19:59:35 +0800 Subject: locking/lockdep: Remove unnecessary DEBUG_LOCKS_WARN_ON() DEBUG_LOCKS_WARN_ON() will turn off debug_locks and makes print_unlock_imbalance_bug() return directly. Remove a redundant whitespace. Signed-off-by: Kobe Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Eason Lin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/1559217575-30298-1-git-send-email-kobe-cp.wu@mediatek.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 48a840adb281..5e368f485330 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4160,7 +4160,7 @@ __lock_release(struct lockdep_map *lock, unsigned long ip) * So we're all set to release this lock.. wait what lock? We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(depth <= 0)) { + if (depth <= 0) { print_unlock_imbalance_bug(curr, lock, ip); return 0; } -- cgit v1.2.3 From c71fd893f614f205dbc050d60299cc5496491c19 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:00 -0400 Subject: locking/rwsem: Make owner available even if !CONFIG_RWSEM_SPIN_ON_OWNER The owner field in the rw_semaphore structure is used primarily for optimistic spinning. However, identifying the rwsem owner can also be helpful in debugging as well as tracing locking related issues when analyzing crash dump. The owner field may also store state information that can be important to the operation of the rwsem. So the owner field is now made a permanent member of the rw_semaphore structure irrespective of CONFIG_RWSEM_SPIN_ON_OWNER. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-2-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 2 +- kernel/locking/rwsem.h | 23 ----------------------- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 0b1f77957240..c0500679fd2f 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -86,8 +86,8 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER sem->owner = NULL; +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 64877f5294e3..eb9c8534299b 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -61,7 +61,6 @@ #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that * store tearing can't happen as optimistic spinners may read and use @@ -126,7 +125,6 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) * real owner or one of the real owners. The only exception is when the * unlock is done by up_read_non_owner(). */ -#define rwsem_clear_reader_owned rwsem_clear_reader_owned static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { unsigned long val = (unsigned long)current | RWSEM_READER_OWNED @@ -135,28 +133,7 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) cmpxchg_relaxed((unsigned long *)&sem->owner, val, RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); } -#endif - #else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} - -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ -} -#endif - -#ifndef rwsem_clear_reader_owned static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { } -- cgit v1.2.3 From 5c1ec49b60cdb31e51010f8a647f3189b774bddf Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:01 -0400 Subject: locking/rwsem: Remove rwsem_wake() wakeup optimization After the following commit: 59aabfc7e959 ("locking/rwsem: Reduce spinlock contention in wakeup after up_read()/up_write()") the rwsem_wake() forgoes doing a wakeup if the wait_lock cannot be directly acquired and an optimistic spinning locker is present. This can help performance by avoiding spinning on the wait_lock when it is contended. With the later commit: 133e89ef5ef3 ("locking/rwsem: Enable lockless waiter wakeup(s)") the performance advantage of the above optimization diminishes as the average wait_lock hold time become much shorter. With a later patch that supports rwsem lock handoff, we can no longer relies on the fact that the presence of an optimistic spinning locker will ensure that the lock will be acquired by a task soon and rwsem_wake() will be called later on to wake up waiters. This can lead to missed wakeup and application hang. So the original 59aabfc7e959 commit has to be reverted. 
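A small pthread analogue of the rule this revert restores, purely illustrative and user-space only (real rwsem wakeups go through the wait list and wake_q): the release slow path always performs the wakeup step when waiters may be queued, instead of skipping it on the assumption that a currently spinning locker will take the lock and wake the queue later.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waiter_cv = PTHREAD_COND_INITIALIZER;
static bool lock_available;

/* Release slow path after the revert: unconditionally take the wait
 * lock and wake the queue head, with no "is somebody spinning?" shortcut. */
static void release_slowpath(void)
{
        pthread_mutex_lock(&wait_lock);
        lock_available = true;
        pthread_cond_signal(&waiter_cv);
        pthread_mutex_unlock(&wait_lock);
}

static void *waiter(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&wait_lock);
        while (!lock_available)
                pthread_cond_wait(&waiter_cv, &wait_lock);
        pthread_mutex_unlock(&wait_lock);
        puts("waiter woken");
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);
        release_slowpath();
        pthread_join(t, NULL);
        return 0;
}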
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-3-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 72 --------------------------------------------- 1 file changed, 72 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index c0500679fd2f..3083fdf50447 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -411,25 +411,11 @@ done: lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } - -/* - * Return true if the rwsem has active spinner - */ -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return osq_is_locked(&sem->osq); -} - #else static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } - -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return false; -} #endif /* @@ -651,65 +637,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; DEFINE_WAKE_Q(wake_q); - /* - * __rwsem_down_write_failed_common(sem) - * rwsem_optimistic_spin(sem) - * osq_unlock(sem->osq) - * ... - * atomic_long_add_return(&sem->count) - * - * - VS - - * - * __up_write() - * if (atomic_long_sub_return_release(&sem->count) < 0) - * rwsem_wake(sem) - * osq_is_locked(&sem->osq) - * - * And __up_write() must observe !osq_is_locked() when it observes the - * atomic_long_add_return() in order to not miss a wakeup. - * - * This boils down to: - * - * [S.rel] X = 1 [RmW] r0 = (Y += 0) - * MB RMB - * [RmW] Y += 1 [L] r1 = X - * - * exists (r0=1 /\ r1=0) - */ - smp_rmb(); - - /* - * If a spinner is present, it is not necessary to do the wakeup. - * Try to do wakeup only if the trylock succeeds to minimize - * spinlock contention which may introduce too much delay in the - * unlock operation. - * - * spinning writer up_write/up_read caller - * --------------- ----------------------- - * [S] osq_unlock() [L] osq - * MB RMB - * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) - * - * Here, it is important to make sure that there won't be a missed - * wakeup while the rwsem is free and the only spinning writer goes - * to sleep without taking the rwsem. Even when the spinning writer - * is just going to break out of the waiting loop, it will still do - * a trylock in rwsem_down_write_failed() before sleeping. IOW, if - * rwsem_has_spinner() is true, it will guarantee at least one - * trylock attempt on the rwsem later on. - */ - if (rwsem_has_spinner(sem)) { - /* - * The smp_rmb() here is to make sure that the spinner - * state is consulted before reading the wait_lock. - */ - smp_rmb(); - if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) - return sem; - goto locked; - } raw_spin_lock_irqsave(&sem->wait_lock, flags); -locked: if (!list_empty(&sem->wait_list)) __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); -- cgit v1.2.3 From 64489e78004cb5623211c75790cac90bd25ff5e9 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:02 -0400 Subject: locking/rwsem: Implement a new locking scheme The current way of using various reader, writer and waiting biases in the rwsem code are confusing and hard to understand. I have to reread the rwsem count guide in the rwsem-xadd.c file from time to time to remind myself how this whole thing works. 
It also makes the rwsem code harder to be optimized. To make rwsem more sane, a new locking scheme similar to the one in qrwlock is now being used. The atomic long count has the following bit definitions: Bit 0 - writer locked bit Bit 1 - waiters present bit Bits 2-7 - reserved for future extension Bits 8-X - reader count (24/56 bits) The cmpxchg instruction is now used to acquire the write lock. The read lock is still acquired with xadd instruction, so there is no change here. This scheme will allow up to 16M/64P active readers which should be more than enough. We can always use some more reserved bits if necessary. With that change, we can deterministically know if a rwsem has been write-locked. Looking at the count alone, however, one cannot determine for certain if a rwsem is owned by readers or not as the readers that set the reader count bits may be in the process of backing out. So we still need the reader-owned bit in the owner field to be sure. With a locking microbenchmark running on 5.1 based kernel, the total locking rates (in kops/s) of the benchmark on a 8-socket 120-core IvyBridge-EX system before and after the patch were as follows: Before Patch After Patch # of Threads wlock rlock wlock rlock ------------ ----- ----- ----- ----- 1 30,659 31,341 31,055 31,283 2 8,909 16,457 9,884 17,659 4 9,028 15,823 8,933 20,233 8 8,410 14,212 7,230 17,140 16 8,217 25,240 7,479 24,607 The locking rates of the benchmark on a Power8 system were as follows: Before Patch After Patch # of Threads wlock rlock wlock rlock ------------ ----- ----- ----- ----- 1 12,963 13,647 13,275 13,601 2 7,570 11,569 7,902 10,829 4 5,232 5,516 5,466 5,435 8 5,233 3,386 5,467 3,168 The locking rates of the benchmark on a 2-socket ARM64 system were as follows: Before Patch After Patch # of Threads wlock rlock wlock rlock ------------ ----- ----- ----- ----- 1 21,495 21,046 21,524 21,074 2 5,293 10,502 5,333 10,504 4 5,325 11,463 5,358 11,631 8 5,391 11,712 5,470 11,680 The performance are roughly the same before and after the patch. There are run-to-run variations in performance. Runs with higher variances usually have higher throughput. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-4-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 147 +++++++++++++++----------------------------- kernel/locking/rwsem.h | 74 +++++++++++----------- 2 files changed, 85 insertions(+), 136 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 3083fdf50447..7d537b50a849 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -9,6 +9,8 @@ * * Optimistic spinning by Tim Chen * and Davidlohr Bueso . Based on mutexes. + * + * Rwsem count bit fields re-definition by Waiman Long . */ #include #include @@ -22,52 +24,20 @@ #include "rwsem.h" /* - * Guide to the rw_semaphore's count field for common values. - * (32-bit case illustrated, similar for 64-bit) - * - * 0x0000000X (1) X readers active or attempting lock, no writer waiting - * X = #active_readers + #readers attempting to lock - * (X*ACTIVE_BIAS) - * - * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or - * attempting to read lock or write lock. 
- * - * 0xffff000X (1) X readers active or attempting lock, with waiters for lock - * X = #active readers + # readers attempting lock - * (X*ACTIVE_BIAS + WAITING_BIAS) - * (2) 1 writer attempting lock, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * (3) 1 writer active, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * - * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock - * (WAITING_BIAS + ACTIVE_BIAS) - * (2) 1 writer active or attempting lock, no waiters for lock - * (ACTIVE_WRITE_BIAS) + * Guide to the rw_semaphore's count field. * - * 0xffff0000 (1) There are writers or readers queued but none active - * or in the process of attempting lock. - * (WAITING_BIAS) - * Note: writer can attempt to steal lock for this count by adding - * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count + * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned + * by a writer. * - * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. - * (ACTIVE_WRITE_BIAS + WAITING_BIAS) - * - * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking - * the count becomes more than 0 for successful lock acquisition, - * i.e. the case where there are only readers or nobody has lock. - * (1st and 2nd case above). - * - * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and - * checking the count becomes ACTIVE_WRITE_BIAS for successful lock - * acquisition (i.e. nobody else has lock or attempts lock). If - * unsuccessful, in rwsem_down_write_failed, we'll check to see if there - * are only waiters but none active (5th case above), and attempt to - * steal the lock. + * The lock is owned by readers when + * (1) the RWSEM_WRITER_LOCKED isn't set in count, + * (2) some of the reader bits are set in count, and + * (3) the owner field has RWSEM_READ_OWNED bit set. * + * Having some reader bits set is not enough to guarantee a readers owned + * lock as the readers may be in the process of backing out from the count + * and a writer has just released the lock. So another writer may steal + * the lock immediately after that. */ /* @@ -113,9 +83,8 @@ enum rwsem_wake_type { /* * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) + * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must + * have been set. * - there must be someone on the queue * - the wait_lock must be held by the caller * - tasks are marked for wakeup, the caller must later invoke wake_up_q() @@ -160,22 +129,11 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * so we can bail out early if a writer stole the lock. */ if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: + adjustment = RWSEM_READER_BIAS; oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* - * If the count is still less than RWSEM_WAITING_BIAS - * after removing the adjustment, it is assumed that - * a writer has stolen the lock. We have to undo our - * reader grant. - */ - if (atomic_long_add_return(-adjustment, &sem->count) < - RWSEM_WAITING_BIAS) - return; - - /* Last active locker left. Retry waking readers. 
*/ - goto try_reader_grant; + if (unlikely(oldcount & RWSEM_WRITER_MASK)) { + atomic_long_sub(adjustment, &sem->count); + return; } /* * Set it to reader-owned to give spinners an early @@ -209,11 +167,11 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, } list_cut_before(&wlist, &sem->wait_list, &waiter->list); - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); if (list_empty(&sem->wait_list)) { /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; + adjustment -= RWSEM_FLAG_WAITERS; } if (adjustment) @@ -248,22 +206,15 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, */ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { - /* - * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. - */ - if (count != RWSEM_WAITING_BIAS) + long new; + + if (count & RWSEM_LOCK_MASK) return false; - /* - * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there - * are other tasks on the wait list, we need to add on WAITING_BIAS. - */ - count = list_is_singular(&sem->wait_list) ? - RWSEM_ACTIVE_WRITE_BIAS : - RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + new = count + RWSEM_WRITER_LOCKED - + (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0); - if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) - == RWSEM_WAITING_BIAS) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) { rwsem_set_owner(sem); return true; } @@ -279,9 +230,9 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { long count = atomic_long_read(&sem->count); - while (!count || count == RWSEM_WAITING_BIAS) { + while (!(count & RWSEM_LOCK_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, - count + RWSEM_ACTIVE_WRITE_BIAS)) { + count + RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); lockevent_inc(rwsem_opt_wlock); return true; @@ -424,7 +375,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) static inline struct rw_semaphore __sched * __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) { - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + long count, adjustment = -RWSEM_READER_BIAS; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); @@ -436,16 +387,16 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) /* * In case the wait queue is empty and the lock isn't owned * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_ACTIVE_READ_BIAS has already - * been set in the count. + * immediately as its RWSEM_READER_BIAS has already been + * set in the count. */ - if (atomic_long_read(&sem->count) >= 0) { + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); return sem; } - adjustment += RWSEM_WAITING_BIAS; + adjustment += RWSEM_FLAG_WAITERS; } list_add_tail(&waiter.list, &sem->wait_list); @@ -458,9 +409,8 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! 
*/ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) + if (!(count & RWSEM_LOCK_MASK) || + (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -488,7 +438,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) out_nolock: list_del(&waiter.list); if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -521,9 +471,6 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); - /* undo write bias from down_write operation, stop active locking */ - count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); - /* do optimistic spinning and steal lock if possible */ if (rwsem_optimistic_spin(sem)) return sem; @@ -543,16 +490,18 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) list_add_tail(&waiter.list, &sem->wait_list); - /* we're now waiting on the lock, but no longer actively locking */ + /* we're now waiting on the lock */ if (waiting) { count = atomic_long_read(&sem->count); /* * If there were already threads queued before us and there are - * no active writers, the lock must be read owned; so we try to - * wake any read locks that were queued ahead of us. + * no active writers and some readers, the lock must be read + * owned; so we try to any read locks that were queued ahead + * of us. */ - if (count > RWSEM_WAITING_BIAS) { + if (!(count & RWSEM_WRITER_MASK) && + (count & RWSEM_READER_MASK)) { __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock @@ -569,8 +518,9 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) wake_q_init(&wake_q); } - } else - count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); + } else { + count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); + } /* wait until we successfully acquire the lock */ set_current_state(state); @@ -587,7 +537,8 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) schedule(); lockevent_inc(rwsem_sleep_writer); set_current_state(state); - } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); + count = atomic_long_read(&sem->count); + } while (count & RWSEM_LOCK_MASK); raw_spin_lock_irq(&sem->wait_lock); } @@ -603,7 +554,7 @@ out_nolock: raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); else __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index eb9c8534299b..499a9b2bda82 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -42,24 +42,24 @@ #endif /* - * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. - * Adapted largely from include/asm-i386/rwsem.h - * by Paul Mackerras . 
- */ - -/* - * the semaphore definition + * The definition of the atomic counter in the semaphore: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bits 2-7 - reserved + * Bits 8-X - 24-bit (32-bit) or 56-bit reader count + * + * atomic_long_fetch_add() is used to obtain reader lock, whereas + * atomic_long_cmpxchg() will be used to obtain writer lock. */ -#ifdef CONFIG_64BIT -# define RWSEM_ACTIVE_MASK 0xffffffffL -#else -# define RWSEM_ACTIVE_MASK 0x0000ffffL -#endif - -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_READER_SHIFT 8 +#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) +#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) +#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED +#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS) /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -151,7 +151,8 @@ extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); */ static inline void __down_read(struct rw_semaphore *sem) { - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, + &sem->count) & RWSEM_READ_FAILED_MASK)) { rwsem_down_read_failed(sem); DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); @@ -162,7 +163,8 @@ static inline void __down_read(struct rw_semaphore *sem) static inline int __down_read_killable(struct rw_semaphore *sem) { - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, + &sem->count) & RWSEM_READ_FAILED_MASK)) { if (IS_ERR(rwsem_down_read_failed_killable(sem))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & @@ -183,11 +185,11 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) lockevent_inc(rwsem_rtrylock); do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { + tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); return 1; } - } while (tmp >= 0); + } while (!(tmp & RWSEM_READ_FAILED_MASK)); return 0; } @@ -196,22 +198,16 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write(struct rw_semaphore *sem) { - long tmp; - - tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, + RWSEM_WRITER_LOCKED))) rwsem_down_write_failed(sem); rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) { - long tmp; - - tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, + RWSEM_WRITER_LOCKED))) if (IS_ERR(rwsem_down_write_failed_killable(sem))) return -EINTR; rwsem_set_owner(sem); @@ -224,7 +220,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) lockevent_inc(rwsem_wtrylock); tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + RWSEM_WRITER_LOCKED); if (tmp == RWSEM_UNLOCKED_VALUE) { rwsem_set_owner(sem); return true; @@ -242,8 
+238,9 @@ static inline void __up_read(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); rwsem_clear_reader_owned(sem); - tmp = atomic_long_dec_return_release(&sem->count); - if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) + tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) + == RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } @@ -254,8 +251,8 @@ static inline void __up_write(struct rw_semaphore *sem) { DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); rwsem_clear_owner(sem); - if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count) < 0)) + if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, + &sem->count) & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } @@ -274,8 +271,9 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * write side. As such, rely on RELEASE semantics. */ DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); - tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + tmp = atomic_long_fetch_add_release( + -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); rwsem_set_reader_owned(sem); - if (tmp < 0) + if (tmp & RWSEM_FLAG_WAITERS) rwsem_downgrade_wake(sem); } -- cgit v1.2.3 From 5dec94d4923683b1dd6a09dc62427a24d79ee7b4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:03 -0400 Subject: locking/rwsem: Merge rwsem.h and rwsem-xadd.c into rwsem.c Now we only have one implementation of rwsem. Even though we still use xadd to handle reader locking, we use cmpxchg for writer instead. So the filename rwsem-xadd.c is not strictly correct. Also no one outside of the rwsem code need to know the internal implementation other than function prototypes for two internal functions that are called directly from percpu-rwsem.c. So the rwsem-xadd.c and rwsem.h files are now merged into rwsem.c in the following order: The rwsem.h file now contains only 2 function declarations for __up_read() and __down_read(). This is a code relocation patch with no code change at all except making __up_read() and __down_read() non-static functions so they can be used by percpu-rwsem.c. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-5-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 2 +- kernel/locking/rwsem-xadd.c | 624 ------------------------------- kernel/locking/rwsem.c | 884 ++++++++++++++++++++++++++++++++++++++++++++ kernel/locking/rwsem.h | 281 +------------- 4 files changed, 891 insertions(+), 900 deletions(-) delete mode 100644 kernel/locking/rwsem-xadd.c (limited to 'kernel/locking') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6fe2f333aecb..45452facff3b 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs. 
KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c deleted file mode 100644 index 7d537b50a849..000000000000 --- a/kernel/locking/rwsem-xadd.c +++ /dev/null @@ -1,624 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). - * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi - * and Michel Lespinasse - * - * Optimistic spinning by Tim Chen - * and Davidlohr Bueso . Based on mutexes. - * - * Rwsem count bit fields re-definition by Waiman Long . - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rwsem.h" - -/* - * Guide to the rw_semaphore's count field. - * - * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned - * by a writer. - * - * The lock is owned by readers when - * (1) the RWSEM_WRITER_LOCKED isn't set in count, - * (2) some of the reader bits are set in count, and - * (3) the owner field has RWSEM_READ_OWNED bit set. - * - * Having some reader bits set is not enough to guarantee a readers owned - * lock as the readers may be in the process of backing out from the count - * and a writer has just released the lock. So another writer may steal - * the lock immediately after that. - */ - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); - sem->owner = NULL; -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - osq_lock_init(&sem->osq); -#endif -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must - * have been set. - * - there must be someone on the queue - * - the wait_lock must be held by the caller - * - tasks are marked for wakeup, the caller must later invoke wake_up_q() - * to actually wakeup the blocked task(s) and drop the reference count, - * preferably when the wait_lock is released - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only marked woken if downgrading is false - */ -static void __rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, - struct wake_q_head *wake_q) -{ - struct rwsem_waiter *waiter, *tmp; - long oldcount, woken = 0, adjustment = 0; - struct list_head wlist; - - /* - * Take a peek at the queue head waiter such that we can determine - * the wakeup(s) to perform. 
- */ - waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) { - /* - * Mark writer at the front of the queue for wakeup. - * Until the task is actually later awoken later by - * the caller, other writers are able to steal it. - * Readers, on the other hand, will block as they - * will notice the queued writer. - */ - wake_q_add(wake_q, waiter->task); - lockevent_inc(rwsem_wake_writer); - } - - return; - } - - /* - * Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_READER_BIAS; - oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount & RWSEM_WRITER_MASK)) { - atomic_long_sub(adjustment, &sem->count); - return; - } - /* - * Set it to reader-owned to give spinners an early - * indication that readers now have the lock. - */ - __rwsem_set_reader_owned(sem, waiter->task); - } - - /* - * Grant an infinite number of read locks to the readers at the front - * of the queue. We know that woken will be at least 1 as we accounted - * for above. Note we increment the 'active part' of the count by the - * number of readers before waking any processes up. - * - * We have to do wakeup in 2 passes to prevent the possibility that - * the reader count may be decremented before it is incremented. It - * is because the to-be-woken waiter may not have slept yet. So it - * may see waiter->task got cleared, finish its critical section and - * do an unlock before the reader count increment. - * - * 1) Collect the read-waiters in a separate list, count them and - * fully increment the reader count in rwsem. - * 2) For each waiters in the new list, clear waiter->task and - * put them into wake_q to be woken up later. - */ - list_for_each_entry(waiter, &sem->wait_list, list) { - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - break; - - woken++; - } - list_cut_before(&wlist, &sem->wait_list, &waiter->list); - - adjustment = woken * RWSEM_READER_BIAS - adjustment; - lockevent_cond_inc(rwsem_wake_reader, woken); - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_FLAG_WAITERS; - } - - if (adjustment) - atomic_long_add(adjustment, &sem->count); - - /* 2nd pass */ - list_for_each_entry_safe(waiter, tmp, &wlist, list) { - struct task_struct *tsk; - - tsk = waiter->task; - get_task_struct(tsk); - - /* - * Ensure calling get_task_struct() before setting the reader - * waiter to nil such that rwsem_down_read_failed() cannot - * race with do_exit() by always holding a reference count - * to the task to wakeup. - */ - smp_store_release(&waiter->task, NULL); - /* - * Ensure issuing the wakeup (either by us or someone else) - * after setting the reader waiter to nil. - */ - wake_q_add_safe(wake_q, tsk); - } -} - -/* - * This function must be called with the sem->wait_lock held to prevent - * race conditions between checking the rwsem wait list and setting the - * sem->count accordingly. - */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) -{ - long new; - - if (count & RWSEM_LOCK_MASK) - return false; - - new = count + RWSEM_WRITER_LOCKED - - (list_is_singular(&sem->wait_list) ? 
RWSEM_FLAG_WAITERS : 0); - - if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) { - rwsem_set_owner(sem); - return true; - } - - return false; -} - -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * Try to acquire write lock before the writer has been put on wait queue. - */ -static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) -{ - long count = atomic_long_read(&sem->count); - - while (!(count & RWSEM_LOCK_MASK)) { - if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, - count + RWSEM_WRITER_LOCKED)) { - rwsem_set_owner(sem); - lockevent_inc(rwsem_opt_wlock); - return true; - } - } - return false; -} - -static inline bool owner_on_cpu(struct task_struct *owner) -{ - /* - * As lock holder preemption issue, we both skip spinning if - * task is not on cpu or its cpu is preempted - */ - return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -} - -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner; - bool ret = true; - - BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); - - if (need_resched()) - return false; - - rcu_read_lock(); - owner = READ_ONCE(sem->owner); - if (owner) { - ret = is_rwsem_owner_spinnable(owner) && - owner_on_cpu(owner); - } - rcu_read_unlock(); - return ret; -} - -/* - * Return true only if we can still spin on the owner field of the rwsem. - */ -static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner = READ_ONCE(sem->owner); - - if (!is_rwsem_owner_spinnable(owner)) - return false; - - rcu_read_lock(); - while (owner && (READ_ONCE(sem->owner) == owner)) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking sem->owner still matches owner, if that fails, - * owner might point to free()d memory, if it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - /* - * abort spinning when need_resched or owner is not running or - * owner's cpu is preempted. - */ - if (need_resched() || !owner_on_cpu(owner)) { - rcu_read_unlock(); - return false; - } - - cpu_relax(); - } - rcu_read_unlock(); - - /* - * If there is a new owner or the owner is not set, we continue - * spinning. - */ - return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); -} - -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - bool taken = false; - - preempt_disable(); - - /* sem->wait_lock should not be held when doing optimistic spinning */ - if (!rwsem_can_spin_on_owner(sem)) - goto done; - - if (!osq_lock(&sem->osq)) - goto done; - - /* - * Optimistically spin on the owner field and attempt to acquire the - * lock whenever the owner changes. Spinning will be stopped when: - * 1) the owning writer isn't running; or - * 2) readers own the lock as we can't determine if they are - * actively running or not. - */ - while (rwsem_spin_on_owner(sem)) { - /* - * Try to acquire the lock - */ - if (rwsem_try_write_lock_unqueued(sem)) { - taken = true; - break; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!sem->owner && (need_resched() || rt_task(current))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. 
- */ - cpu_relax(); - } - osq_unlock(&sem->osq); -done: - preempt_enable(); - lockevent_cond_inc(rwsem_opt_fail, !taken); - return taken; -} -#else -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - return false; -} -#endif - -/* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ - long count, adjustment = -RWSEM_READER_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { - /* - * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_READER_BIAS has already been - * set in the count. - */ - if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { - raw_spin_unlock_irq(&sem->wait_lock); - rwsem_set_reader_owned(sem); - lockevent_inc(rwsem_rlock_fast); - return sem; - } - adjustment += RWSEM_FLAG_WAITERS; - } - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (!(count & RWSEM_LOCK_MASK) || - (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(state); - if (!waiter.task) - break; - if (signal_pending_state(state, current)) { - raw_spin_lock_irq(&sem->wait_lock); - if (waiter.task) - goto out_nolock; - raw_spin_unlock_irq(&sem->wait_lock); - break; - } - schedule(); - lockevent_inc(rwsem_sleep_reader); - } - - __set_current_state(TASK_RUNNING); - lockevent_inc(rwsem_rlock); - return sem; -out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - raw_spin_unlock_irq(&sem->wait_lock); - __set_current_state(TASK_RUNNING); - lockevent_inc(rwsem_rlock_fail); - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - -/* - * Wait until we successfully acquire the write lock - */ -static inline struct rw_semaphore * -__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) -{ - long count; - bool waiting = true; /* any queued threads before us */ - struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; - DEFINE_WAKE_Q(wake_q); - - /* do optimistic spinning and steal lock if possible */ - if (rwsem_optimistic_spin(sem)) - return sem; - - /* - * Optimistic spinning failed, proceed to the slowpath - * and block until we can acquire the sem. 
- */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - if (list_empty(&sem->wait_list)) - waiting = false; - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock */ - if (waiting) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and there are - * no active writers and some readers, the lock must be read - * owned; so we try to any read locks that were queued ahead - * of us. - */ - if (!(count & RWSEM_WRITER_MASK) && - (count & RWSEM_READER_MASK)) { - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); - - /* - * Reinitialize wake_q after use. - */ - wake_q_init(&wake_q); - } - - } else { - count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); - } - - /* wait until we successfully acquire the lock */ - set_current_state(state); - while (true) { - if (rwsem_try_write_lock(count, sem)) - break; - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. */ - do { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - count = atomic_long_read(&sem->count); - } while (count & RWSEM_LOCK_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - __set_current_state(TASK_RUNNING); - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - lockevent_inc(rwsem_wlock); - - return ret; - -out_nolock: - __set_current_state(TASK_RUNNING); - raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - lockevent_inc(rwsem_wlock_fail); - - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed_killable); - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -__visible -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_wake); - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -__visible -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - 
raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ccbf18f560ff..8317bcdf063b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -3,17 +3,901 @@ * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h + * + * Writer lock-stealing by Alex Shi + * and Michel Lespinasse + * + * Optimistic spinning by Tim Chen + * and Davidlohr Bueso . Based on mutexes. + * + * Rwsem count bit fields re-definition and rwsem rearchitecture + * by Waiman Long . */ #include #include #include +#include +#include #include +#include +#include #include #include #include #include "rwsem.h" +#include "lock_events.h" + +/* + * The least significant 2 bits of the owner value has the following + * meanings when set. + * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers + * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, + * i.e. the owner(s) cannot be readily determined. It can be reader + * owned or the owning writer is indeterminate. + * + * When a writer acquires a rwsem, it puts its task_struct pointer + * into the owner field. It is cleared after an unlock. + * + * When a reader acquires a rwsem, it will also puts its task_struct + * pointer into the owner field with both the RWSEM_READER_OWNED and + * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will + * largely be left untouched. So for a free or reader-owned rwsem, + * the owner value may contain information about the last reader that + * acquires the rwsem. The anonymous bit is set because that particular + * reader may or may not still own the lock. + * + * That information may be helpful in debugging cases where the system + * seems to hang on a reader owned rwsem especially if only one reader + * is involved. Ideally we would like to track all the readers that own + * a rwsem, but the overhead is simply too big. + */ +#define RWSEM_READER_OWNED (1UL << 0) +#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) + +#ifdef CONFIG_DEBUG_RWSEMS +# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ + if (!debug_locks_silent && \ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + #c, atomic_long_read(&(sem)->count), \ + (long)((sem)->owner), (long)current, \ + list_empty(&(sem)->wait_list) ? "" : "not ")) \ + debug_locks_off(); \ + } while (0) +#else +# define DEBUG_RWSEMS_WARN_ON(c, sem) +#endif + +/* + * The definition of the atomic counter in the semaphore: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bits 2-7 - reserved + * Bits 8-X - 24-bit (32-bit) or 56-bit reader count + * + * atomic_long_fetch_add() is used to obtain reader lock, whereas + * atomic_long_cmpxchg() will be used to obtain writer lock. 
+ */ +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_READER_SHIFT 8 +#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) +#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) +#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED +#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS) + +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + WRITE_ONCE(sem->owner, current); +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + WRITE_ONCE(sem->owner, NULL); +} + +/* + * The task_struct pointer of the last owning reader will be left in + * the owner field. + * + * Note that the owner value just indicates the task has owned the rwsem + * previously, it may not be the real owner or one of the real owners + * anymore when that field is examined, so take it with a grain of salt. + */ +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED + | RWSEM_ANONYMOUSLY_OWNED; + + WRITE_ONCE(sem->owner, (struct task_struct *)val); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + __rwsem_set_reader_owned(sem, current); +} + +/* + * Return true if the a rwsem waiter can spin on the rwsem's owner + * and steal the lock, i.e. the lock is not anonymously owned. + * N.B. !owner is considered spinnable. + */ +static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) +{ + return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); +} + +/* + * Return true if rwsem is owned by an anonymous writer or readers. + */ +static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) +{ + return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; +} + +#ifdef CONFIG_DEBUG_RWSEMS +/* + * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there + * is a task pointer in owner of a reader-owned rwsem, it will be the + * real owner or one of the real owners. The only exception is when the + * unlock is done by up_read_non_owner(). + */ +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ + unsigned long val = (unsigned long)current | RWSEM_READER_OWNED + | RWSEM_ANONYMOUSLY_OWNED; + if (READ_ONCE(sem->owner) == (struct task_struct *)val) + cmpxchg_relaxed((unsigned long *)&sem->owner, val, + RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); +} +#else +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ +} +#endif + +/* + * Guide to the rw_semaphore's count field. + * + * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned + * by a writer. + * + * The lock is owned by readers when + * (1) the RWSEM_WRITER_LOCKED isn't set in count, + * (2) some of the reader bits are set in count, and + * (3) the owner field has RWSEM_READ_OWNED bit set. + * + * Having some reader bits set is not enough to guarantee a readers owned + * lock as the readers may be in the process of backing out from the count + * and a writer has just released the lock. So another writer may steal + * the lock immediately after that. 
+ */ + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); + sem->owner = NULL; +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + osq_lock_init(&sem->osq); +#endif +} + +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must + * have been set. + * - there must be someone on the queue + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only marked woken if downgrading is false + */ +static void __rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) +{ + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; + + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. + */ + wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); + } + + return; + } + + /* + * Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. + */ + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_READER_BIAS; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); + if (unlikely(oldcount & RWSEM_WRITER_MASK)) { + atomic_long_sub(adjustment, &sem->count); + return; + } + /* + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock. + */ + __rwsem_set_reader_owned(sem, waiter->task); + } + + /* + * Grant an infinite number of read locks to the readers at the front + * of the queue. We know that woken will be at least 1 as we accounted + * for above. Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. 
So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. + * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. + */ + list_for_each_entry(waiter, &sem->wait_list, list) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) + break; + + woken++; + } + list_cut_before(&wlist, &sem->wait_list, &waiter->list); + + adjustment = woken * RWSEM_READER_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_FLAG_WAITERS; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + + tsk = waiter->task; + get_task_struct(tsk); + + /* + * Ensure calling get_task_struct() before setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. + */ + smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add_safe(wake_q, tsk); + } +} + +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */ +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +{ + long new; + + if (count & RWSEM_LOCK_MASK) + return false; + + new = count + RWSEM_WRITER_LOCKED - + (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0); + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) { + rwsem_set_owner(sem); + return true; + } + + return false; +} + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * Try to acquire write lock before the writer has been put on wait queue. + */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + while (!(count & RWSEM_LOCK_MASK)) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count + RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_wlock); + return true; + } + } + return false; +} + +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner; + bool ret = true; + + BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + + if (need_resched()) + return false; + + rcu_read_lock(); + owner = READ_ONCE(sem->owner); + if (owner) { + ret = is_rwsem_owner_spinnable(owner) && + owner_on_cpu(owner); + } + rcu_read_unlock(); + return ret; +} + +/* + * Return true only if we can still spin on the owner field of the rwsem. 
+ */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *owner = READ_ONCE(sem->owner); + + if (!is_rwsem_owner_spinnable(owner)) + return false; + + rcu_read_lock(); + while (owner && (READ_ONCE(sem->owner) == owner)) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking sem->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + /* + * abort spinning when need_resched or owner is not running or + * owner's cpu is preempted. + */ + if (need_resched() || !owner_on_cpu(owner)) { + rcu_read_unlock(); + return false; + } + + cpu_relax(); + } + rcu_read_unlock(); + + /* + * If there is a new owner or the owner is not set, we continue + * spinning. + */ + return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + bool taken = false; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!rwsem_can_spin_on_owner(sem)) + goto done; + + if (!osq_lock(&sem->osq)) + goto done; + + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock as we can't determine if they are + * actively running or not. + */ + while (rwsem_spin_on_owner(sem)) { + /* + * Try to acquire the lock + */ + if (rwsem_try_write_lock_unqueued(sem)) { + taken = true; + break; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!sem->owner && (need_resched() || rt_task(current))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + lockevent_cond_inc(rwsem_opt_fail, !taken); + return taken; +} +#else +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +{ + return false; +} +#endif + +/* + * Wait for the read lock to be granted + */ +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +{ + long count, adjustment = -RWSEM_READER_BIAS; + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_READER_BIAS has already been + * set in the count. + */ + if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { + raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); + return sem; + } + adjustment += RWSEM_FLAG_WAITERS; + } + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = atomic_long_add_return(adjustment, &sem->count); + + /* + * If there are no active locks, wake the front queued process(es). 
+ * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (!(count & RWSEM_LOCK_MASK) || + (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + + /* wait to be given the lock */ + while (true) { + set_current_state(state); + if (!waiter.task) + break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } + schedule(); + lockevent_inc(rwsem_sleep_reader); + } + + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); + return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + +/* + * Wait until we successfully acquire the write lock + */ +static inline struct rw_semaphore * +__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) +{ + long count; + bool waiting = true; /* any queued threads before us */ + struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; + DEFINE_WAKE_Q(wake_q); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_optimistic_spin(sem)) + return sem; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_WRITE; + + raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ + if (list_empty(&sem->wait_list)) + waiting = false; + + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock */ + if (waiting) { + count = atomic_long_read(&sem->count); + + /* + * If there were already threads queued before us and there are + * no active writers and some readers, the lock must be read + * owned; so we try to any read locks that were queued ahead + * of us. + */ + if (!(count & RWSEM_WRITER_MASK) && + (count & RWSEM_READER_MASK)) { + __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + + /* + * Reinitialize wake_q after use. + */ + wake_q_init(&wake_q); + } + + } else { + count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); + } + + /* wait until we successfully acquire the lock */ + set_current_state(state); + while (true) { + if (rwsem_try_write_lock(count, sem)) + break; + raw_spin_unlock_irq(&sem->wait_lock); + + /* Block until there are no active lockers. 
*/ + do { + if (signal_pending_state(state, current)) + goto out_nolock; + + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); + count = atomic_long_read(&sem->count); + } while (count & RWSEM_LOCK_MASK); + + raw_spin_lock_irq(&sem->wait_lock); + } + __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); + + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); + else + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); + + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_write_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_write_failed_killable); + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +__visible +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} +EXPORT_SYMBOL(rwsem_wake); + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +__visible +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} +EXPORT_SYMBOL(rwsem_downgrade_wake); + +/* + * lock for reading + */ +inline void __down_read(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, + &sem->count) & RWSEM_READ_FAILED_MASK)) { + rwsem_down_read_failed(sem); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED), sem); + } else { + rwsem_set_reader_owned(sem); + } +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, + &sem->count) & RWSEM_READ_FAILED_MASK)) { + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + /* + * Optimize for the case when the rwsem is not locked at all. 
+ */ + long tmp = RWSEM_UNLOCKED_VALUE; + + lockevent_inc(rwsem_rtrylock); + do { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_READER_BIAS)) { + rwsem_set_reader_owned(sem); + return 1; + } + } while (!(tmp & RWSEM_READ_FAILED_MASK)); + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, + RWSEM_WRITER_LOCKED))) + rwsem_down_write_failed(sem); + rwsem_set_owner(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, + RWSEM_WRITER_LOCKED))) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + rwsem_set_owner(sem); + return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + long tmp; + + lockevent_inc(rwsem_wtrylock); + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_WRITER_LOCKED); + if (tmp == RWSEM_UNLOCKED_VALUE) { + rwsem_set_owner(sem); + return true; + } + return false; +} + +/* + * unlock after reading + */ +inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); + rwsem_clear_reader_owned(sem); + tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) + == RWSEM_FLAG_WAITERS)) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + rwsem_clear_owner(sem); + if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, + &sem->count) & RWSEM_FLAG_WAITERS)) + rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. + */ + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + tmp = atomic_long_fetch_add_release( + -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); + rwsem_set_reader_owned(sem); + if (tmp & RWSEM_FLAG_WAITERS) + rwsem_downgrade_wake(sem); +} /* * lock for reading diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 499a9b2bda82..2534ce49f648 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,279 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * The least significant 2 bits of the owner value has the following - * meanings when set. - * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers - * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, - * i.e. the owner(s) cannot be readily determined. It can be reader - * owned or the owning writer is indeterminate. - * - * When a writer acquires a rwsem, it puts its task_struct pointer - * into the owner field. It is cleared after an unlock. - * - * When a reader acquires a rwsem, it will also puts its task_struct - * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will - * largely be left untouched. So for a free or reader-owned rwsem, - * the owner value may contain information about the last reader that - * acquires the rwsem. 
The anonymous bit is set because that particular - * reader may or may not still own the lock. - * - * That information may be helpful in debugging cases where the system - * seems to hang on a reader owned rwsem especially if only one reader - * is involved. Ideally we would like to track all the readers that own - * a rwsem, but the overhead is simply too big. - */ -#include "lock_events.h" -#define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) +#ifndef __INTERNAL_RWSEM_H +#define __INTERNAL_RWSEM_H +#include -#ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ - if (!debug_locks_silent && \ - WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ - #c, atomic_long_read(&(sem)->count), \ - (long)((sem)->owner), (long)current, \ - list_empty(&(sem)->wait_list) ? "" : "not ")) \ - debug_locks_off(); \ - } while (0) -#else -# define DEBUG_RWSEMS_WARN_ON(c, sem) -#endif +extern void __down_read(struct rw_semaphore *sem); +extern void __up_read(struct rw_semaphore *sem); -/* - * The definition of the atomic counter in the semaphore: - * - * Bit 0 - writer locked bit - * Bit 1 - waiters present bit - * Bits 2-7 - reserved - * Bits 8-X - 24-bit (32-bit) or 56-bit reader count - * - * atomic_long_fetch_add() is used to obtain reader lock, whereas - * atomic_long_cmpxchg() will be used to obtain writer lock. - */ -#define RWSEM_WRITER_LOCKED (1UL << 0) -#define RWSEM_FLAG_WAITERS (1UL << 1) -#define RWSEM_READER_SHIFT 8 -#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) -#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) -#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED -#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) -#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS) - -/* - * All writes to owner are protected by WRITE_ONCE() to make sure that - * store tearing can't happen as optimistic spinners may read and use - * the owner value concurrently without lock. Read from owner, however, - * may not need READ_ONCE() as long as the pointer value is only used - * for comparison and isn't being dereferenced. - */ -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, current); -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, NULL); -} - -/* - * The task_struct pointer of the last owning reader will be left in - * the owner field. - * - * Note that the owner value just indicates the task has owned the rwsem - * previously, it may not be the real owner or one of the real owners - * anymore when that field is examined, so take it with a grain of salt. - */ -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; - - WRITE_ONCE(sem->owner, (struct task_struct *)val); -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ - __rwsem_set_reader_owned(sem, current); -} - -/* - * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock, i.e. the lock is not anonymously owned. - * N.B. !owner is considered spinnable. - */ -static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) -{ - return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); -} - -/* - * Return true if rwsem is owned by an anonymous writer or readers. 
- */ -static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) -{ - return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; -} - -#ifdef CONFIG_DEBUG_RWSEMS -/* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). - */ -static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) -{ - unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; - if (READ_ONCE(sem->owner) == (struct task_struct *)val) - cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); -} -#else -static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) -{ -} -#endif - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, - &sem->count) & RWSEM_READ_FAILED_MASK)) { - rwsem_down_read_failed(sem); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); - } else { - rwsem_set_reader_owned(sem); - } -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, - &sem->count) & RWSEM_READ_FAILED_MASK)) { - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); - } else { - rwsem_set_reader_owned(sem); - } - return 0; -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - /* - * Optimize for the case when the rwsem is not locked at all. 
- */ - long tmp = RWSEM_UNLOCKED_VALUE; - - lockevent_inc(rwsem_rtrylock); - do { - if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_READER_BIAS)) { - rwsem_set_reader_owned(sem); - return 1; - } - } while (!(tmp & RWSEM_READ_FAILED_MASK)); - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, - RWSEM_WRITER_LOCKED))) - rwsem_down_write_failed(sem); - rwsem_set_owner(sem); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, - RWSEM_WRITER_LOCKED))) - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - rwsem_set_owner(sem); - return 0; -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - lockevent_inc(rwsem_wtrylock); - tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_WRITER_LOCKED); - if (tmp == RWSEM_UNLOCKED_VALUE) { - rwsem_set_owner(sem); - return true; - } - return false; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); - rwsem_clear_reader_owned(sem); - tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); - if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) - == RWSEM_FLAG_WAITERS)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); - rwsem_clear_owner(sem); - if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, - &sem->count) & RWSEM_FLAG_WAITERS)) - rwsem_wake(sem); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - /* - * When downgrading from exclusive to shared ownership, - * anything inside the write-locked region cannot leak - * into the read side. In contrast, anything in the - * read-locked region is ok to be re-ordered into the - * write side. As such, rely on RELEASE semantics. - */ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); - tmp = atomic_long_fetch_add_release( - -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); - rwsem_set_reader_owned(sem); - if (tmp & RWSEM_FLAG_WAITERS) - rwsem_downgrade_wake(sem); -} +#endif /* __INTERNAL_RWSEM_H */ -- cgit v1.2.3 From 6cef7ff6e43cbdb9fa8eb91eb9a6b25d45ae11e3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:04 -0400 Subject: locking/rwsem: Code cleanup after files merging After merging all the relevant rwsem code into one single file, there are a number of optimizations and cleanups that can be done: 1) Remove all the EXPORT_SYMBOL() calls for functions that are not accessed elsewhere. 2) Remove all the __visible tags as none of the functions will be called from assembly code anymore. 3) Make all the internal functions static. 4) Remove some unneeded blank lines. 5) Remove the intermediate rwsem_down_{read|write}_failed*() functions and rename __rwsem_down_{read|write}_failed_common() to rwsem_down_{read|write}_slowpath(). 6) Remove "__" prefix of __rwsem_mark_wake(). 7) Use atomic_long_try_cmpxchg_acquire() as much as possible. 8) Remove the rwsem_rtrylock and rwsem_wtrylock lock events as they are not that useful. That enables the compiler to do better optimization and reduce code size. 
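(A minimal user-space sketch of the try_cmpxchg pattern behind cleanup item 7 may help readers who don't have the kernel atomics API in their head. It is only an illustration: it uses C11 <stdatomic.h> rather than the kernel's atomic_long_*() helpers, the constants copy the bit layout defined in rwsem.c, and the function and variable names are invented for the example.)

/*
 * Hedged user-space model of the try_cmpxchg-based read trylock loop.
 * Not the kernel code: plain C11 atomics, illustrative names.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RWSEM_WRITER_LOCKED    (1UL << 0)
#define RWSEM_FLAG_WAITERS     (1UL << 1)
#define RWSEM_READER_SHIFT     8
#define RWSEM_READER_BIAS      (1UL << RWSEM_READER_SHIFT)
#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_LOCKED | RWSEM_FLAG_WAITERS)

static atomic_ulong count;              /* models sem->count, starts unlocked */

static bool down_read_trylock_model(void)
{
        unsigned long old = 0UL;        /* optimistically assume "unlocked" */

        do {
                /*
                 * On failure, compare_exchange writes the value it actually
                 * observed back into 'old', so the retry loop needs neither a
                 * separate reload nor an explicit comparison - the same
                 * property atomic_long_try_cmpxchg_acquire() provides.
                 */
                if (atomic_compare_exchange_strong_explicit(&count, &old,
                                        old + RWSEM_READER_BIAS,
                                        memory_order_acquire,
                                        memory_order_relaxed))
                        return true;    /* reader count bumped, lock taken */
        } while (!(old & RWSEM_READ_FAILED_MASK));

        return false;   /* a writer holds the lock or waiters are queued */
}

int main(void)
{
        printf("uncontended trylock: %d\n", down_read_trylock_model()); /* 1 */
        atomic_fetch_or(&count, RWSEM_WRITER_LOCKED);
        printf("writer bit set:      %d\n", down_read_trylock_model()); /* 0 */
        return 0;
}

The key point is that a failed compare-exchange hands back the value it saw, which is what lets the rewritten fast paths drop their extra loads and comparisons.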
The text+data size of rwsem.o on an x86-64 machine with gcc8 was reduced from 10237 bytes to 5030 bytes with this change. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-6-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 2 - kernel/locking/rwsem.c | 135 ++++++++++++-------------------------- 2 files changed, 42 insertions(+), 95 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index ad7668cfc9da..11187a1d40b8 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -61,7 +61,5 @@ LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ -LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ -LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 8317bcdf063b..f56329240ef1 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -205,7 +205,6 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, osq_lock_init(&sem->osq); #endif } - EXPORT_SYMBOL(__init_rwsem); enum rwsem_waiter_type { @@ -237,9 +236,9 @@ enum rwsem_wake_type { * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false */ -static void __rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, - struct wake_q_head *wake_q) +static void rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) { struct rwsem_waiter *waiter, *tmp; long oldcount, woken = 0, adjustment = 0; @@ -330,7 +329,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Ensure calling get_task_struct() before setting the reader - * waiter to nil such that rwsem_down_read_failed() cannot + * waiter to nil such that rwsem_down_read_slowpath() cannot * race with do_exit() by always holding a reference count * to the task to wakeup. 
*/ @@ -516,8 +515,8 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) /* * Wait for the read lock to be granted */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +static struct rw_semaphore __sched * +rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_READER_BIAS; struct rwsem_waiter waiter; @@ -555,7 +554,7 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) */ if (!(count & RWSEM_LOCK_MASK) || (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); @@ -589,25 +588,11 @@ out_nolock: return ERR_PTR(-EINTR); } -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - /* * Wait until we successfully acquire the write lock */ -static inline struct rw_semaphore * -__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) +static struct rw_semaphore * +rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; bool waiting = true; /* any queued threads before us */ @@ -646,7 +631,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) */ if (!(count & RWSEM_WRITER_MASK) && (count & RWSEM_READER_MASK)) { - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock * is released, but given that we are proactively waking @@ -700,7 +685,7 @@ out_nolock: if (list_empty(&sem->wait_list)) atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); else - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); lockevent_inc(rwsem_wlock_fail); @@ -708,26 +693,11 @@ out_nolock: return ERR_PTR(-EINTR); } -__visible struct rw_semaphore * __sched -rwsem_down_write_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed_killable); - /* * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -__visible -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -735,22 +705,20 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); return sem; } -EXPORT_SYMBOL(rwsem_wake); /* * downgrade a write lock into a read lock * - caller incremented waiting part of count 
and discovered it still negative * - just wake up any readers at the front of the queue */ -__visible -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -758,14 +726,13 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); return sem; } -EXPORT_SYMBOL(rwsem_downgrade_wake); /* * lock for reading @@ -774,7 +741,7 @@ inline void __down_read(struct rw_semaphore *sem) { if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count) & RWSEM_READ_FAILED_MASK)) { - rwsem_down_read_failed(sem); + rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); } else { @@ -786,7 +753,7 @@ static inline int __down_read_killable(struct rw_semaphore *sem) { if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count) & RWSEM_READ_FAILED_MASK)) { - if (IS_ERR(rwsem_down_read_failed_killable(sem))) + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); @@ -803,7 +770,6 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ long tmp = RWSEM_UNLOCKED_VALUE; - lockevent_inc(rwsem_rtrylock); do { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, tmp + RWSEM_READER_BIAS)) { @@ -819,30 +785,33 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write(struct rw_semaphore *sem) { - if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, - RWSEM_WRITER_LOCKED))) - rwsem_down_write_failed(sem); + long tmp = RWSEM_UNLOCKED_VALUE; + + if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED))) + rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) { - if (unlikely(atomic_long_cmpxchg_acquire(&sem->count, 0, - RWSEM_WRITER_LOCKED))) - if (IS_ERR(rwsem_down_write_failed_killable(sem))) + long tmp = RWSEM_UNLOCKED_VALUE; + + if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED))) { + if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) return -EINTR; + } rwsem_set_owner(sem); return 0; } static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp; + long tmp = RWSEM_UNLOCKED_VALUE; - lockevent_inc(rwsem_wtrylock); - tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_WRITER_LOCKED); - if (tmp == RWSEM_UNLOCKED_VALUE) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); return true; } @@ -856,12 +825,11 @@ inline void __up_read(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); - if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) - == RWSEM_FLAG_WAITERS)) + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == + RWSEM_FLAG_WAITERS)) 
rwsem_wake(sem); } @@ -870,10 +838,12 @@ inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { + long tmp; + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); rwsem_clear_owner(sem); - if (unlikely(atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, - &sem->count) & RWSEM_FLAG_WAITERS)) + tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + if (unlikely(tmp & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } @@ -909,7 +879,6 @@ void __sched down_read(struct rw_semaphore *sem) LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } - EXPORT_SYMBOL(down_read); int __sched down_read_killable(struct rw_semaphore *sem) @@ -924,7 +893,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) return 0; } - EXPORT_SYMBOL(down_read_killable); /* @@ -938,7 +906,6 @@ int down_read_trylock(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); return ret; } - EXPORT_SYMBOL(down_read_trylock); /* @@ -948,10 +915,8 @@ void __sched down_write(struct rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(down_write); /* @@ -962,14 +927,14 @@ int __sched down_write_killable(struct rw_semaphore *sem) might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } return 0; } - EXPORT_SYMBOL(down_write_killable); /* @@ -984,7 +949,6 @@ int down_write_trylock(struct rw_semaphore *sem) return ret; } - EXPORT_SYMBOL(down_write_trylock); /* @@ -993,10 +957,8 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - __up_read(sem); } - EXPORT_SYMBOL(up_read); /* @@ -1005,10 +967,8 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - __up_write(sem); } - EXPORT_SYMBOL(up_write); /* @@ -1017,10 +977,8 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - __downgrade_write(sem); } - EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1029,40 +987,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } - EXPORT_SYMBOL(down_read_nested); void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); - __down_read(sem); __rwsem_set_reader_owned(sem, NULL); } - EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) @@ -1070,14 +1020,14 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, 
__down_write_trylock, __down_write_killable)) { + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } return 0; } - EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) @@ -1086,7 +1036,6 @@ void up_read_non_owner(struct rw_semaphore *sem) sem); __up_read(sem); } - EXPORT_SYMBOL(up_read_non_owner); #endif -- cgit v1.2.3 From 3f6d517a3ece6e6ced7abcbe798ff332ac5ca586 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:05 -0400 Subject: locking/rwsem: Make rwsem_spin_on_owner() return owner state This patch modifies rwsem_spin_on_owner() to return four possible values to better reflect the state of lock holder which enables us to make a better decision of what to do next. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-7-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 65 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 18 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f56329240ef1..8d0f2acfe13d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -414,17 +414,54 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) } /* - * Return true only if we can still spin on the owner field of the rwsem. + * The rwsem_spin_on_owner() function returns the folowing 4 values + * depending on the lock owner state. + * OWNER_NULL : owner is currently NULL + * OWNER_WRITER: when owner changes and is a writer + * OWNER_READER: when owner changes and the new owner may be a reader. + * OWNER_NONSPINNABLE: + * when optimistic spinning has to stop because either the + * owner stops running, is unknown, or its timeslice has + * been used up. */ -static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) +enum owner_state { + OWNER_NULL = 1 << 0, + OWNER_WRITER = 1 << 1, + OWNER_READER = 1 << 2, + OWNER_NONSPINNABLE = 1 << 3, +}; +#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER) + +static inline enum owner_state rwsem_owner_state(unsigned long owner) { - struct task_struct *owner = READ_ONCE(sem->owner); + if (!owner) + return OWNER_NULL; - if (!is_rwsem_owner_spinnable(owner)) - return false; + if (owner & RWSEM_ANONYMOUSLY_OWNED) + return OWNER_NONSPINNABLE; + + if (owner & RWSEM_READER_OWNED) + return OWNER_READER; + + return OWNER_WRITER; +} + +static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) +{ + struct task_struct *tmp, *owner = READ_ONCE(sem->owner); + enum owner_state state = rwsem_owner_state((unsigned long)owner); + + if (state != OWNER_WRITER) + return state; rcu_read_lock(); - while (owner && (READ_ONCE(sem->owner) == owner)) { + for (;;) { + tmp = READ_ONCE(sem->owner); + if (tmp != owner) { + state = rwsem_owner_state((unsigned long)tmp); + break; + } + /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, @@ -433,24 +470,16 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) */ barrier(); - /* - * abort spinning when need_resched or owner is not running or - * owner's cpu is preempted. 
- */ if (need_resched() || !owner_on_cpu(owner)) { - rcu_read_unlock(); - return false; + state = OWNER_NONSPINNABLE; + break; } cpu_relax(); } rcu_read_unlock(); - /* - * If there is a new owner or the owner is not set, we continue - * spinning. - */ - return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); + return state; } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) @@ -473,7 +502,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * 2) readers own the lock as we can't determine if they are * actively running or not. */ - while (rwsem_spin_on_owner(sem)) { + while (rwsem_spin_on_owner(sem) & OWNER_SPINNABLE) { /* * Try to acquire the lock */ -- cgit v1.2.3 From 4f23dbc1e657951e5d94c60369bc1db065961fb3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:06 -0400 Subject: locking/rwsem: Implement lock handoff to prevent lock starvation Because of writer lock stealing, it is possible that a constant stream of incoming writers will cause a waiting writer or reader to wait indefinitely leading to lock starvation. This patch implements a lock handoff mechanism to disable lock stealing and force lock handoff to the first waiter or waiters (for readers) in the queue after at least a 4ms waiting period unless it is a RT writer task which doesn't need to wait. The waiting period is used to avoid discouraging lock stealing too much to affect performance. The setting and clearing of the handoff bit is serialized by the wait_lock. So racing is not possible. A rwsem microbenchmark was run for 5 seconds on a 2-socket 40-core 80-thread Skylake system with a v5.1 based kernel and 240 write_lock threads with 5us sleep critical section. Before the patch, the min/mean/max numbers of locking operations for the locking threads were 1/7,792/173,696. After the patch, the figures became 5,842/6,542/7,458. It can be seen that the rwsem became much more fair, though there was a drop of about 16% in the mean locking operations done which was a tradeoff of having better fairness. Making the waiter set the handoff bit right after the first wakeup can impact performance especially with a mixed reader/writer workload. With the same microbenchmark with short critical section and equal number of reader and writer threads (40/40), the reader/writer locking operation counts with the current patch were: 40 readers, Iterations Min/Mean/Max = 1,793/1,794/1,796 40 writers, Iterations Min/Mean/Max = 1,793/34,956/86,081 By making waiter set handoff bit immediately after wakeup: 40 readers, Iterations Min/Mean/Max = 43/44/46 40 writers, Iterations Min/Mean/Max = 43/1,263/3,191 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-8-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 2 + kernel/locking/rwsem.c | 225 +++++++++++++++++++++++++++++--------- 2 files changed, 173 insertions(+), 54 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 11187a1d40b8..634b47fd8b5e 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -61,5 +61,7 @@ LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ +LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ +LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 8d0f2acfe13d..decda9fb8c6d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -10,8 +10,9 @@ * Optimistic spinning by Tim Chen * and Davidlohr Bueso . Based on mutexes. * - * Rwsem count bit fields re-definition and rwsem rearchitecture - * by Waiman Long . + * Rwsem count bit fields re-definition and rwsem rearchitecture by + * Waiman Long and + * Peter Zijlstra . */ #include @@ -74,20 +75,33 @@ * * Bit 0 - writer locked bit * Bit 1 - waiters present bit - * Bits 2-7 - reserved + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved * Bits 8-X - 24-bit (32-bit) or 56-bit reader count * * atomic_long_fetch_add() is used to obtain reader lock, whereas * atomic_long_cmpxchg() will be used to obtain writer lock. + * + * There are three places where the lock handoff bit may be set or cleared. + * 1) rwsem_mark_wake() for readers. + * 2) rwsem_try_write_lock() for writers. + * 3) Error path of rwsem_down_write_slowpath(). + * + * For all the above cases, wait_lock will be held. A writer must also + * be the first one in the wait_list to be eligible for setting the handoff + * bit. So concurrent setting/clearing of handoff bit is not possible. 
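+ *
+ * Note that an ordinary waiter only requests the handoff after it has
+ * been queued for at least RWSEM_WAIT_TIMEOUT (DIV_ROUND_UP(HZ, 250),
+ * i.e. 4 jiffies at HZ=1000 and 1 jiffy at HZ=250); an RT writer at the
+ * head of the queue does not have to wait out this period.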
*/ #define RWSEM_WRITER_LOCKED (1UL << 0) #define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_FLAG_HANDOFF (1UL << 2) + #define RWSEM_READER_SHIFT 8 #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) #define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) -#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ + RWSEM_FLAG_HANDOFF) /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -216,7 +230,10 @@ struct rwsem_waiter { struct list_head list; struct task_struct *task; enum rwsem_waiter_type type; + unsigned long timeout; }; +#define rwsem_first_waiter(sem) \ + list_first_entry(&sem->wait_list, struct rwsem_waiter, list) enum rwsem_wake_type { RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ @@ -224,6 +241,19 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; +enum writer_wait_state { + WRITER_NOT_FIRST, /* Writer is not first in wait list */ + WRITER_FIRST, /* Writer is first in wait list */ + WRITER_HANDOFF /* Writer is first & handoff needed */ +}; + +/* + * The typical HZ value is either 250 or 1000. So set the minimum waiting + * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait + * queue before initiating the handoff protocol. + */ +#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -244,11 +274,13 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, long oldcount, woken = 0, adjustment = 0; struct list_head wlist; + lockdep_assert_held(&sem->wait_lock); + /* * Take a peek at the queue head waiter such that we can determine * the wakeup(s) to perform. */ - waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); + waiter = rwsem_first_waiter(sem); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { @@ -275,7 +307,18 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment = RWSEM_READER_BIAS; oldcount = atomic_long_fetch_add(adjustment, &sem->count); if (unlikely(oldcount & RWSEM_WRITER_MASK)) { - atomic_long_sub(adjustment, &sem->count); + /* + * When we've been waiting "too" long (for writers + * to give up the lock), request a HANDOFF to + * force the issue. + */ + if (!(oldcount & RWSEM_FLAG_HANDOFF) && + time_after(jiffies, waiter->timeout)) { + adjustment -= RWSEM_FLAG_HANDOFF; + lockevent_inc(rwsem_rlock_handoff); + } + + atomic_long_add(-adjustment, &sem->count); return; } /* @@ -317,6 +360,13 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment -= RWSEM_FLAG_WAITERS; } + /* + * When we've woken a reader, we no longer need to force writers + * to give up the lock and we can clear HANDOFF. + */ + if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) + adjustment -= RWSEM_FLAG_HANDOFF; + if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -346,23 +396,48 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. + * + * If wstate is WRITER_HANDOFF, it will make sure that either the handoff + * bit is set or the lock is acquired with handoff bit cleared. 
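+ *
+ * Whenever this function does change sem->count, it does so with one
+ * atomic cmpxchg, so the lock and handoff bits are never observed to
+ * change independently of each other.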
*/ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) +static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem, + enum writer_wait_state wstate) { long new; - if (count & RWSEM_LOCK_MASK) - return false; + lockdep_assert_held(&sem->wait_lock); - new = count + RWSEM_WRITER_LOCKED - - (list_is_singular(&sem->wait_list) ? RWSEM_FLAG_WAITERS : 0); + do { + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)) { - rwsem_set_owner(sem); - return true; - } + if (has_handoff && wstate == WRITER_NOT_FIRST) + return false; - return false; + new = count; + + if (count & RWSEM_LOCK_MASK) { + if (has_handoff || (wstate != WRITER_HANDOFF)) + return false; + + new |= RWSEM_FLAG_HANDOFF; + } else { + new |= RWSEM_WRITER_LOCKED; + new &= ~RWSEM_FLAG_HANDOFF; + + if (list_is_singular(&sem->wait_list)) + new &= ~RWSEM_FLAG_WAITERS; + } + } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); + + /* + * We have either acquired the lock with handoff bit cleared or + * set the handoff bit. + */ + if (new & RWSEM_FLAG_HANDOFF) + return false; + + rwsem_set_owner(sem); + return true; } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -373,9 +448,9 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { long count = atomic_long_read(&sem->count); - while (!(count & RWSEM_LOCK_MASK)) { + while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, - count + RWSEM_WRITER_LOCKED)) { + count | RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); lockevent_inc(rwsem_opt_wlock); return true; @@ -456,6 +531,11 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); for (;;) { + if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { + state = OWNER_NONSPINNABLE; + break; + } + tmp = READ_ONCE(sem->owner); if (tmp != owner) { state = rwsem_owner_state((unsigned long)tmp); @@ -553,16 +633,18 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { /* * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_READER_BIAS has already been - * set in the count. + * by a writer or has the handoff bit set, this reader can + * exit the slowpath and return immediately as its + * RWSEM_READER_BIAS has already been set in the count. 
*/ - if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { + if (!(atomic_long_read(&sem->count) & + (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); @@ -609,8 +691,10 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) return sem; out_nolock: list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); + if (list_empty(&sem->wait_list)) { + atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, + &sem->count); + } raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -624,7 +708,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - bool waiting = true; /* any queued threads before us */ + enum writer_wait_state wstate; struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); @@ -639,66 +723,95 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) */ waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; raw_spin_lock_irq(&sem->wait_lock); /* account for this before adding a new element to the list */ - if (list_empty(&sem->wait_list)) - waiting = false; + wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock */ - if (waiting) { + if (wstate == WRITER_NOT_FIRST) { count = atomic_long_read(&sem->count); /* - * If there were already threads queued before us and there are - * no active writers and some readers, the lock must be read - * owned; so we try to any read locks that were queued ahead - * of us. + * If there were already threads queued before us and: + * 1) there are no no active locks, wake the front + * queued process(es) as the handoff bit might be set. + * 2) there are no active writers and some readers, the lock + * must be read owned; so we try to wake any read lock + * waiters that were queued ahead of us. */ - if (!(count & RWSEM_WRITER_MASK) && - (count & RWSEM_READER_MASK)) { - rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); + if (count & RWSEM_WRITER_MASK) + goto wait; - /* - * Reinitialize wake_q after use. - */ - wake_q_init(&wake_q); - } + rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) + ? RWSEM_WAKE_READERS + : RWSEM_WAKE_ANY, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + + /* We need wake_q again below, reinitialize */ + wake_q_init(&wake_q); } else { count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); } +wait: /* wait until we successfully acquire the lock */ set_current_state(state); while (true) { - if (rwsem_try_write_lock(count, sem)) + if (rwsem_try_write_lock(count, sem, wstate)) break; + raw_spin_unlock_irq(&sem->wait_lock); /* Block until there are no active lockers. 
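+	 * With the handoff mechanism this is no longer strictly true: once
+	 * this waiter has requested a handoff (wstate == WRITER_HANDOFF), it
+	 * breaks out of the sleep loop right after the next wakeup and
+	 * retries rwsem_try_write_lock(), which either takes the lock or
+	 * sets the handoff bit in sem->count.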
*/ - do { + for (;;) { if (signal_pending_state(state, current)) goto out_nolock; schedule(); lockevent_inc(rwsem_sleep_writer); set_current_state(state); + /* + * If HANDOFF bit is set, unconditionally do + * a trylock. + */ + if (wstate == WRITER_HANDOFF) + break; + + if ((wstate == WRITER_NOT_FIRST) && + (rwsem_first_waiter(sem) == &waiter)) + wstate = WRITER_FIRST; + count = atomic_long_read(&sem->count); - } while (count & RWSEM_LOCK_MASK); + if (!(count & RWSEM_LOCK_MASK)) + break; + + /* + * The setting of the handoff bit is deferred + * until rwsem_try_write_lock() is called. + */ + if ((wstate == WRITER_FIRST) && (rt_task(current) || + time_after(jiffies, waiter.timeout))) { + wstate = WRITER_HANDOFF; + lockevent_inc(rwsem_wlock_handoff); + break; + } + } raw_spin_lock_irq(&sem->wait_lock); + count = atomic_long_read(&sem->count); } __set_current_state(TASK_RUNNING); list_del(&waiter.list); @@ -711,6 +824,10 @@ out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); + + if (unlikely(wstate == WRITER_HANDOFF)) + atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); + if (list_empty(&sem->wait_list)) atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); else @@ -726,7 +843,7 @@ out_nolock: * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ -static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) { unsigned long flags; DEFINE_WAKE_Q(wake_q); @@ -859,7 +976,7 @@ inline void __up_read(struct rw_semaphore *sem) tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) - rwsem_wake(sem); + rwsem_wake(sem, tmp); } /* @@ -873,7 +990,7 @@ static inline void __up_write(struct rw_semaphore *sem) rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) - rwsem_wake(sem); + rwsem_wake(sem, tmp); } /* -- cgit v1.2.3 From 00f3c5a3df2c1e3dab14d0dd2b71f852d46be97f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:07 -0400 Subject: locking/rwsem: Always release wait_lock before waking up tasks With the use of wake_q, we can do task wakeups without holding the wait_lock. There is one exception in the rwsem code, though. It is when the writer in the slowpath detects that there are waiters ahead but the rwsem is not held by a writer. This can lead to a long wait_lock hold time especially when a large number of readers are to be woken up. Remediate this situation by releasing the wait_lock before waking up tasks and re-acquiring it afterward. The rwsem_try_write_lock() function is also modified to read the rwsem count directly to avoid stale count value. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-9-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index decda9fb8c6d..5532304406f7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -400,13 +400,14 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * If wstate is WRITER_HANDOFF, it will make sure that either the handoff * bit is set or the lock is acquired with handoff bit cleared. */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem, +static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, enum writer_wait_state wstate) { - long new; + long count, new; lockdep_assert_held(&sem->wait_lock); + count = atomic_long_read(&sem->count); do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); @@ -751,26 +752,25 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) ? RWSEM_WAKE_READERS : RWSEM_WAKE_ANY, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); - - /* We need wake_q again below, reinitialize */ - wake_q_init(&wake_q); + if (!wake_q_empty(&wake_q)) { + /* + * We want to minimize wait_lock hold time especially + * when a large number of readers are to be woken up. + */ + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + wake_q_init(&wake_q); /* Used again, reinit */ + raw_spin_lock_irq(&sem->wait_lock); + } } else { - count = atomic_long_add_return(RWSEM_FLAG_WAITERS, &sem->count); + atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } wait: /* wait until we successfully acquire the lock */ set_current_state(state); while (true) { - if (rwsem_try_write_lock(count, sem, wstate)) + if (rwsem_try_write_lock(sem, wstate)) break; raw_spin_unlock_irq(&sem->wait_lock); @@ -811,7 +811,6 @@ wait: } raw_spin_lock_irq(&sem->wait_lock); - count = atomic_long_read(&sem->count); } __set_current_state(TASK_RUNNING); list_del(&waiter.list); -- cgit v1.2.3 From 990fa7384a3057a3298bcf493651c6e14416c47c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:08 -0400 Subject: locking/rwsem: More optimal RT task handling of null owner An RT task can do optimistic spinning only if the lock holder is actually running. If the state of the lock holder isn't known, there is a possibility that high priority of the RT task may block forward progress of the lock holder if it happens to reside on the same CPU. This will lead to deadlock. So we have to make sure that an RT task will not spin on a reader-owned rwsem. When the owner is temporarily set to NULL, there are two cases where we may want to continue spinning: 1) The lock owner is in the process of releasing the lock, sem->owner is cleared but the lock has not been released yet. 2) The lock was free and owner cleared, but another task just comes in and acquire the lock before we try to get it. The new owner may be a spinnable writer. So an RT task is now made to retry one more time to see if it can acquire the lock or continue spinning on the new owning writer. 
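Schematically, the optimistic-spin loop after this change makes the decision as in the condensed sketch below (taken/osq bookkeeping and lock-event counters omitted; names as in the patch):

	int prev_owner_state = OWNER_NULL;

	for (;;) {
		enum owner_state owner_state = rwsem_spin_on_owner(sem);

		if (!(owner_state & OWNER_SPINNABLE))
			break;

		if (rwsem_try_write_lock_unqueued(sem))
			break;			/* lock acquired */

		if (owner_state != OWNER_WRITER) {
			if (need_resched())
				break;
			/* RT: allow one pass only right after seeing a writer */
			if (rt_task(current) &&
			    prev_owner_state != OWNER_WRITER)
				break;
		}
		prev_owner_state = owner_state;

		cpu_relax();
	}

The effect is that a single NULL-owner observation made right after a writer was seen no longer ends the spin for an RT task.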
When testing on a 8-socket IvyBridge-EX system, the one additional retry seems to improve locking performance of RT write locking threads under heavy contentions. The table below shows the locking rates (in kops/s) with various write locking threads before and after the patch. Locking threads Pre-patch Post-patch --------------- --------- ----------- 4 2,753 2,608 8 2,529 2,520 16 1,727 1,918 32 1,263 1,956 64 889 1,343 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-10-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 51 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 5532304406f7..e1840b7c5310 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -566,6 +566,7 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { bool taken = false; + int prev_owner_state = OWNER_NULL; preempt_disable(); @@ -583,7 +584,12 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * 2) readers own the lock as we can't determine if they are * actively running or not. */ - while (rwsem_spin_on_owner(sem) & OWNER_SPINNABLE) { + for (;;) { + enum owner_state owner_state = rwsem_spin_on_owner(sem); + + if (!(owner_state & OWNER_SPINNABLE)) + break; + /* * Try to acquire the lock */ @@ -593,13 +599,44 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) } /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. + * An RT task cannot do optimistic spinning if it cannot + * be sure the lock holder is running or live-lock may + * happen if the current task and the lock holder happen + * to run in the same CPU. However, aborting optimistic + * spinning while a NULL owner is detected may miss some + * opportunity where spinning can continue without causing + * problem. + * + * There are 2 possible cases where an RT task may be able + * to continue spinning. + * + * 1) The lock owner is in the process of releasing the + * lock, sem->owner is cleared but the lock has not + * been released yet. + * 2) The lock was free and owner cleared, but another + * task just comes in and acquire the lock before + * we try to get it. The new owner may be a spinnable + * writer. + * + * To take advantage of two scenarios listed agove, the RT + * task is made to retry one more time to see if it can + * acquire the lock or continue spinning on the new owning + * writer. Of course, if the time lag is long enough or the + * new owner is not a writer or spinnable, the RT task will + * quit spinning. + * + * If the owner is a writer, the need_resched() check is + * done inside rwsem_spin_on_owner(). If the owner is not + * a writer, need_resched() check needs to be done here. 
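+	 *
+	 * The prev_owner_state check below implements this: an RT task keeps
+	 * spinning across a NULL-owner observation only when the immediately
+	 * preceding observation was OWNER_WRITER.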
*/ - if (!sem->owner && (need_resched() || rt_task(current))) - break; + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; + if (rt_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } + prev_owner_state = owner_state; /* * The cpu_relax() call is a compiler barrier which forces -- cgit v1.2.3 From d3681e269fff84048c94012342c3434b227c4706 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:09 -0400 Subject: locking/rwsem: Wake up almost all readers in wait queue When the front of the wait queue is a reader, other readers immediately following the first reader will also be woken up at the same time. However, if there is a writer in between. Those readers behind the writer will not be woken up. Because of optimistic spinning, the lock acquisition order is not FIFO anyway. The lock handoff mechanism will ensure that lock starvation will not happen. Assuming that the lock hold times of the other readers still in the queue will be about the same as the readers that are being woken up, there is really not much additional cost other than the additional latency due to the wakeup of additional tasks by the waker. Therefore all the readers up to a maximum of 256 in the queue are woken up when the first waiter is a reader to improve reader throughput. This is somewhat similar in concept to a phase-fair R/W lock. With a locking microbenchmark running on 5.1 based kernel, the total locking rates (in kops/s) on a 8-socket IvyBridge-EX system with equal numbers of readers and writers before and after this patch were as follows: # of Threads Pre-Patch Post-patch ------------ --------- ---------- 4 1,641 1,674 8 731 1,062 16 564 924 32 78 300 64 38 195 240 50 149 There is no performance gain at low contention level. At high contention level, however, this patch gives a pretty decent performance boost. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-11-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e1840b7c5310..ded96023f4dc 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -254,6 +254,14 @@ enum writer_wait_state { */ #define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) +/* + * Magic number to batch-wakeup waiting readers, even when writers are + * also present in the queue. This both limits the amount of work the + * waking thread must do and also prevents any potential counter overflow, + * however unlikely. + */ +#define MAX_READERS_WAKEUP 0x100 + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -329,11 +337,17 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, } /* - * Grant an infinite number of read locks to the readers at the front - * of the queue. We know that woken will be at least 1 as we accounted + * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the + * queue. We know that the woken will be at least 1 as we accounted * for above. Note we increment the 'active part' of the count by the * number of readers before waking any processes up. 
* + * This is an adaptation of the phase-fair R/W locks where at the + * reader phase (first waiter is a reader), all readers are eligible + * to acquire the lock at the same time irrespective of their order + * in the queue. The writers acquire the lock according to their + * order in the queue. + * * We have to do wakeup in 2 passes to prevent the possibility that * the reader count may be decremented before it is incremented. It * is because the to-be-woken waiter may not have slept yet. So it @@ -345,13 +359,20 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * 2) For each waiters in the new list, clear waiter->task and * put them into wake_q to be woken up later. */ - list_for_each_entry(waiter, &sem->wait_list, list) { + INIT_LIST_HEAD(&wlist); + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { if (waiter->type == RWSEM_WAITING_FOR_WRITE) - break; + continue; woken++; + list_move_tail(&waiter->list, &wlist); + + /* + * Limit # of readers that can be woken up per wakeup call. + */ + if (woken >= MAX_READERS_WAKEUP) + break; } - list_cut_before(&wlist, &sem->wait_list, &waiter->list); adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); -- cgit v1.2.3 From 02f1082b003a0cd48f48f12533d969cdbf1c2b63 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:10 -0400 Subject: locking/rwsem: Clarify usage of owner's nonspinaable bit Bit 1 of sem->owner (RWSEM_ANONYMOUSLY_OWNED) is used to designate an anonymous owner - readers or an anonymous writer. The setting of this anonymous bit is used as an indicator that optimistic spinning cannot be done on this rwsem. With the upcoming reader optimistic spinning patches, a reader-owned rwsem can be spinned on for a limit period of time. We still need this bit to indicate a rwsem is nonspinnable, but not setting this bit loses its meaning that the owner is known. So rename the bit to RWSEM_NONSPINNABLE to clarify its meaning. This patch also fixes a DEBUG_RWSEMS_WARN_ON() bug in __up_write(). Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-12-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ded96023f4dc..180455b6b0d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -33,17 +33,18 @@ /* * The least significant 2 bits of the owner value has the following * meanings when set. - * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers - * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, - * i.e. the owner(s) cannot be readily determined. It can be reader - * owned or the owning writer is indeterminate. + * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers + * - Bit 1: RWSEM_NONSPINNABLE - Waiters cannot spin on the rwsem + * The rwsem is anonymously owned, i.e. the owner(s) cannot be + * readily determined. It can be reader owned or the owning writer + * is indeterminate. * * When a writer acquires a rwsem, it puts its task_struct pointer * into the owner field. It is cleared after an unlock. 
* * When a reader acquires a rwsem, it will also puts its task_struct * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will + * RWSEM_NONSPINNABLE bits set. On unlock, the owner field will * largely be left untouched. So for a free or reader-owned rwsem, * the owner value may contain information about the last reader that * acquires the rwsem. The anonymous bit is set because that particular @@ -55,7 +56,8 @@ * a rwsem, but the overhead is simply too big. */ #define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) +#define RWSEM_NONSPINNABLE (1UL << 1) +#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ @@ -132,7 +134,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; + | RWSEM_NONSPINNABLE; WRITE_ONCE(sem->owner, (struct task_struct *)val); } @@ -144,20 +146,12 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) /* * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock, i.e. the lock is not anonymously owned. + * and steal the lock. * N.B. !owner is considered spinnable. */ static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) { - return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); -} - -/* - * Return true if rwsem is owned by an anonymous writer or readers. - */ -static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) -{ - return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; + return !((unsigned long)owner & RWSEM_NONSPINNABLE); } #ifdef CONFIG_DEBUG_RWSEMS @@ -170,10 +164,10 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; + | RWSEM_NONSPINNABLE; if (READ_ONCE(sem->owner) == (struct task_struct *)val) cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); + RWSEM_READER_OWNED | RWSEM_NONSPINNABLE); } #else static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) @@ -495,7 +489,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) struct task_struct *owner; bool ret = true; - BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN)); if (need_resched()) return false; @@ -534,7 +528,7 @@ static inline enum owner_state rwsem_owner_state(unsigned long owner) if (!owner) return OWNER_NULL; - if (owner & RWSEM_ANONYMOUSLY_OWNED) + if (owner & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; if (owner & RWSEM_READER_OWNED) @@ -1043,7 +1037,12 @@ static inline void __up_write(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + /* + * sem->owner may differ from current if the ownership is transferred + * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. 
+ */ + DEBUG_RWSEMS_WARN_ON((sem->owner != current) && + !((long)sem->owner & RWSEM_NONSPINNABLE), sem); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) -- cgit v1.2.3 From cf69482d62d996d3ce840eeead8e160de281ac6c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:11 -0400 Subject: locking/rwsem: Enable readers spinning on writer This patch enables readers to optimistically spin on a rwsem when it is owned by a writer instead of going to sleep directly. The rwsem_can_spin_on_owner() function is extracted out of rwsem_optimistic_spin() and is called directly by rwsem_down_read_slowpath() and rwsem_down_write_slowpath(). With a locking microbenchmark running on 5.1 based kernel, the total locking rates (in kops/s) on a 8-socket IvyBrige-EX system with equal numbers of readers and writers before and after the patch were as follows: # of Threads Pre-patch Post-patch ------------ --------- ---------- 4 1,674 1,684 8 1,062 1,074 16 924 900 32 300 458 64 195 208 128 164 168 240 149 143 The performance change wasn't significant in this case, but this change is required by a follow-on patch. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-13-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 1 + kernel/locking/rwsem.c | 86 +++++++++++++++++++++++++++++++++------ 2 files changed, 75 insertions(+), 12 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 634b47fd8b5e..ca954e4e00e4 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -56,6 +56,7 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ +LOCK_EVENT(rwsem_opt_rlock) /* # of read locks opt-spin acquired */ LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 180455b6b0d4..985a03ad3f8c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -457,6 +457,30 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * Try to acquire read lock before the reader is put on wait queue. + * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff + * is ongoing. + */ +static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) + return false; + + count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); + if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_opt_rlock); + return true; + } + + /* Back out the change */ + atomic_long_add(-RWSEM_READER_BIAS, &sem->count); + return false; +} + /* * Try to acquire write lock before the writer has been put on wait queue. 
*/ @@ -491,9 +515,12 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN)); - if (need_resched()) + if (need_resched()) { + lockevent_inc(rwsem_opt_fail); return false; + } + preempt_disable(); rcu_read_lock(); owner = READ_ONCE(sem->owner); if (owner) { @@ -501,6 +528,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) owner_on_cpu(owner); } rcu_read_unlock(); + preempt_enable(); + + lockevent_cond_inc(rwsem_opt_fail, !ret); return ret; } @@ -578,7 +608,7 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) return state; } -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) { bool taken = false; int prev_owner_state = OWNER_NULL; @@ -586,9 +616,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) preempt_disable(); /* sem->wait_lock should not be held when doing optimistic spinning */ - if (!rwsem_can_spin_on_owner(sem)) - goto done; - if (!osq_lock(&sem->osq)) goto done; @@ -608,10 +635,11 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) /* * Try to acquire the lock */ - if (rwsem_try_write_lock_unqueued(sem)) { - taken = true; + taken = wlock ? rwsem_try_write_lock_unqueued(sem) + : rwsem_try_read_lock_unqueued(sem); + + if (taken) break; - } /* * An RT task cannot do optimistic spinning if it cannot @@ -668,7 +696,12 @@ done: return taken; } #else -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +{ + return false; +} + +static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) { return false; } @@ -684,6 +717,31 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); + if (!rwsem_can_spin_on_owner(sem)) + goto queue; + + /* + * Undo read bias from down_read() and do optimistic spinning. + */ + atomic_long_add(-RWSEM_READER_BIAS, &sem->count); + adjustment = 0; + if (rwsem_optimistic_spin(sem, false)) { + /* + * Wake up other readers in the wait list if the front + * waiter is a reader. + */ + if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { + raw_spin_lock_irq(&sem->wait_lock); + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, + &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + } + return sem; + } + +queue: waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; @@ -696,7 +754,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) * exit the slowpath and return immediately as its * RWSEM_READER_BIAS has already been set in the count. */ - if (!(atomic_long_read(&sem->count) & + if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); @@ -708,7 +766,10 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); + if (adjustment) + count = atomic_long_add_return(adjustment, &sem->count); + else + count = atomic_long_read(&sem->count); /* * If there are no active locks, wake the front queued process(es). 
@@ -767,7 +828,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ - if (rwsem_optimistic_spin(sem)) + if (rwsem_can_spin_on_owner(sem) && + rwsem_optimistic_spin(sem, true)) return sem; /* -- cgit v1.2.3 From 94a9717b3c40e77a54e4afacd8f19a9a86bfeead Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:12 -0400 Subject: locking/rwsem: Make rwsem->owner an atomic_long_t The rwsem->owner contains not just the task structure pointer, it also holds some flags for storing the current state of the rwsem. Some of the flags may have to be atomically updated. To reflect the new reality, the owner is now changed to an atomic_long_t type. New helper functions are added to properly separate out the task structure pointer and the embedded flags. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-14-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 125 +++++++++++++++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 45 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 985a03ad3f8c..fae557be8334 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -64,7 +64,7 @@ if (!debug_locks_silent && \ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ #c, atomic_long_read(&(sem)->count), \ - (long)((sem)->owner), (long)current, \ + atomic_long_read(&(sem)->owner), (long)current, \ list_empty(&(sem)->wait_list) ? "" : "not ")) \ debug_locks_off(); \ } while (0) @@ -114,12 +114,20 @@ */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - WRITE_ONCE(sem->owner, current); + atomic_long_set(&sem->owner, (long)current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - WRITE_ONCE(sem->owner, NULL); + atomic_long_set(&sem->owner, 0); +} + +/* + * Test the flags in the owner field. + */ +static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) +{ + return atomic_long_read(&sem->owner) & flags; } /* @@ -133,10 +141,9 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem) static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_NONSPINNABLE; + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | RWSEM_NONSPINNABLE; - WRITE_ONCE(sem->owner, (struct task_struct *)val); + atomic_long_set(&sem->owner, val); } static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) @@ -145,13 +152,20 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) } /* - * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock. - * N.B. !owner is considered spinnable. + * Return true if the rwsem is owned by a reader. */ -static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) { - return !((unsigned long)owner & RWSEM_NONSPINNABLE); +#ifdef CONFIG_DEBUG_RWSEMS + /* + * Check the count to see if it is write-locked. 
+ */ + long count = atomic_long_read(&sem->count); + + if (count & RWSEM_WRITER_MASK) + return false; +#endif + return rwsem_test_oflags(sem, RWSEM_READER_OWNED); } #ifdef CONFIG_DEBUG_RWSEMS @@ -163,11 +177,13 @@ static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { - unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_NONSPINNABLE; - if (READ_ONCE(sem->owner) == (struct task_struct *)val) - cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_NONSPINNABLE); + unsigned long val = atomic_long_read(&sem->owner); + + while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { + if (atomic_long_try_cmpxchg(&sem->owner, &val, + val & RWSEM_OWNER_FLAGS_MASK)) + return; + } } #else static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) @@ -175,6 +191,28 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif +/* + * Return just the real task structure pointer of the owner + */ +static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Return the real task structure pointer of the owner and the embedded + * flags in the owner. pflags must be non-NULL. + */ +static inline struct task_struct * +rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + *pflags = owner & RWSEM_OWNER_FLAGS_MASK; + return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); +} + /* * Guide to the rw_semaphore's count field. * @@ -208,7 +246,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); - sem->owner = NULL; + atomic_long_set(&sem->owner, 0L); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif @@ -511,9 +549,10 @@ static inline bool owner_on_cpu(struct task_struct *owner) static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; + unsigned long flags; bool ret = true; - BUILD_BUG_ON(is_rwsem_owner_spinnable(RWSEM_OWNER_UNKNOWN)); + BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); if (need_resched()) { lockevent_inc(rwsem_opt_fail); @@ -522,11 +561,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) preempt_disable(); rcu_read_lock(); - owner = READ_ONCE(sem->owner); - if (owner) { - ret = is_rwsem_owner_spinnable(owner) && - owner_on_cpu(owner); - } + owner = rwsem_owner_flags(sem, &flags); + if ((flags & RWSEM_NONSPINNABLE) || (owner && !owner_on_cpu(owner))) + ret = false; rcu_read_unlock(); preempt_enable(); @@ -553,25 +590,26 @@ enum owner_state { }; #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER) -static inline enum owner_state rwsem_owner_state(unsigned long owner) +static inline enum owner_state +rwsem_owner_state(struct task_struct *owner, unsigned long flags) { - if (!owner) - return OWNER_NULL; - - if (owner & RWSEM_NONSPINNABLE) + if (flags & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; - if (owner & RWSEM_READER_OWNED) + if (flags & RWSEM_READER_OWNED) return OWNER_READER; - return OWNER_WRITER; + return owner ? 
OWNER_WRITER : OWNER_NULL; } static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) { - struct task_struct *tmp, *owner = READ_ONCE(sem->owner); - enum owner_state state = rwsem_owner_state((unsigned long)owner); + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags); if (state != OWNER_WRITER) return state; @@ -582,9 +620,9 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) break; } - tmp = READ_ONCE(sem->owner); - if (tmp != owner) { - state = rwsem_owner_state((unsigned long)tmp); + new = rwsem_owner_flags(sem, &new_flags); + if ((new != owner) || (new_flags != flags)) { + state = rwsem_owner_state(new, new_flags); break; } @@ -1001,8 +1039,7 @@ inline void __down_read(struct rw_semaphore *sem) if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count) & RWSEM_READ_FAILED_MASK)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } else { rwsem_set_reader_owned(sem); } @@ -1014,8 +1051,7 @@ static inline int __down_read_killable(struct rw_semaphore *sem) &sem->count) & RWSEM_READ_FAILED_MASK)) { if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } else { rwsem_set_reader_owned(sem); } @@ -1084,7 +1120,7 @@ inline void __up_read(struct rw_semaphore *sem) { long tmp; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == @@ -1103,8 +1139,8 @@ static inline void __up_write(struct rw_semaphore *sem) * sem->owner may differ from current if the ownership is transferred * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. */ - DEBUG_RWSEMS_WARN_ON((sem->owner != current) && - !((long)sem->owner & RWSEM_NONSPINNABLE), sem); + DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && + !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) @@ -1125,7 +1161,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) * read-locked region is ok to be re-ordered into the * write side. As such, rely on RELEASE semantics. 
*/ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); tmp = atomic_long_fetch_add_release( -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); rwsem_set_reader_owned(sem); @@ -1296,8 +1332,7 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); } EXPORT_SYMBOL(up_read_non_owner); -- cgit v1.2.3 From 7d43f1ce9dd075d8b2aa3ad1f3970ef386a5c358 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:13 -0400 Subject: locking/rwsem: Enable time-based spinning on reader-owned rwsem When the rwsem is owned by reader, writers stop optimistic spinning simply because there is no easy way to figure out if all the readers are actively running or not. However, there are scenarios where the readers are unlikely to sleep and optimistic spinning can help performance. This patch provides a simple mechanism for spinning on a reader-owned rwsem by a writer. It is a time threshold based spinning where the allowable spinning time can vary from 10us to 25us depending on the condition of the rwsem. When the time threshold is exceeded, the nonspinnable bits will be set in the owner field to indicate that no more optimistic spinning will be allowed on this rwsem until it becomes writer owned again. Not even readers is allowed to acquire the reader-locked rwsem by optimistic spinning for fairness. We also want a writer to acquire the lock after the readers hold the lock for a relatively long time. In order to give preference to writers under such a circumstance, the single RWSEM_NONSPINNABLE bit is now split into two - one for reader and one for writer. When optimistic spinning is disabled, both bits will be set. When the reader count drop down to 0, the writer nonspinnable bit will be cleared to allow writers to spin on the lock, but not the readers. When a writer acquires the lock, it will write its own task structure pointer into sem->owner and clear the reader nonspinnable bit in the process. The time taken for each iteration of the reader-owned rwsem spinning loop varies. Below are sample minimum elapsed times for 16 iterations of the loop. System Time for 16 Iterations ------ ---------------------- 1-socket Skylake ~800ns 4-socket Broadwell ~300ns 2-socket ThunderX2 (arm64) ~250ns When the lock cacheline is contended, we can see up to almost 10X increase in elapsed time. So 25us will be at most 500, 1300 and 1600 iterations for each of the above systems. With a locking microbenchmark running on 5.1 based kernel, the total locking rates (in kops/s) on a 8-socket IvyBridge-EX system with equal numbers of readers and writers before and after this patch were as follows: # of Threads Pre-patch Post-patch ------------ --------- ---------- 2 1,759 6,684 4 1,684 6,738 8 1,074 7,222 16 900 7,163 32 458 7,316 64 208 520 128 168 425 240 143 474 This patch gives a big boost in performance for mixed reader/writer workloads. 
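In outline, the writer's time-bounded spin on a reader-owned rwsem behaves like the sketch below (illustrative only: owner_is_reader() and the fixed 25us budget are stand-ins; the real patch derives the 10us-25us threshold from the state of the rwsem and uses rwsem_set_nonspinnable() to write the nonspinnable bits into the owner field):

	/* sketch: bounded spinning while readers own the lock */
	u64 deadline = sched_clock() + 25 * NSEC_PER_USEC;

	while (owner_is_reader(sem)) {		/* stand-in for the real check */
		if (sched_clock() > deadline) {
			/* time is up: forbid further optimistic spinning */
			rwsem_set_nonspinnable(sem);
			break;
		}
		cpu_relax();
	}
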
With 32 locking threads, the rwsem lock event data were: rwsem_opt_fail=79850 rwsem_opt_nospin=5069 rwsem_opt_rlock=597484 rwsem_opt_wlock=957339 rwsem_sleep_reader=57782 rwsem_sleep_writer=55663 With 64 locking threads, the data looked like: rwsem_opt_fail=346723 rwsem_opt_nospin=6293 rwsem_opt_rlock=1127119 rwsem_opt_wlock=1400628 rwsem_sleep_reader=308201 rwsem_sleep_writer=72281 So a lot more threads acquired the lock in the slowpath and more threads went to sleep. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-15-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 1 + kernel/locking/rwsem.c | 173 +++++++++++++++++++++++++++++++------- 2 files changed, 144 insertions(+), 30 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index ca954e4e00e4..baa998401052 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -59,6 +59,7 @@ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ LOCK_EVENT(rwsem_opt_rlock) /* # of read locks opt-spin acquired */ LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ +LOCK_EVENT(rwsem_opt_nospin) /* # of disabled reader opt-spinnings */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index fae557be8334..2d7cabcfca50 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -31,24 +32,28 @@ #include "lock_events.h" /* - * The least significant 2 bits of the owner value has the following + * The least significant 3 bits of the owner value has the following * meanings when set. * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers - * - Bit 1: RWSEM_NONSPINNABLE - Waiters cannot spin on the rwsem - * The rwsem is anonymously owned, i.e. the owner(s) cannot be - * readily determined. It can be reader owned or the owning writer - * is indeterminate. + * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. + * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. * + * When the rwsem is either owned by an anonymous writer, or it is + * reader-owned, but a spinning writer has timed out, both nonspinnable + * bits will be set to disable optimistic spinning by readers and writers. + * In the later case, the last unlocking reader should then check the + * writer nonspinnable bit and clear it only to give writers preference + * to acquire the lock via optimistic spinning, but not readers. Similar + * action is also done in the reader slowpath. + * When a writer acquires a rwsem, it puts its task_struct pointer * into the owner field. It is cleared after an unlock. * * When a reader acquires a rwsem, it will also puts its task_struct - * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_NONSPINNABLE bits set. On unlock, the owner field will - * largely be left untouched. 
So for a free or reader-owned rwsem, - * the owner value may contain information about the last reader that - * acquires the rwsem. The anonymous bit is set because that particular - * reader may or may not still own the lock. + * pointer into the owner field with the RWSEM_READER_OWNED bit set. + * On unlock, the owner field will largely be left untouched. So + * for a free or reader-owned rwsem, the owner value may contain + * information about the last reader that acquires the rwsem. * * That information may be helpful in debugging cases where the system * seems to hang on a reader owned rwsem especially if only one reader @@ -56,7 +61,9 @@ * a rwsem, but the overhead is simply too big. */ #define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_NONSPINNABLE (1UL << 1) +#define RWSEM_RD_NONSPINNABLE (1UL << 1) +#define RWSEM_WR_NONSPINNABLE (1UL << 2) +#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) #ifdef CONFIG_DEBUG_RWSEMS @@ -141,7 +148,7 @@ static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | RWSEM_NONSPINNABLE; + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED; atomic_long_set(&sem->owner, val); } @@ -191,6 +198,23 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) } #endif +/* + * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag + * remains set. Otherwise, the operation will be aborted. + */ +static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + do { + if (!(owner & RWSEM_READER_OWNED)) + break; + if (owner & RWSEM_NONSPINNABLE) + break; + } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, + owner | RWSEM_NONSPINNABLE)); +} + /* * Return just the real task structure pointer of the owner */ @@ -546,7 +570,8 @@ static inline bool owner_on_cpu(struct task_struct *owner) return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); } -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, + unsigned long nonspinnable) { struct task_struct *owner; unsigned long flags; @@ -562,7 +587,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) preempt_disable(); rcu_read_lock(); owner = rwsem_owner_flags(sem, &flags); - if ((flags & RWSEM_NONSPINNABLE) || (owner && !owner_on_cpu(owner))) + if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) ret = false; rcu_read_unlock(); preempt_enable(); @@ -588,12 +613,12 @@ enum owner_state { OWNER_READER = 1 << 2, OWNER_NONSPINNABLE = 1 << 3, }; -#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER) +#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) static inline enum owner_state -rwsem_owner_state(struct task_struct *owner, unsigned long flags) +rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) { - if (flags & RWSEM_NONSPINNABLE) + if (flags & nonspinnable) return OWNER_NONSPINNABLE; if (flags & RWSEM_READER_OWNED) @@ -602,14 +627,15 @@ rwsem_owner_state(struct task_struct *owner, unsigned long flags) return owner ? 
OWNER_WRITER : OWNER_NULL; } -static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) +static noinline enum owner_state +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) { struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags); + state = rwsem_owner_state(owner, flags, nonspinnable); if (state != OWNER_WRITER) return state; @@ -622,7 +648,7 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { - state = rwsem_owner_state(new, new_flags); + state = rwsem_owner_state(new, new_flags, nonspinnable); break; } @@ -646,10 +672,39 @@ static noinline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) return state; } +/* + * Calculate reader-owned rwsem spinning threshold for writer + * + * The more readers own the rwsem, the longer it will take for them to + * wind down and free the rwsem. So the empirical formula used to + * determine the actual spinning time limit here is: + * + * Spinning threshold = (10 + nr_readers/2)us + * + * The limit is capped to a maximum of 25us (30 readers). This is just + * a heuristic and is subjected to change in the future. + */ +static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + int readers = count >> RWSEM_READER_SHIFT; + u64 delta; + + if (readers > 30) + readers = 30; + delta = (20 + readers) * NSEC_PER_USEC / 2; + + return sched_clock() + delta; +} + static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) { bool taken = false; int prev_owner_state = OWNER_NULL; + int loop = 0; + u64 rspin_threshold = 0; + unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE + : RWSEM_RD_NONSPINNABLE; preempt_disable(); @@ -661,12 +716,12 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) * Optimistically spin on the owner field and attempt to acquire the * lock whenever the owner changes. Spinning will be stopped when: * 1) the owning writer isn't running; or - * 2) readers own the lock as we can't determine if they are - * actively running or not. + * 2) readers own the lock and spinning time has exceeded limit. */ for (;;) { - enum owner_state owner_state = rwsem_spin_on_owner(sem); + enum owner_state owner_state; + owner_state = rwsem_spin_on_owner(sem, nonspinnable); if (!(owner_state & OWNER_SPINNABLE)) break; @@ -679,6 +734,38 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) if (taken) break; + /* + * Time-based reader-owned rwsem optimistic spinning + */ + if (wlock && (owner_state == OWNER_READER)) { + /* + * Re-initialize rspin_threshold every time when + * the owner state changes from non-reader to reader. + * This allows a writer to steal the lock in between + * 2 reader phases and have the threshold reset at + * the beginning of the 2nd reader phase. + */ + if (prev_owner_state != OWNER_READER) { + if (rwsem_test_oflags(sem, nonspinnable)) + break; + rspin_threshold = rwsem_rspin_threshold(sem); + loop = 0; + } + + /* + * Check time threshold once every 16 iterations to + * avoid calling sched_clock() too frequently so + * as to reduce the average latency between the times + * when the lock becomes free and when the spinner + * is ready to do a trylock. 
+ */ + else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { + rwsem_set_nonspinnable(sem); + lockevent_inc(rwsem_opt_nospin); + break; + } + } + /* * An RT task cannot do optimistic spinning if it cannot * be sure the lock holder is running or live-lock may @@ -733,8 +820,25 @@ done: lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } + +/* + * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should + * only be called when the reader count reaches 0. + * + * This give writers better chance to acquire the rwsem first before + * readers when the rwsem was being held by readers for a relatively long + * period of time. Race can happen that an optimistic spinner may have + * just stolen the rwsem and set the owner, but just clearing the + * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. + */ +static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) +{ + if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) + atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); +} #else -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, + unsigned long nonspinnable) { return false; } @@ -743,6 +847,8 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) { return false; } + +static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } #endif /* @@ -752,10 +858,11 @@ static struct rw_semaphore __sched * rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_READER_BIAS; + bool wake = false; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); - if (!rwsem_can_spin_on_owner(sem)) + if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) goto queue; /* @@ -815,8 +922,12 @@ queue: * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! */ - if (!(count & RWSEM_LOCK_MASK) || - (!(count & RWSEM_WRITER_MASK) && (adjustment & RWSEM_FLAG_WAITERS))) + if (!(count & RWSEM_LOCK_MASK)) { + clear_wr_nonspinnable(sem); + wake = true; + } + if (wake || (!(count & RWSEM_WRITER_MASK) && + (adjustment & RWSEM_FLAG_WAITERS))) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -866,7 +977,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ - if (rwsem_can_spin_on_owner(sem) && + if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && rwsem_optimistic_spin(sem, true)) return sem; @@ -1124,8 +1235,10 @@ inline void __up_read(struct rw_semaphore *sem) rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == - RWSEM_FLAG_WAITERS)) + RWSEM_FLAG_WAITERS)) { + clear_wr_nonspinnable(sem); rwsem_wake(sem, tmp); + } } /* -- cgit v1.2.3 From 5cfd92e12e13432251981b9d0cd68dbd7aa8d690 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:14 -0400 Subject: locking/rwsem: Adaptive disabling of reader optimistic spinning Reader optimistic spinning is helpful when the reader critical section is short and there aren't that many readers around. It makes readers relatively more preferred than writers. When a writer times out spinning on a reader-owned lock and set the nospinnable bits, there are two main reasons for that. 1) The reader critical section is long, perhaps the task sleeps after acquiring the read lock. 
2) There are just too many readers contending the lock causing it to take a while to service all of them. In the former case, long reader critical section will impede the progress of writers which is usually more important for system performance. In the later case, reader optimistic spinning tends to make the reader groups that contain readers that acquire the lock together smaller leading to more of them. That may hurt performance in some cases. In other words, the setting of nonspinnable bits indicates that reader optimistic spinning may not be helpful for those workloads that cause it. Therefore, any writers that have observed the setting of the writer nonspinnable bit for a given rwsem after they fail to acquire the lock via optimistic spinning will set the reader nonspinnable bit once they acquire the write lock. Similarly, readers that observe the setting of reader nonspinnable bit at slowpath entry will also set the reader nonspinnable bit when they acquire the read lock via the wakeup path. Once the reader nonspinnable bit is on, it will only be reset when a writer is able to acquire the rwsem in the fast path or somehow a reader or writer in the slowpath doesn't observe the nonspinable bit. This is to discourage reader optmistic spinning on that particular rwsem and make writers more preferred. This adaptive disabling of reader optimistic spinning will alleviate some of the negative side effect of this feature. In addition, this patch tries to make readers in the spinning queue follow the phase-fair principle after quitting optimistic spinning by checking if another reader has somehow acquired a read lock after this reader enters the optimistic spinning queue. If so and the rwsem is still reader-owned, this reader is in the right read-phase and can attempt to acquire the lock. On a 2-socket 40-core 80-thread Skylake system, the page_fault1 test of the will-it-scale benchmark was run with various number of threads. The number of operations done before reader optimistic spinning patches, this patch and after this patch were: Threads Before rspin Before patch After patch %change ------- ------------ ------------ ----------- ------- 20 5541068 5345484 5455667 -3.5%/ +2.1% 40 10185150 7292313 9219276 -28.5%/+26.4% 60 8196733 6460517 7181209 -21.2%/+11.2% 80 9508864 6739559 8107025 -29.1%/+20.3% This patch doesn't recover all the lost performance, but it is more than half. Given the fact that reader optimistic spinning does benefit some workloads, this is a good compromise. Using the rwsem locking microbenchmark with very short critical section, this patch doesn't have too much impact on locking performance as shown by the locking rates (kops/s) below with equal numbers of readers and writers before and after this patch: # of Threads Pre-patch Post-patch ------------ --------- ---------- 2 4,730 4,969 4 4,814 4,786 8 4,866 4,815 16 4,715 4,511 32 3,338 3,500 64 3,212 3,389 80 3,110 3,044 When running the locking microbenchmark with 40 dedicated reader and writer threads, however, the reader performance is curtailed to favor the writer. Before patch: 40 readers, Iterations Min/Mean/Max = 204,026/234,309/254,816 40 writers, Iterations Min/Mean/Max = 88,515/95,884/115,644 After patch: 40 readers, Iterations Min/Mean/Max = 33,813/35,260/36,791 40 writers, Iterations Min/Mean/Max = 95,368/96,565/97,798 Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-16-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lock_events_list.h | 10 +-- kernel/locking/rwsem.c | 133 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 135 insertions(+), 8 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index baa998401052..239039d0ce21 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -56,10 +56,12 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ -LOCK_EVENT(rwsem_opt_rlock) /* # of read locks opt-spin acquired */ -LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ -LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ -LOCK_EVENT(rwsem_opt_nospin) /* # of disabled reader opt-spinnings */ +LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */ +LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ +LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ +LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ +LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2d7cabcfca50..e1e0bac957c4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -59,6 +59,42 @@ * seems to hang on a reader owned rwsem especially if only one reader * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. + * + * Reader optimistic spinning is helpful when the reader critical section + * is short and there aren't that many readers around. It makes readers + * relatively more preferred than writers. When a writer times out spinning + * on a reader-owned lock and set the nospinnable bits, there are two main + * reasons for that. + * + * 1) The reader critical section is long, perhaps the task sleeps after + * acquiring the read lock. + * 2) There are just too many readers contending the lock causing it to + * take a while to service all of them. + * + * In the former case, long reader critical section will impede the progress + * of writers which is usually more important for system performance. In + * the later case, reader optimistic spinning tends to make the reader + * groups that contain readers that acquire the lock together smaller + * leading to more of them. That may hurt performance in some cases. In + * other words, the setting of nonspinnable bits indicates that reader + * optimistic spinning may not be helpful for those workloads that cause + * it. + * + * Therefore, any writers that had observed the setting of the writer + * nonspinnable bit for a given rwsem after they fail to acquire the lock + * via optimistic spinning will set the reader nonspinnable bit once they + * acquire the write lock. 
Similarly, readers that observe the setting + * of reader nonspinnable bit at slowpath entry will set the reader + * nonspinnable bits when they acquire the read lock via the wakeup path. + * + * Once the reader nonspinnable bit is on, it will only be reset when + * a writer is able to acquire the rwsem in the fast path or somehow a + * reader or writer in the slowpath doesn't observe the nonspinable bit. + * + * This is to discourage reader optmistic spinning on that particular + * rwsem and make writers more preferred. This adaptive disabling of reader + * optimistic spinning will alleviate the negative side effect of this + * feature. */ #define RWSEM_READER_OWNED (1UL << 0) #define RWSEM_RD_NONSPINNABLE (1UL << 1) @@ -144,11 +180,14 @@ static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) * Note that the owner value just indicates the task has owned the rwsem * previously, it may not be the real owner or one of the real owners * anymore when that field is examined, so take it with a grain of salt. + * + * The reader non-spinnable bit is preserved. */ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED; + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | + (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); atomic_long_set(&sem->owner, val); } @@ -287,6 +326,7 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; + unsigned long last_rowner; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -368,6 +408,8 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * so we can bail out early if a writer stole the lock. */ if (wake_type != RWSEM_WAKE_READ_OWNED) { + struct task_struct *owner; + adjustment = RWSEM_READER_BIAS; oldcount = atomic_long_fetch_add(adjustment, &sem->count); if (unlikely(oldcount & RWSEM_WRITER_MASK)) { @@ -388,8 +430,15 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, /* * Set it to reader-owned to give spinners an early * indication that readers now have the lock. + * The reader nonspinnable bit seen at slowpath entry of + * the reader is copied over. */ - __rwsem_set_reader_owned(sem, waiter->task); + owner = waiter->task; + if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { + owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); + lockevent_inc(rwsem_opt_norspin); + } + __rwsem_set_reader_owned(sem, owner); } /* @@ -836,6 +885,42 @@ static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); } + +/* + * This function is called when the reader fails to acquire the lock via + * optimistic spinning. In this case we will still attempt to do a trylock + * when comparing the rwsem state right now with the state when entering + * the slowpath indicates that the reader is still in a valid reader phase. + * This happens when the following conditions are true: + * + * 1) The lock is currently reader owned, and + * 2) The lock is previously not reader-owned or the last read owner changes. + * + * In the former case, we have transitioned from a writer phase to a + * reader-phase while spinning. In the latter case, it means the reader + * phase hasn't ended when we entered the optimistic spinning loop. In + * both cases, the reader is eligible to acquire the lock. 
This is the + * secondary path where a read lock is acquired optimistically. + * + * The reader non-spinnable bit wasn't set at time of entry or it will + * not be here at all. + */ +static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, + unsigned long last_rowner) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + if (!(owner & RWSEM_READER_OWNED)) + return false; + + if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && + rwsem_try_read_lock_unqueued(sem)) { + lockevent_inc(rwsem_opt_rlock2); + lockevent_add(rwsem_opt_fail, -1); + return true; + } + return false; +} #else static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) @@ -849,6 +934,12 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) } static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } + +static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, + unsigned long last_rowner) +{ + return false; +} #endif /* @@ -862,6 +953,14 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); + /* + * Save the current read-owner of rwsem, if available, and the + * reader nonspinnable bit. + */ + waiter.last_rowner = atomic_long_read(&sem->owner); + if (!(waiter.last_rowner & RWSEM_READER_OWNED)) + waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; + if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) goto queue; @@ -884,6 +983,8 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) wake_up_q(&wake_q); } return sem; + } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { + return sem; } queue: @@ -964,6 +1065,19 @@ out_nolock: return ERR_PTR(-EINTR); } +/* + * This function is called by the a write lock owner. So the owner value + * won't get changed by others. + */ +static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, + bool disable) +{ + if (unlikely(disable)) { + atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); + lockevent_inc(rwsem_opt_norspin); + } +} + /* * Wait until we successfully acquire the write lock */ @@ -971,6 +1085,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; + bool disable_rspin; enum writer_wait_state wstate; struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; @@ -981,6 +1096,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) rwsem_optimistic_spin(sem, true)) return sem; + /* + * Disable reader optimistic spinning for this rwsem after + * acquiring the write lock when the setting of the nonspinnable + * bits are observed. + */ + disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; + /* * Optimistic spinning failed, proceed to the slowpath * and block until we can acquire the sem. 
@@ -1077,6 +1199,7 @@ wait: } __set_current_state(TASK_RUNNING); list_del(&waiter.list); + rwsem_disable_reader_optspin(sem, disable_rspin); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); @@ -1196,7 +1319,8 @@ static inline void __down_write(struct rw_semaphore *sem) if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED))) rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); - rwsem_set_owner(sem); + else + rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) @@ -1207,8 +1331,9 @@ static inline int __down_write_killable(struct rw_semaphore *sem) RWSEM_WRITER_LOCKED))) { if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) return -EINTR; + } else { + rwsem_set_owner(sem); } - rwsem_set_owner(sem); return 0; } -- cgit v1.2.3 From a15ea1a35f1b2782befc8b958c123c5d6a7cab0a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:15 -0400 Subject: locking/rwsem: Guard against making count negative The upper bits of the count field is used as reader count. When sufficient number of active readers are present, the most significant bit will be set and the count becomes negative. If the number of active readers keep on piling up, we may eventually overflow the reader counts. This is not likely to happen unless the number of bits reserved for reader count is reduced because those bits are need for other purpose. To prevent this count overflow from happening, the most significant bit is now treated as a guard bit (RWSEM_FLAG_READFAIL). Read-lock attempts will now fail for both the fast and slow paths whenever this bit is set. So all those extra readers will be put to sleep in the wait list. Wakeup will not happen until the reader count reaches 0. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-17-longman@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 53 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 12 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e1e0bac957c4..37524a47f002 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -116,13 +116,28 @@ #endif /* - * The definition of the atomic counter in the semaphore: + * On 64-bit architectures, the bit definitions of the count are: * - * Bit 0 - writer locked bit - * Bit 1 - waiters present bit - * Bit 2 - lock handoff bit - * Bits 3-7 - reserved - * Bits 8-X - 24-bit (32-bit) or 56-bit reader count + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved + * Bits 8-62 - 55-bit reader count + * Bit 63 - read fail bit + * + * On 32-bit architectures, the bit definitions of the count are: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved + * Bits 8-30 - 23-bit reader count + * Bit 31 - read fail bit + * + * It is not likely that the most significant bit (read fail bit) will ever + * be set. This guard bit is still checked anyway in the down_read() fastpath + * just in case we need to use up more of the reader bits for other purpose + * in the future. 
* * atomic_long_fetch_add() is used to obtain reader lock, whereas * atomic_long_cmpxchg() will be used to obtain writer lock. @@ -139,6 +154,7 @@ #define RWSEM_WRITER_LOCKED (1UL << 0) #define RWSEM_FLAG_WAITERS (1UL << 1) #define RWSEM_FLAG_HANDOFF (1UL << 2) +#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) #define RWSEM_READER_SHIFT 8 #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) @@ -146,7 +162,7 @@ #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) #define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ - RWSEM_FLAG_HANDOFF) + RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -254,6 +270,14 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) owner | RWSEM_NONSPINNABLE)); } +static inline bool rwsem_read_trylock(struct rw_semaphore *sem) +{ + long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + if (WARN_ON_ONCE(cnt < 0)) + rwsem_set_nonspinnable(sem); + return !(cnt & RWSEM_READ_FAILED_MASK); +} + /* * Return just the real task structure pointer of the owner */ @@ -402,6 +426,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, return; } + /* + * No reader wakeup if there are too many of them already. + */ + if (unlikely(atomic_long_read(&sem->count) < 0)) + return; + /* * Writers might steal the lock before we grant it to the next reader. * We prefer to do the first reader grant before counting readers @@ -949,9 +979,9 @@ static struct rw_semaphore __sched * rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_READER_BIAS; - bool wake = false; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); + bool wake = false; /* * Save the current read-owner of rwsem, if available, and the @@ -1270,8 +1300,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) */ inline void __down_read(struct rw_semaphore *sem) { - if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, - &sem->count) & RWSEM_READ_FAILED_MASK)) { + if (!rwsem_read_trylock(sem)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } else { @@ -1281,8 +1310,7 @@ inline void __down_read(struct rw_semaphore *sem) static inline int __down_read_killable(struct rw_semaphore *sem) { - if (unlikely(atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, - &sem->count) & RWSEM_READ_FAILED_MASK)) { + if (!rwsem_read_trylock(sem)) { if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); @@ -1359,6 +1387,7 @@ inline void __up_read(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { clear_wr_nonspinnable(sem); -- cgit v1.2.3 From 886532aee3cd42d95196601ed16d7c3d4679e9e5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Jun 2019 14:47:05 +0200 Subject: locking/lockdep: Move mark_lock() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING The last cleanup patch triggered another issue, as now another function should be moved into the same section: kernel/locking/lockdep.c:3580:12: error: 'mark_lock' defined but not used [-Werror=unused-function] static int mark_lock(struct task_struct *curr, 
struct held_lock *this, Move mark_lock() into the same #ifdef section as its only caller, and remove the now-unused mark_lock_irq() stub helper. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Bart Van Assche Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: Yuyang Du Fixes: 0d2cc3b34532 ("locking/lockdep: Move valid_state() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING") Link: https://lkml.kernel.org/r/20190617124718.1232976-1-arnd@arndb.de Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 73 ++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 39 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 5e368f485330..341f52117f88 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -437,13 +437,6 @@ static int verbose(struct lock_class *class) return 0; } -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. - */ -unsigned long nr_stack_trace_entries; -static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - static void print_lockdep_off(const char *bug_msg) { printk(KERN_DEBUG "%s\n", bug_msg); @@ -453,6 +446,15 @@ static void print_lockdep_off(const char *bug_msg) #endif } +unsigned long nr_stack_trace_entries; + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the graph_lock. + */ +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; + static int save_trace(struct lock_trace *trace) { unsigned long *entries = stack_trace + nr_stack_trace_entries; @@ -475,6 +477,7 @@ static int save_trace(struct lock_trace *trace) return 1; } +#endif unsigned int nr_hardirq_chains; unsigned int nr_softirq_chains; @@ -488,6 +491,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Locking printouts: */ @@ -505,6 +509,7 @@ static const char *usage_str[] = #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", }; +#endif const char * __get_key_name(struct lockdep_subclass_key *key, char *str) { @@ -2964,12 +2969,10 @@ static void check_chain_key(struct task_struct *curr) #endif } +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - - static void print_usage_bug_scenario(struct held_lock *lock) { struct lock_class *class = hlock_class(lock); @@ -3545,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr, return 0; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -static inline -int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - WARN_ON(1); /* Impossible innit? 
when we don't have TRACE_IRQFLAG */ - return 1; -} - -static inline int -mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) -{ - return 1; -} - -static inline unsigned int task_irq_context(struct task_struct *task) -{ - return 0; -} - -static inline int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - return 0; -} - -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - /* * Mark a lock with a usage bit, and validate the state transition: */ @@ -3634,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + +static inline int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) +{ + return 1; +} + +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 0; +} + +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; +} + +#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + /* * Initialize a lock instance's lock-class mapping info: */ -- cgit v1.2.3 From 9156e545765e467e6268c4814cfa609ebb16237e Mon Sep 17 00:00:00 2001 From: Kobe Wu Date: Mon, 24 Jun 2019 16:35:48 +0800 Subject: locking/lockdep: increase size of counters for lockdep statistics When system has been running for a long time, signed integer counters are not enough for some lockdep statistics. Using unsigned long counters can satisfy the requirement. Besides, most of lockdep statistics are unsigned. It is better to use unsigned int instead of int. Remove unused variables. - max_recursion_depth - nr_cyclic_check_recursions - nr_find_usage_forwards_recursions - nr_find_usage_backwards_recursions Signed-off-by: Kobe Wu Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Eason Lin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/r/1561365348-16050-1-git-send-email-kobe-cp.wu@mediatek.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep_internals.h | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 150ec3f0c5b5..cc83568d5012 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -131,7 +131,6 @@ extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; -extern unsigned int max_recursion_depth; extern unsigned int max_bfs_queue_depth; @@ -160,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class) * and we want to avoid too much cache bouncing. 
*/ struct lockdep_stats { - int chain_lookup_hits; - int chain_lookup_misses; - int hardirqs_on_events; - int hardirqs_off_events; - int redundant_hardirqs_on; - int redundant_hardirqs_off; - int softirqs_on_events; - int softirqs_off_events; - int redundant_softirqs_on; - int redundant_softirqs_off; - int nr_unused_locks; - int nr_redundant_checks; - int nr_redundant; - int nr_cyclic_checks; - int nr_cyclic_check_recursions; - int nr_find_usage_forwards_checks; - int nr_find_usage_forwards_recursions; - int nr_find_usage_backwards_checks; - int nr_find_usage_backwards_recursions; + unsigned long chain_lookup_hits; + unsigned int chain_lookup_misses; + unsigned long hardirqs_on_events; + unsigned long hardirqs_off_events; + unsigned long redundant_hardirqs_on; + unsigned long redundant_hardirqs_off; + unsigned long softirqs_on_events; + unsigned long softirqs_off_events; + unsigned long redundant_softirqs_on; + unsigned long redundant_softirqs_off; + int nr_unused_locks; + unsigned int nr_redundant_checks; + unsigned int nr_redundant; + unsigned int nr_cyclic_checks; + unsigned int nr_find_usage_forwards_checks; + unsigned int nr_find_usage_backwards_checks; /* * Per lock class locking operation stat counts -- cgit v1.2.3
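As a back-of-the-envelope check on the "running for a long time" argument above, the hedged user-space sketch below (the event rates are illustrative assumptions, not measurements) shows how quickly a signed 32-bit counter saturates compared with a 64-bit unsigned long:

/*
 * Illustration only: time to saturate a signed int counter vs. an
 * unsigned long counter at an assumed sustained event rate.  Assumes
 * a 64-bit unsigned long.
 */
#include <stdio.h>
#include <limits.h>

int main(void)
{
	const double rates[] = { 1e4, 1e5, 1e6 };	/* events/sec (assumed) */
	int i;

	for (i = 0; i < 3; i++) {
		double sec_int  = (double)INT_MAX / rates[i];
		double sec_long = (double)ULONG_MAX / rates[i];

		printf("%8.0f ev/s: int hits INT_MAX after %8.1f hours, "
		       "unsigned long after %.1e years\n",
		       rates[i], sec_int / 3600.0,
		       sec_long / (3600.0 * 24 * 365));
	}
	return 0;
}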