diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-04 11:52:29 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-04 11:52:29 -0700 |
commit | 5f82e71a001d14824a7728ad9e49f6aea420f161 (patch) | |
tree | bf5dfa7cf0840ec834899ae925913973bd1e65d1 | |
parent | 6c51e67b64d169419fb13318035bb442f9176612 (diff) | |
parent | edc2988c548db05e33b921fed15821010bc74895 (diff) | |
download | linux-5f82e71a001d14824a7728ad9e49f6aea420f161.tar.gz linux-5f82e71a001d14824a7728ad9e49f6aea420f161.tar.bz2 linux-5f82e71a001d14824a7728ad9e49f6aea420f161.zip |
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
- Add 'cross-release' support to lockdep, which allows APIs like
completions, where it's not the 'owner' who releases the lock, to be
tracked. It's all activated automatically under
CONFIG_PROVE_LOCKING=y.
- Clean up (restructure) the x86 atomics op implementation to be more
readable, in preparation of KASAN annotations. (Dmitry Vyukov)
- Fix static keys (Paolo Bonzini)
- Add killable versions of down_read() et al (Kirill Tkhai)
- Rework and fix jump_label locking (Marc Zyngier, Paolo Bonzini)
- Rework (and fix) tlb_flush_pending() barriers (Peter Zijlstra)
- Remove smp_mb__before_spinlock() and convert its usages, introduce
smp_mb__after_spinlock() (Peter Zijlstra)
* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (56 commits)
locking/lockdep/selftests: Fix mixed read-write ABBA tests
sched/completion: Avoid unnecessary stack allocation for COMPLETION_INITIALIZER_ONSTACK()
acpi/nfit: Fix COMPLETION_INITIALIZER_ONSTACK() abuse
locking/pvqspinlock: Relax cmpxchg's to improve performance on some architectures
smp: Avoid using two cache lines for struct call_single_data
locking/lockdep: Untangle xhlock history save/restore from task independence
locking/refcounts, x86/asm: Disable CONFIG_ARCH_HAS_REFCOUNT for the time being
futex: Remove duplicated code and fix undefined behaviour
Documentation/locking/atomic: Finish the document...
locking/lockdep: Fix workqueue crossrelease annotation
workqueue/lockdep: 'Fix' flush_work() annotation
locking/lockdep/selftests: Add mixed read-write ABBA tests
mm, locking/barriers: Clarify tlb_flush_pending() barriers
locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE and CONFIG_LOCKDEP_COMPLETIONS truly non-interactive
locking/lockdep: Explicitly initialize wq_barrier::done::map
locking/lockdep: Rename CONFIG_LOCKDEP_COMPLETE to CONFIG_LOCKDEP_COMPLETIONS
locking/lockdep: Reword title of LOCKDEP_CROSSRELEASE config
locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING
locking/refcounts, x86/asm: Implement fast refcount overflow protection
locking/lockdep: Fix the rollback and overwrite detection logic in crossrelease
...
108 files changed, 3460 insertions, 1112 deletions
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt new file mode 100644 index 000000000000..5550bfdcce5f --- /dev/null +++ b/Documentation/atomic_bitops.txt @@ -0,0 +1,66 @@ + +On atomic bitops. + + +While our bitmap_{}() functions are non-atomic, we have a number of operations +operating on single bits in a bitmap that are atomic. + + +API +--- + +The single bit operations are: + +Non-RMW ops: + + test_bit() + +RMW atomic operations without return value: + + {set,clear,change}_bit() + clear_bit_unlock() + +RMW atomic operations with return value: + + test_and_{set,clear,change}_bit() + test_and_set_bit_lock() + +Barriers: + + smp_mb__{before,after}_atomic() + + +All RMW atomic operations have a '__' prefixed variant which is non-atomic. + + +SEMANTICS +--------- + +Non-atomic ops: + +In particular __clear_bit_unlock() suffers the same issue as atomic_set(), +which is why the generic version maps to clear_bit_unlock(), see atomic_t.txt. + + +RMW ops: + +The test_and_{}_bit() operations return the original value of the bit. + + +ORDERING +-------- + +Like with atomic_t, the rule of thumb is: + + - non-RMW operations are unordered; + + - RMW operations that have no return value are unordered; + + - RMW operations that have a return value are fully ordered. + +Except for test_and_set_bit_lock() which has ACQUIRE semantics and +clear_bit_unlock() which has RELEASE semantics. + +Since a platform only has a single means of achieving atomic operations +the same barriers as for atomic_t are used, see atomic_t.txt. + diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt new file mode 100644 index 000000000000..913396ac5824 --- /dev/null +++ b/Documentation/atomic_t.txt @@ -0,0 +1,242 @@ + +On atomic types (atomic_t atomic64_t and atomic_long_t). + +The atomic type provides an interface to the architecture's means of atomic +RMW operations between CPUs (atomic operations on MMIO are not supported and +can lead to fatal traps on some platforms). + +API +--- + +The 'full' API consists of (atomic64_ and atomic_long_ prefixes omitted for +brevity): + +Non-RMW ops: + + atomic_read(), atomic_set() + atomic_read_acquire(), atomic_set_release() + + +RMW atomic operations: + +Arithmetic: + + atomic_{add,sub,inc,dec}() + atomic_{add,sub,inc,dec}_return{,_relaxed,_acquire,_release}() + atomic_fetch_{add,sub,inc,dec}{,_relaxed,_acquire,_release}() + + +Bitwise: + + atomic_{and,or,xor,andnot}() + atomic_fetch_{and,or,xor,andnot}{,_relaxed,_acquire,_release}() + + +Swap: + + atomic_xchg{,_relaxed,_acquire,_release}() + atomic_cmpxchg{,_relaxed,_acquire,_release}() + atomic_try_cmpxchg{,_relaxed,_acquire,_release}() + + +Reference count (but please see refcount_t): + + atomic_add_unless(), atomic_inc_not_zero() + atomic_sub_and_test(), atomic_dec_and_test() + + +Misc: + + atomic_inc_and_test(), atomic_add_negative() + atomic_dec_unless_positive(), atomic_inc_unless_negative() + + +Barriers: + + smp_mb__{before,after}_atomic() + + + +SEMANTICS +--------- + +Non-RMW ops: + +The non-RMW ops are (typically) regular LOADs and STOREs and are canonically +implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and +smp_store_release() respectively. + +The one detail to this is that atomic_set{}() should be observable to the RMW +ops. That is: + + C atomic-set + + { + atomic_set(v, 1); + } + + P1(atomic_t *v) + { + atomic_add_unless(v, 1, 0); + } + + P2(atomic_t *v) + { + atomic_set(v, 0); + } + + exists + (v=2) + +In this case we would expect the atomic_set() from CPU1 to either happen +before the atomic_add_unless(), in which case that latter one would no-op, or +_after_ in which case we'd overwrite its result. In no case is "2" a valid +outcome. + +This is typically true on 'normal' platforms, where a regular competing STORE +will invalidate a LL/SC or fail a CMPXCHG. + +The obvious case where this is not so is when we need to implement atomic ops +with a lock: + + CPU0 CPU1 + + atomic_add_unless(v, 1, 0); + lock(); + ret = READ_ONCE(v->counter); // == 1 + atomic_set(v, 0); + if (ret != u) WRITE_ONCE(v->counter, 0); + WRITE_ONCE(v->counter, ret + 1); + unlock(); + +the typical solution is to then implement atomic_set{}() with atomic_xchg(). + + +RMW ops: + +These come in various forms: + + - plain operations without return value: atomic_{}() + + - operations which return the modified value: atomic_{}_return() + + these are limited to the arithmetic operations because those are + reversible. Bitops are irreversible and therefore the modified value + is of dubious utility. + + - operations which return the original value: atomic_fetch_{}() + + - swap operations: xchg(), cmpxchg() and try_cmpxchg() + + - misc; the special purpose operations that are commonly used and would, + given the interface, normally be implemented using (try_)cmpxchg loops but + are time critical and can, (typically) on LL/SC architectures, be more + efficiently implemented. + +All these operations are SMP atomic; that is, the operations (for a single +atomic variable) can be fully ordered and no intermediate state is lost or +visible. + + +ORDERING (go read memory-barriers.txt first) +-------- + +The rule of thumb: + + - non-RMW operations are unordered; + + - RMW operations that have no return value are unordered; + + - RMW operations that have a return value are fully ordered; + + - RMW operations that are conditional are unordered on FAILURE, + otherwise the above rules apply. + +Except of course when an operation has an explicit ordering like: + + {}_relaxed: unordered + {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE + {}_release: the W of the RMW (or atomic_set) is a RELEASE + +Where 'unordered' is against other memory locations. Address dependencies are +not defeated. + +Fully ordered primitives are ordered against everything prior and everything +subsequent. Therefore a fully ordered primitive is like having an smp_mb() +before and an smp_mb() after the primitive. + + +The barriers: + + smp_mb__{before,after}_atomic() + +only apply to the RMW ops and can be used to augment/upgrade the ordering +inherent to the used atomic op. These barriers provide a full smp_mb(). + +These helper barriers exist because architectures have varying implicit +ordering on their SMP atomic primitives. For example our TSO architectures +provide full ordered atomics and these barriers are no-ops. + +Thus: + + atomic_fetch_add(); + +is equivalent to: + + smp_mb__before_atomic(); + atomic_fetch_add_relaxed(); + smp_mb__after_atomic(); + +However the atomic_fetch_add() might be implemented more efficiently. + +Further, while something like: + + smp_mb__before_atomic(); + atomic_dec(&X); + +is a 'typical' RELEASE pattern, the barrier is strictly stronger than +a RELEASE. Similarly for something like: + + atomic_inc(&X); + smp_mb__after_atomic(); + +is an ACQUIRE pattern (though very much not typical), but again the barrier is +strictly stronger than ACQUIRE. As illustrated: + + C strong-acquire + + { + } + + P1(int *x, atomic_t *y) + { + r0 = READ_ONCE(*x); + smp_rmb(); + r1 = atomic_read(y); + } + + P2(int *x, atomic_t *y) + { + atomic_inc(y); + smp_mb__after_atomic(); + WRITE_ONCE(*x, 1); + } + + exists + (r0=1 /\ r1=0) + +This should not happen; but a hypothetical atomic_inc_acquire() -- +(void)atomic_fetch_inc_acquire() for instance -- would allow the outcome, +since then: + + P1 P2 + + t = LL.acq *y (0) + t++; + *x = 1; + r0 = *x (1) + RMB + r1 = *y (0) + SC *y, t; + +is allowed. diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt new file mode 100644 index 000000000000..bdf1423d5f99 --- /dev/null +++ b/Documentation/locking/crossrelease.txt @@ -0,0 +1,874 @@ +Crossrelease +============ + +Started by Byungchul Park <byungchul.park@lge.com> + +Contents: + + (*) Background + + - What causes deadlock + - How lockdep works + + (*) Limitation + + - Limit lockdep + - Pros from the limitation + - Cons from the limitation + - Relax the limitation + + (*) Crossrelease + + - Introduce crossrelease + - Introduce commit + + (*) Implementation + + - Data structures + - How crossrelease works + + (*) Optimizations + + - Avoid duplication + - Lockless for hot paths + + (*) APPENDIX A: What lockdep does to work aggresively + + (*) APPENDIX B: How to avoid adding false dependencies + + +========== +Background +========== + +What causes deadlock +-------------------- + +A deadlock occurs when a context is waiting for an event to happen, +which is impossible because another (or the) context who can trigger the +event is also waiting for another (or the) event to happen, which is +also impossible due to the same reason. + +For example: + + A context going to trigger event C is waiting for event A to happen. + A context going to trigger event A is waiting for event B to happen. + A context going to trigger event B is waiting for event C to happen. + +A deadlock occurs when these three wait operations run at the same time, +because event C cannot be triggered if event A does not happen, which in +turn cannot be triggered if event B does not happen, which in turn +cannot be triggered if event C does not happen. After all, no event can +be triggered since any of them never meets its condition to wake up. + +A dependency might exist between two waiters and a deadlock might happen +due to an incorrect releationship between dependencies. Thus, we must +define what a dependency is first. A dependency exists between them if: + + 1. There are two waiters waiting for each event at a given time. + 2. The only way to wake up each waiter is to trigger its event. + 3. Whether one can be woken up depends on whether the other can. + +Each wait in the example creates its dependency like: + + Event C depends on event A. + Event A depends on event B. + Event B depends on event C. + + NOTE: Precisely speaking, a dependency is one between whether a + waiter for an event can be woken up and whether another waiter for + another event can be woken up. However from now on, we will describe + a dependency as if it's one between an event and another event for + simplicity. + +And they form circular dependencies like: + + -> C -> A -> B - + / \ + \ / + ---------------- + + where 'A -> B' means that event A depends on event B. + +Such circular dependencies lead to a deadlock since no waiter can meet +its condition to wake up as described. + +CONCLUSION + +Circular dependencies cause a deadlock. + + +How lockdep works +----------------- + +Lockdep tries to detect a deadlock by checking dependencies created by +lock operations, acquire and release. Waiting for a lock corresponds to +waiting for an event, and releasing a lock corresponds to triggering an +event in the previous section. + +In short, lockdep does: + + 1. Detect a new dependency. + 2. Add the dependency into a global graph. + 3. Check if that makes dependencies circular. + 4. Report a deadlock or its possibility if so. + +For example, consider a graph built by lockdep that looks like: + + A -> B - + \ + -> E + / + C -> D - + + where A, B,..., E are different lock classes. + +Lockdep will add a dependency into the graph on detection of a new +dependency. For example, it will add a dependency 'E -> C' when a new +dependency between lock E and lock C is detected. Then the graph will be: + + A -> B - + \ + -> E - + / \ + -> C -> D - \ + / / + \ / + ------------------ + + where A, B,..., E are different lock classes. + +This graph contains a subgraph which demonstrates circular dependencies: + + -> E - + / \ + -> C -> D - \ + / / + \ / + ------------------ + + where C, D and E are different lock classes. + +This is the condition under which a deadlock might occur. Lockdep +reports it on detection after adding a new dependency. This is the way +how lockdep works. + +CONCLUSION + +Lockdep detects a deadlock or its possibility by checking if circular +dependencies were created after adding each new dependency. + + +========== +Limitation +========== + +Limit lockdep +------------- + +Limiting lockdep to work on only typical locks e.g. spin locks and +mutexes, which are released within the acquire context, the +implementation becomes simple but its capacity for detection becomes +limited. Let's check pros and cons in next section. + + +Pros from the limitation +------------------------ + +Given the limitation, when acquiring a lock, locks in a held_locks +cannot be released if the context cannot acquire it so has to wait to +acquire it, which means all waiters for the locks in the held_locks are +stuck. It's an exact case to create dependencies between each lock in +the held_locks and the lock to acquire. + +For example: + + CONTEXT X + --------- + acquire A + acquire B /* Add a dependency 'A -> B' */ + release B + release A + + where A and B are different lock classes. + +When acquiring lock A, the held_locks of CONTEXT X is empty thus no +dependency is added. But when acquiring lock B, lockdep detects and adds +a new dependency 'A -> B' between lock A in the held_locks and lock B. +They can be simply added whenever acquiring each lock. + +And data required by lockdep exists in a local structure, held_locks +embedded in task_struct. Forcing to access the data within the context, +lockdep can avoid racy problems without explicit locks while handling +the local data. + +Lastly, lockdep only needs to keep locks currently being held, to build +a dependency graph. However, relaxing the limitation, it needs to keep +even locks already released, because a decision whether they created +dependencies might be long-deferred. + +To sum up, we can expect several advantages from the limitation: + + 1. Lockdep can easily identify a dependency when acquiring a lock. + 2. Races are avoidable while accessing local locks in a held_locks. + 3. Lockdep only needs to keep locks currently being held. + +CONCLUSION + +Given the limitation, the implementation becomes simple and efficient. + + +Cons from the limitation +------------------------ + +Given the limitation, lockdep is applicable only to typical locks. For +example, page locks for page access or completions for synchronization +cannot work with lockdep. + +Can we detect deadlocks below, under the limitation? + +Example 1: + + CONTEXT X CONTEXT Y CONTEXT Z + --------- --------- ---------- + mutex_lock A + lock_page B + lock_page B + mutex_lock A /* DEADLOCK */ + unlock_page B held by X + unlock_page B + mutex_unlock A + mutex_unlock A + + where A and B are different lock classes. + +No, we cannot. + +Example 2: + + CONTEXT X CONTEXT Y + --------- --------- + mutex_lock A + mutex_lock A + wait_for_complete B /* DEADLOCK */ + complete B + mutex_unlock A + mutex_unlock A + + where A is a lock class and B is a completion variable. + +No, we cannot. + +CONCLUSION + +Given the limitation, lockdep cannot detect a deadlock or its +possibility caused by page locks or completions. + + +Relax the limitation +-------------------- + +Under the limitation, things to create dependencies are limited to +typical locks. However, synchronization primitives like page locks and +completions, which are allowed to be released in any context, also +create dependencies and can cause a deadlock. So lockdep should track +these locks to do a better job. We have to relax the limitation for +these locks to work with lockdep. + +Detecting dependencies is very important for lockdep to work because +adding a dependency means adding an opportunity to check whether it +causes a deadlock. The more lockdep adds dependencies, the more it +thoroughly works. Thus Lockdep has to do its best to detect and add as +many true dependencies into a graph as possible. + +For example, considering only typical locks, lockdep builds a graph like: + + A -> B - + \ + -> E + / + C -> D - + + where A, B,..., E are different lock classes. + +On the other hand, under the relaxation, additional dependencies might +be created and added. Assuming additional 'FX -> C' and 'E -> GX' are +added thanks to the relaxation, the graph will be: + + A -> B - + \ + -> E -> GX + / + FX -> C -> D - + + where A, B,..., E, FX and GX are different lock classes, and a suffix + 'X' is added on non-typical locks. + +The latter graph gives us more chances to check circular dependencies +than the former. However, it might suffer performance degradation since +relaxing the limitation, with which design and implementation of lockdep +can be efficient, might introduce inefficiency inevitably. So lockdep +should provide two options, strong detection and efficient detection. + +Choosing efficient detection: + + Lockdep works with only locks restricted to be released within the + acquire context. However, lockdep works efficiently. + +Choosing strong detection: + + Lockdep works with all synchronization primitives. However, lockdep + suffers performance degradation. + +CONCLUSION + +Relaxing the limitation, lockdep can add additional dependencies giving +additional opportunities to check circular dependencies. + + +============ +Crossrelease +============ + +Introduce crossrelease +---------------------- + +In order to allow lockdep to handle additional dependencies by what +might be released in any context, namely 'crosslock', we have to be able +to identify those created by crosslocks. The proposed 'crossrelease' +feature provoides a way to do that. + +Crossrelease feature has to do: + + 1. Identify dependencies created by crosslocks. + 2. Add the dependencies into a dependency graph. + +That's all. Once a meaningful dependency is added into graph, then +lockdep would work with the graph as it did. The most important thing +crossrelease feature has to do is to correctly identify and add true +dependencies into the global graph. + +A dependency e.g. 'A -> B' can be identified only in the A's release +context because a decision required to identify the dependency can be +made only in the release context. That is to decide whether A can be +released so that a waiter for A can be woken up. It cannot be made in +other than the A's release context. + +It's no matter for typical locks because each acquire context is same as +its release context, thus lockdep can decide whether a lock can be +released in the acquire context. However for crosslocks, lockdep cannot +make the decision in the acquire context but has to wait until the +release context is identified. + +Therefore, deadlocks by crosslocks cannot be detected just when it +happens, because those cannot be identified until the crosslocks are +released. However, deadlock possibilities can be detected and it's very +worth. See 'APPENDIX A' section to check why. + +CONCLUSION + +Using crossrelease feature, lockdep can work with what might be released +in any context, namely crosslock. + + +Introduce commit +---------------- + +Since crossrelease defers the work adding true dependencies of +crosslocks until they are actually released, crossrelease has to queue +all acquisitions which might create dependencies with the crosslocks. +Then it identifies dependencies using the queued data in batches at a +proper time. We call it 'commit'. + +There are four types of dependencies: + +1. TT type: 'typical lock A -> typical lock B' + + Just when acquiring B, lockdep can see it's in the A's release + context. So the dependency between A and B can be identified + immediately. Commit is unnecessary. + +2. TC type: 'typical lock A -> crosslock BX' + + Just when acquiring BX, lockdep can see it's in the A's release + context. So the dependency between A and BX can be identified + immediately. Commit is unnecessary, too. + +3. CT type: 'crosslock AX -> typical lock B' + + When acquiring B, lockdep cannot identify the dependency because + there's no way to know if it's in the AX's release context. It has + to wait until the decision can be made. Commit is necessary. + +4. CC type: 'crosslock AX -> crosslock BX' + + When acquiring BX, lockdep cannot identify the dependency because + there's no way to know if it's in the AX's release context. It has + to wait until the decision can be made. Commit is necessary. + But, handling CC type is not implemented yet. It's a future work. + +Lockdep can work without commit for typical locks, but commit step is +necessary once crosslocks are involved. Introducing commit, lockdep +performs three steps. What lockdep does in each step is: + +1. Acquisition: For typical locks, lockdep does what it originally did + and queues the lock so that CT type dependencies can be checked using + it at the commit step. For crosslocks, it saves data which will be + used at the commit step and increases a reference count for it. + +2. Commit: No action is reauired for typical locks. For crosslocks, + lockdep adds CT type dependencies using the data saved at the + acquisition step. + +3. Release: No changes are required for typical locks. When a crosslock + is released, it decreases a reference count for it. + +CONCLUSION + +Crossrelease introduces commit step to handle dependencies of crosslocks +in batches at a proper time. + + +============== +Implementation +============== + +Data structures +--------------- + +Crossrelease introduces two main data structures. + +1. hist_lock + + This is an array embedded in task_struct, for keeping lock history so + that dependencies can be added using them at the commit step. Since + it's local data, it can be accessed locklessly in the owner context. + The array is filled at the acquisition step and consumed at the + commit step. And it's managed in circular manner. + +2. cross_lock + + One per lockdep_map exists. This is for keeping data of crosslocks + and used at the commit step. + + +How crossrelease works +---------------------- + +It's the key of how crossrelease works, to defer necessary works to an +appropriate point in time and perform in at once at the commit step. +Let's take a look with examples step by step, starting from how lockdep +works without crossrelease for typical locks. + + acquire A /* Push A onto held_locks */ + acquire B /* Push B onto held_locks and add 'A -> B' */ + acquire C /* Push C onto held_locks and add 'B -> C' */ + release C /* Pop C from held_locks */ + release B /* Pop B from held_locks */ + release A /* Pop A from held_locks */ + + where A, B and C are different lock classes. + + NOTE: This document assumes that readers already understand how + lockdep works without crossrelease thus omits details. But there's + one thing to note. Lockdep pretends to pop a lock from held_locks + when releasing it. But it's subtly different from the original pop + operation because lockdep allows other than the top to be poped. + +In this case, lockdep adds 'the top of held_locks -> the lock to acquire' +dependency every time acquiring a lock. + +After adding 'A -> B', a dependency graph will be: + + A -> B + + where A and B are different lock classes. + +And after adding 'B -> C', the graph will be: + + A -> B -> C + + where A, B and C are different lock classes. + +Let's performs commit step even for typical locks to add dependencies. +Of course, commit step is not necessary for them, however, it would work +well because this is a more general way. + + acquire A + /* + * Queue A into hist_locks + * + * In hist_locks: A + * In graph: Empty + */ + + acquire B + /* + * Queue B into hist_locks + * + * In hist_locks: A, B + * In graph: Empty + */ + + acquire C + /* + * Queue C into hist_locks + * + * In hist_locks: A, B, C + * In graph: Empty + */ + + commit C + /* + * Add 'C -> ?' + * Answer the following to decide '?' + * What has been queued since acquire C: Nothing + * + * In hist_locks: A, B, C + * In graph: Empty + */ + + release C + + commit B + /* + * Add 'B -> ?' + * Answer the following to decide '?' + * What has been queued since acquire B: C + * + * In hist_locks: A, B, C + * In graph: 'B -> C' + */ + + release B + + commit A + /* + * Add 'A -> ?' + * Answer the following to decide '?' + * What has been queued since acquire A: B, C + * + * In hist_locks: A, B, C + * In graph: 'B -> C', 'A -> B', 'A -> C' + */ + + release A + + where A, B and C are different lock classes. + +In this case, dependencies are added at the commit step as described. + +After commits for A, B and C, the graph will be: + + A -> B -> C + + where A, B and C are different lock classes. + + NOTE: A dependency 'A -> C' is optimized out. + +We can see the former graph built without commit step is same as the +latter graph built using commit steps. Of course the former way leads to +earlier finish for building the graph, which means we can detect a +deadlock or its possibility sooner. So the former way would be prefered +when possible. But we cannot avoid using the latter way for crosslocks. + +Let's look at how commit steps work for crosslocks. In this case, the +commit step is performed only on crosslock AX as real. And it assumes +that the AX release context is different from the AX acquire context. + + BX RELEASE CONTEXT BX ACQUIRE CONTEXT + ------------------ ------------------ + acquire A + /* + * Push A onto held_locks + * Queue A into hist_locks + * + * In held_locks: A + * In hist_locks: A + * In graph: Empty + */ + + acquire BX + /* + * Add 'the top of held_locks -> BX' + * + * In held_locks: A + * In hist_locks: A + * In graph: 'A -> BX' + */ + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + It must be guaranteed that the following operations are seen after + acquiring BX globally. It can be done by things like barrier. + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + acquire C + /* + * Push C onto held_locks + * Queue C into hist_locks + * + * In held_locks: C + * In hist_locks: C + * In graph: 'A -> BX' + */ + + release C + /* + * Pop C from held_locks + * + * In held_locks: Empty + * In hist_locks: C + * In graph: 'A -> BX' + */ + acquire D + /* + * Push D onto held_locks + * Queue D into hist_locks + * Add 'the top of held_locks -> D' + * + * In held_locks: A, D + * In hist_locks: A, D + * In graph: 'A -> BX', 'A -> D' + */ + acquire E + /* + * Push E onto held_locks + * Queue E into hist_locks + * + * In held_locks: E + * In hist_locks: C, E + * In graph: 'A -> BX', 'A -> D' + */ + + release E + /* + * Pop E from held_locks + * + * In held_locks: Empty + * In hist_locks: D, E + * In graph: 'A -> BX', 'A -> D' + */ + release D + /* + * Pop D from held_locks + * + * In held_locks: A + * In hist_locks: A, D + * In graph: 'A -> BX', 'A -> D' + */ + commit BX + /* + * Add 'BX -> ?' + * What has been queued since acquire BX: C, E + * + * In held_locks: Empty + * In hist_locks: D, E + * In graph: 'A -> BX', 'A -> D', + * 'BX -> C', 'BX -> E' + */ + + release BX + /* + * In held_locks: Empty + * In hist_locks: D, E + * In graph: 'A -> BX', 'A -> D', + * 'BX -> C', 'BX -> E' + */ + release A + /* + * Pop A from held_locks + * + * In held_locks: Empty + * In hist_locks: A, D + * In graph: 'A -> BX', 'A -> D', + * 'BX -> C', 'BX -> E' + */ + + where A, BX, C,..., E are different lock classes, and a suffix 'X' is + added on crosslocks. + +Crossrelease considers all acquisitions after acqiuring BX are +candidates which might create dependencies with BX. True dependencies +will be determined when identifying the release context of BX. Meanwhile, +all typical locks are queued so that they can be used at the commit step. +And then two dependencies 'BX -> C' and 'BX -> E' are added at the +commit step when identifying the release context. + +The final graph will be, with crossrelease: + + -> C + / + -> BX - + / \ + A - -> E + \ + -> D + + where A, BX, C,..., E are different lock classes, and a suffix 'X' is + added on crosslocks. + +However, the final graph will be, without crossrelease: + + A -> D + + where A and D are different lock classes. + +The former graph has three more dependencies, 'A -> BX', 'BX -> C' and +'BX -> E' giving additional opportunities to check if they cause +deadlocks. This way lockdep can detect a deadlock or its possibility +caused by crosslocks. + +CONCLUSION + +We checked how crossrelease works with several examples. + + +============= +Optimizations +============= + +Avoid duplication +----------------- + +Crossrelease feature uses a cache like what lockdep already uses for +dependency chains, but this time it's for caching CT type dependencies. +Once that dependency is cached, the same will never be added again. + + +Lockless for hot paths +---------------------- + +To keep all locks for later use at the commit step, crossrelease adopts +a local array embedded in task_struct, which makes access to the data +lockless by forcing it to happen only within the owner context. It's +like how lockdep handles held_locks. Lockless implmentation is important +since typical locks are very frequently acquired and released. + + +================================================= +APPENDIX A: What lockdep does to work aggresively +================================================= + +A deadlock actually occurs when all wait operations creating circular +dependencies run at the same time. Even though they don't, a potential +deadlock exists if the problematic dependencies exist. Thus it's +meaningful to detect not only an actual deadlock but also its potential +possibility. The latter is rather valuable. When a deadlock occurs +actually, we can identify what happens in the system by some means or +other even without lockdep. However, there's no way to detect possiblity +without lockdep unless the whole code is parsed in head. It's terrible. +Lockdep does the both, and crossrelease only focuses on the latter. + +Whether or not a deadlock actually occurs depends on several factors. +For example, what order contexts are switched in is a factor. Assuming +circular dependencies exist, a deadlock would occur when contexts are +switched so that all wait operations creating the dependencies run +simultaneously. Thus to detect a deadlock possibility even in the case +that it has not occured yet, lockdep should consider all possible +combinations of dependencies, trying to: + +1. Use a global dependency graph. + + Lockdep combines all dependencies into one global graph and uses them, + regardless of which context generates them or what order contexts are + switched in. Aggregated dependencies are only considered so they are + prone to be circular if a problem exists. + +2. Check dependencies between classes instead of instances. + + What actually causes a deadlock are instances of lock. However, + lockdep checks dependencies between classes instead of instances. + This way lockdep can detect a deadlock which has not happened but + might happen in future by others but the same class. + +3. Assume all acquisitions lead to waiting. + + Although locks might be acquired without waiting which is essential + to create dependencies, lockdep assumes all acquisitions lead to + waiting since it might be true some time or another. + +CONCLUSION + +Lockdep detects not only an actual deadlock but also its possibility, +and the latter is more valuable. + + +================================================== +APPENDIX B: How to avoid adding false dependencies +================================================== + +Remind what a dependency is. A dependency exists if: + + 1. There are two waiters waiting for each event at a given time. + 2. The only way to wake up each waiter is to trigger its event. + 3. Whether one can be woken up depends on whether the other can. + +For example: + + acquire A + acquire B /* A dependency 'A -> B' exists */ + release B + release A + + where A and B are different lock classes. + +A depedency 'A -> B' exists since: + + 1. A waiter for A and a waiter for B might exist when acquiring B. + 2. Only way to wake up each is to release what it waits for. + 3. Whether the waiter for A can be woken up depends on whether the + other can. IOW, TASK X cannot release A if it fails to acquire B. + +For another example: + + TASK X TASK Y + ------ ------ + acquire AX + acquire B /* A dependency 'AX -> B' exists */ + release B + release AX held by Y + + where AX and B are different lock classes, and a suffix 'X' is added + on crosslocks. + +Even in this case involving crosslocks, the same rule can be applied. A +depedency 'AX -> B' exists since: + + 1. A waiter for AX and a waiter for B might exist when acquiring B. + 2. Only way to wake up each is to release what it waits for. + 3. Whether the waiter for AX can be woken up depends on whether the + other can. IOW, TASK X cannot release AX if it fails to acquire B. + +Let's take a look at more complicated example: + + TASK X TASK Y + ------ ------ + acquire B + release B + fork Y + acquire AX + acquire C /* A dependency 'AX -> C' exists */ + release C + release AX held by Y + + where AX, B and C are different lock classes, and a suffix 'X' is + added on crosslocks. + +Does a dependency 'AX -> B' exist? Nope. + +Two waiters are essential to create a dependency. However, waiters for +AX and B to create 'AX -> B' cannot exist at the same time in this +example. Thus the dependency 'AX -> B' cannot be created. + +It would be ideal if the full set of true ones can be considered. But +we can ensure nothing but what actually happened. Relying on what +actually happens at runtime, we can anyway add only true ones, though +they might be a subset of true ones. It's similar to how lockdep works +for typical locks. There might be more true dependencies than what +lockdep has detected in runtime. Lockdep has no choice but to rely on +what actually happens. Crossrelease also relies on it. + +CONCLUSION + +Relying on what actually happens, lockdep can avoid adding false +dependencies. diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index e2ee0a1c299a..b759a60624fd 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -498,11 +498,11 @@ And a couple of implicit varieties: This means that ACQUIRE acts as a minimal "acquire" operation and RELEASE acts as a minimal "release" operation. -A subset of the atomic operations described in core-api/atomic_ops.rst have -ACQUIRE and RELEASE variants in addition to fully-ordered and relaxed (no -barrier semantics) definitions. For compound atomics performing both a load -and a store, ACQUIRE semantics apply only to the load and RELEASE semantics -apply only to the store portion of the operation. +A subset of the atomic operations described in atomic_t.txt have ACQUIRE and +RELEASE variants in addition to fully-ordered and relaxed (no barrier +semantics) definitions. For compound atomics performing both a load and a +store, ACQUIRE semantics apply only to the load and RELEASE semantics apply +only to the store portion of the operation. Memory barriers are only required where there's a possibility of interaction between two CPUs or between a CPU and a device. If it can be guaranteed that @@ -1883,8 +1883,7 @@ There are some more advanced barrier functions: This makes sure that the death mark on the object is perceived to be set *before* the reference counter is decremented. - See Documentation/core-api/atomic_ops.rst for more information. See the - "Atomic operations" subsection for information on where to use these. + See Documentation/atomic_{t,bitops}.txt for more information. (*) lockless_dereference(); @@ -1989,10 +1988,7 @@ for each construct. These operations all imply certain barriers: ACQUIRE operation has completed. Memory operations issued before the ACQUIRE may be completed after - the ACQUIRE operation has completed. An smp_mb__before_spinlock(), - combined with a following ACQUIRE, orders prior stores against - subsequent loads and stores. Note that this is weaker than smp_mb()! - The smp_mb__before_spinlock() primitive is free on many architectures. + the ACQUIRE operation has completed. (2) RELEASE operation implication: @@ -2510,88 +2506,7 @@ operations are noted specially as some of them imply full memory barriers and some don't, but they're very heavily relied on as a group throughout the kernel. -Any atomic operation that modifies some state in memory and returns information -about the state (old or new) implies an SMP-conditional general memory barrier -(smp_mb()) on each side of the actual operation (with the exception of -explicit lock operations, described later). These include: - - xchg(); - atomic_xchg(); atomic_long_xchg(); - atomic_inc_return(); atomic_long_inc_return(); - atomic_dec_return(); atomic_long_dec_return(); - atomic_add_return(); atomic_long_add_return(); - atomic_sub_return(); atomic_long_sub_return(); - atomic_inc_and_test(); atomic_long_inc_and_test(); - atomic_dec_and_test(); atomic_long_dec_and_test(); - atomic_sub_and_test(); atomic_long_sub_and_test(); - atomic_add_negative(); atomic_long_add_negative(); - test_and_set_bit(); - test_and_clear_bit(); - test_and_change_bit(); - - /* when succeeds */ - cmpxchg(); - atomic_cmpxchg(); atomic_long_cmpxchg(); - atomic_add_unless(); atomic_long_add_unless(); - -These are used for such things as implementing ACQUIRE-class and RELEASE-class -operations and adjusting reference counters towards object destruction, and as -such the implicit memory barrier effects are necessary. - - -The following operations are potential problems as they do _not_ imply memory -barriers, but might be used for implementing such things as RELEASE-class -operations: - - atomic_set(); - set_bit(); - clear_bit(); - change_bit(); - -With these the appropriate explicit memory barrier should be used if necessary -(smp_mb__before_atomic() for instance). - - -The following also do _not_ imply memory barriers, and so may require explicit -memory barriers under some circumstances (smp_mb__before_atomic() for -instance): - - atomic_add(); - atomic_sub(); - atomic_inc(); - atomic_dec(); - -If they're used for statistics generation, then they probably don't need memory -barriers, unless there's a coupling between statistical data. - -If they're used for reference counting on an object to control its lifetime, -they probably don't need memory barriers because either the reference count -will be adjusted inside a locked section, or the caller will already hold -sufficient references to make the lock, and thus a memory barrier unnecessary. - -If they're used for constructing a lock of some description, then they probably -do need memory barriers as a lock primitive generally has to do things in a -specific order. - -Basically, each usage case has to be carefully considered as to whether memory -barriers are needed or not. - -The following operations are special locking primitives: - - test_and_set_bit_lock(); - clear_bit_unlock(); - __clear_bit_unlock(); - -These implement ACQUIRE-class and RELEASE-class operations. These should be -used in preference to other operations when implementing locking primitives, -because their implementations can be optimised on many architectures. - -[!] Note that special memory barrier primitives are available for these -situations because on some CPUs the atomic instructions used imply full memory -barriers, and so barrier instructions are superfluous in conjunction with them, -and in such cases the special barrier primitives will be no-ops. - -See Documentation/core-api/atomic_ops.rst for more information. +See Documentation/atomic_t.txt for more information. ACCESSING DEVICES diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt index b83dfa1c0602..ab16efe0c79d 100644 --- a/Documentation/static-keys.txt +++ b/Documentation/static-keys.txt @@ -149,6 +149,26 @@ static_branch_inc(), will change the branch back to true. Likewise, if the key is initialized false, a 'static_branch_inc()', will change the branch to true. And then a 'static_branch_dec()', will again make the branch false. +The state and the reference count can be retrieved with 'static_key_enabled()' +and 'static_key_count()'. In general, if you use these functions, they +should be protected with the same mutex used around the enable/disable +or increment/decrement function. + +Note that switching branches results in some locks being taken, +particularly the CPU hotplug lock (in order to avoid races against +CPUs being brought in the kernel whilst the kernel is getting +patched). Calling the static key API from within a hotplug notifier is +thus a sure deadlock recipe. In order to still allow use of the +functionnality, the following functions are provided: + + static_key_enable_cpuslocked() + static_key_disable_cpuslocked() + static_branch_enable_cpuslocked() + static_branch_disable_cpuslocked() + +These functions are *not* general purpose, and must only be used when +you really know that you're in the above context, and no other. + Where an array of keys is required, it can be defined as:: DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count); diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 38310dcd6620..bc80fc0e210f 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -1956,10 +1956,7 @@ MMIO 쓰기 배리어 뒤에 완료됩니다. ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에 - 완료될 수 있습니다. smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는 - 코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서 - 맞춥니다. 이건 smp_mb() 보다 완화된 것임을 기억하세요! 많은 아키텍쳐에서 - smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다. + 완료될 수 있습니다. (2) RELEASE 오퍼레이션의 영향: diff --git a/arch/Kconfig b/arch/Kconfig index 21d0089117fe..2520ca5b42eb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -931,6 +931,18 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool +config ARCH_HAS_REFCOUNT + bool + help + An architecture selects this when it has implemented refcount_t + using open coded assembly primitives that provide an optimized + refcount_t implementation, possibly at the expense of some full + refcount state checks of CONFIG_REFCOUNT_FULL=y. + + The refcount overflow check behavior, however, must be retained. + Catching overflows is the primary security concern for protecting + against bugs in reference counts. + config REFCOUNT_FULL bool "Perform full reference count validation at the expense of speed" help diff --git a/arch/alpha/include/asm/futex.h b/arch/alpha/include/asm/futex.h index fb01dfb760c2..05a70edd57b6 100644 --- a/arch/alpha/include/asm/futex.h +++ b/arch/alpha/include/asm/futex.h @@ -25,18 +25,10 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -62,17 +54,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 54b54da6384c..11859287c52a 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -123,6 +123,8 @@ static inline void atomic_set(atomic_t *v, int i) atomic_ops_unlock(flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #endif /* diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 11e1b1f3acda..eb887dd13e74 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -73,20 +73,11 @@ #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - #ifndef CONFIG_ARC_HAS_LLSC preempt_disable(); /* to guarantee atomic r-m-w of futex op */ #endif @@ -118,30 +109,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 6795368ad023..cc414382dab4 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -128,20 +128,10 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, #endif /* !SMP */ static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - #ifndef CONFIG_SMP preempt_disable(); #endif @@ -172,17 +162,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f32b42e8725d..5bb2fd4674e7 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -48,20 +48,10 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (int)(encoded_op << 8) >> 20; - int cmparg = (int)(encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1U << (oparg & 0x1f); - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -91,17 +81,9 @@ futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index f445bd7f2b9f..95ad7102b63c 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -310,14 +310,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw) #define arch_read_relax(lock) cpu_relax() #define arch_write_relax(lock) cpu_relax() -/* - * Accesses appearing in program order before a spin_lock() operation - * can be reordered with accesses inside the critical section, by virtue - * of arch_spin_lock being constructed using acquire semantics. - * - * In cases where this is problematic (e.g. try_to_wake_up), an - * smp_mb__before_spinlock() can restore the required ordering. - */ -#define smp_mb__before_spinlock() smp_mb() +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 2e1da71e27a4..ab346f5f8820 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -7,7 +7,8 @@ #include <asm/errno.h> #include <linux/uaccess.h> -extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); +extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr); static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c index d155ca9e5098..37f7b2bf7f73 100644 --- a/arch/frv/kernel/futex.c +++ b/arch/frv/kernel/futex.c @@ -186,20 +186,10 @@ static inline int atomic_futex_op_xchg_xor(int oparg, u32 __user *uaddr, int *_o /* * do the futex operations */ -int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -225,18 +215,9 @@ int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; break; - } - } + if (!ret) + *oval = oldval; return ret; -} /* end futex_atomic_op_inuser() */ +} /* end arch_futex_atomic_op_inuser() */ diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h index a62ba368b27d..fb3dfb2a667e 100644 --- a/arch/hexagon/include/asm/atomic.h +++ b/arch/hexagon/include/asm/atomic.h @@ -42,6 +42,8 @@ static inline void atomic_set(atomic_t *v, int new) ); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /** * atomic_read - reads a word, atomically * @v: pointer to atomic value diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index 7e597f8434da..c607b77c8215 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -31,18 +31,9 @@ static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; pagefault_disable(); @@ -72,30 +63,9 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/ia64/include/asm/futex.h b/arch/ia64/include/asm/futex.h index 76acbcd5c060..6d67dc1eaf2b 100644 --- a/arch/ia64/include/asm/futex.h +++ b/arch/ia64/include/asm/futex.h @@ -45,18 +45,9 @@ do { \ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -84,17 +75,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/metag/include/asm/atomic_lock1.h b/arch/metag/include/asm/atomic_lock1.h index 6c1380a8a0d4..eee779f26cc4 100644 --- a/arch/metag/include/asm/atomic_lock1.h +++ b/arch/metag/include/asm/atomic_lock1.h @@ -37,6 +37,8 @@ static inline int atomic_set(atomic_t *v, int i) return i; } +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define ATOMIC_OP(op, c_op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ diff --git a/arch/microblaze/include/asm/futex.h b/arch/microblaze/include/asm/futex.h index 01848f056f43..a9dad9e5e132 100644 --- a/arch/microblaze/include/asm/futex.h +++ b/arch/microblaze/include/asm/futex.h @@ -29,18 +29,9 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,30 +57,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h index 1de190bdfb9c..a9e61ea54ca9 100644 --- a/arch/mips/include/asm/futex.h +++ b/arch/mips/include/asm/futex.h @@ -83,18 +83,9 @@ } static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -125,17 +116,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 6bace7695788..c7cbddfcdc3b 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static DEFINE_PER_CPU(atomic_t, tick_broadcast_count); -static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd); +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); void tick_broadcast(const struct cpumask *mask) { atomic_t *count; - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for_each_cpu(cpu, mask) { @@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info) static int __init tick_broadcast_init(void) { - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/arch/openrisc/include/asm/futex.h b/arch/openrisc/include/asm/futex.h index 778087341977..8fed278a24b8 100644 --- a/arch/openrisc/include/asm/futex.h +++ b/arch/openrisc/include/asm/futex.h @@ -30,20 +30,10 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -68,30 +58,9 @@ futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h index 5394b9c5f914..17b98a87e5e2 100644 --- a/arch/parisc/include/asm/atomic.h +++ b/arch/parisc/include/asm/atomic.h @@ -65,6 +65,8 @@ static __inline__ void atomic_set(atomic_t *v, int i) _atomic_spin_unlock_irqrestore(v, flags); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + static __inline__ int atomic_read(const atomic_t *v) { return READ_ONCE((v)->counter); diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index 0ba14300cd8e..c601aab2fb36 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -32,22 +32,12 @@ _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags) } static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { unsigned long int flags; - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval, ret; u32 tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr))) - return -EFAULT; - _futex_spin_lock_irqsave(uaddr, &flags); pagefault_disable(); @@ -85,17 +75,9 @@ out_pagefault_enable: pagefault_enable(); _futex_spin_unlock_irqrestore(uaddr, &flags); - if (ret == 0) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index 25d42bd3f114..9c601adfc500 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,13 +74,6 @@ do { \ ___p1; \ }) -/* - * This must resolve to hwsync on SMP for the context switch path. - * See _switch, and core scheduler context switch memory ordering - * comments. - */ -#define smp_mb__before_spinlock() smp_mb() - #include <asm-generic/barrier.h> #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index eaada6c92344..719ed9b61ea7 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -29,18 +29,10 @@ : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ : "cr0", "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,17 +58,9 @@ static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index d256e448ea49..edbe571bcc54 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -309,5 +309,8 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) #define arch_read_relax(lock) __rw_yield(lock) #define arch_write_relax(lock) __rw_yield(lock) +/* See include/linux/spinlock.h */ +#define smp_mb__after_spinlock() smp_mb() + #endif /* __KERNEL__ */ #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index a4811aa0304d..8f8eec9e1198 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -21,17 +21,12 @@ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc"); -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, newval, ret; load_kernel_asce(); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; pagefault_disable(); switch (op) { @@ -60,17 +55,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/sh/include/asm/futex.h b/arch/sh/include/asm/futex.h index d0078747d308..8f8cf941a8cd 100644 --- a/arch/sh/include/asm/futex.h +++ b/arch/sh/include/asm/futex.h @@ -27,21 +27,12 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); } -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - u32 oparg = (encoded_op << 8) >> 20; - u32 cmparg = (encoded_op << 20) >> 20; u32 oldval, newval, prev; int ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); do { @@ -80,17 +71,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break; - case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break; - case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break; - case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; return ret; } diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index ee3f11c43cda..7643e979e333 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -29,6 +29,8 @@ int atomic_xchg(atomic_t *, int); int __atomic_add_unless(atomic_t *, int, int); void atomic_set(atomic_t *, int); +#define atomic_set_release(v, i) atomic_set((v), (i)) + #define atomic_read(v) ACCESS_ONCE((v)->counter) #define atomic_add(i, v) ((void)atomic_add_return( (int)(i), (v))) diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index 4e899b0dabf7..1cfd89d92208 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -29,22 +29,14 @@ : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; if (unlikely((((unsigned long) uaddr) & 0x3UL))) return -EINVAL; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - pagefault_disable(); switch (op) { @@ -69,17 +61,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/tile/include/asm/atomic_32.h b/arch/tile/include/asm/atomic_32.h index a93774255136..53a423e7cb92 100644 --- a/arch/tile/include/asm/atomic_32.h +++ b/arch/tile/include/asm/atomic_32.h @@ -101,6 +101,8 @@ static inline void atomic_set(atomic_t *v, int n) _atomic_xchg(&v->counter, n); } +#define atomic_set_release(v, i) atomic_set((v), (i)) + /* A 64bit atomic type */ typedef struct { diff --git a/arch/tile/include/asm/futex.h b/arch/tile/include/asm/futex.h index e64a1b75fc38..83c1e639b411 100644 --- a/arch/tile/include/asm/futex.h +++ b/arch/tile/include/asm/futex.h @@ -106,12 +106,9 @@ lock = __atomic_hashed_lock((int __force *)uaddr) #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int uninitialized_var(val), ret; __futex_prolog(); @@ -119,12 +116,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) /* The 32-bit futex code makes this assumption, so validate it here. */ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int)); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -148,30 +139,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (val == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (val != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (val < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (val >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (val <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (val > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = val; + return ret; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9c95aa417e9b..cce15191e9e9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -55,6 +55,8 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + # Causing hangs/crashes, see the commit that added this change for details. + select ARCH_HAS_REFCOUNT if BROKEN select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7a9df3beb89b..676ee5807d86 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -74,6 +74,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -123,6 +126,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 33380b871463..0874ebda3069 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -197,35 +197,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline void atomic_or(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,10 +260,12 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, &c, c + a)); + return c; } diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..9e206f31ce2a 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -312,37 +312,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..5d9de36a2f04 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -177,7 +177,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } @@ -198,7 +198,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -217,7 +217,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +226,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d061e8..b5069e802d5c 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -157,7 +157,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ + __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index b4c1f5453436..f4dc9b63bdda 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -41,20 +41,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +71,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..ff871210b9f2 --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,109 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from + * PaX/grsecurity. + */ +#include <linux/refcount.h> + +/* + * This is the first portion of the refcount error handling, which lives in + * .text.unlikely, and is jumped to from the CPU flag check (in the + * following macros). This saves the refcount value location into CX for + * the exception handler to use (in mm/extable.c), and then triggers the + * central refcount exception. The fixup address for the exception points + * back to the regular execution flow in .text. + */ +#define _REFCOUNT_EXCEPTION \ + ".pushsection .text.unlikely\n" \ + "111:\tlea %[counter], %%" _ASM_CX "\n" \ + "112:\t" ASM_UD0 "\n" \ + ASM_UNREACHABLE \ + ".popsection\n" \ + "113:\n" \ + _ASM_EXTABLE_REFCOUNT(112b, 113b) + +/* Trigger refcount exception if refcount result is negative. */ +#define REFCOUNT_CHECK_LT_ZERO \ + "js 111f\n\t" \ + _REFCOUNT_EXCEPTION + +/* Trigger refcount exception if refcount result is zero or negative. */ +#define REFCOUNT_CHECK_LE_ZERO \ + "jz 111f\n\t" \ + REFCOUNT_CHECK_LT_ZERO + +/* Trigger refcount exception unconditionally. */ +#define REFCOUNT_ERROR \ + "jmp 111f\n\t" \ + _REFCOUNT_EXCEPTION + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_LE_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "%0", e); +} + +static __always_inline __must_check +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + int c, result; + + c = atomic_read(&(r->refs)); + do { + if (unlikely(c == 0)) + return false; + + result = c + i; + + /* Did we try to increment from/to an undesirable state? */ + if (unlikely(c < 0 || c == INT_MAX || result < c)) { + asm volatile(REFCOUNT_ERROR + : : [counter] "m" (r->refs.counter) + : "cc", "cx"); + break; + } + + } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); + + return c != 0; +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return refcount_add_not_zero(1, r); +} + +#endif diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index fb2ddcdf7c73..c076f710de4c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -36,6 +36,48 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). + * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index b39531babec0..eaaf1ebcc7a4 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -44,18 +44,10 @@ : "r" (uaddr), "I" (-EFAULT), "r" (oparg) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; #if !XCHAL_HAVE_S32C1I return -ENOSYS; @@ -89,19 +81,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (ret) - return ret; + if (!ret) + *oval = oldval; - switch (cmp) { - case FUTEX_OP_CMP_EQ: return (oldval == cmparg); - case FUTEX_OP_CMP_NE: return (oldval != cmparg); - case FUTEX_OP_CMP_LT: return (oldval < cmparg); - case FUTEX_OP_CMP_GE: return (oldval >= cmparg); - case FUTEX_OP_CMP_LE: return (oldval <= cmparg); - case FUTEX_OP_CMP_GT: return (oldval > cmparg); - } - - return -ENOSYS; + return ret; } static inline int diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 87b7df4851bf..07125e7941f4 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -60,7 +60,7 @@ static void trigger_softirq(void *data) static int raise_blk_irq(int cpu, struct request *rq) { if (cpu_online(cpu)) { - struct call_single_data *data = &rq->csd; + call_single_data_t *data = &rq->csd; data->func = trigger_softirq; data->info = rq; diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 19182d091587..1893e416e7c0 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2884,7 +2884,7 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc) * need to be interruptible while waiting. */ INIT_WORK_ONSTACK(&flush.work, flush_probe); - COMPLETION_INITIALIZER_ONSTACK(flush.cmp); + init_completion(&flush.cmp); queue_work(nfit_wq, &flush.work); mutex_unlock(&acpi_desc->init_mutex); diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 85c24cace973..81142ce781da 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -13,7 +13,7 @@ struct nullb_cmd { struct list_head list; struct llist_node ll_list; - struct call_single_data csd; + call_single_data_t csd; struct request *rq; struct bio *bio; unsigned int tag; diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 72bbfccef113..fd4b7f684bd0 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -455,7 +455,11 @@ void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa per_cpu(timer_unstable_counter_workaround, i) = wa; } - static_branch_enable(&arch_timer_read_ool_enabled); + /* + * Use the locked version, as we're called from the CPU + * hotplug framework. Otherwise, we end-up in deadlock-land. + */ + static_branch_enable_cpuslocked(&arch_timer_read_ool_enabled); /* * Don't use the vdso fastpath if errata require using the diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 71e586d7df71..147f38ea0fcd 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -119,13 +119,13 @@ struct cpuidle_coupled { #define CPUIDLE_COUPLED_NOT_IDLE (-1) -static DEFINE_PER_CPU(struct call_single_data, cpuidle_coupled_poke_cb); +static DEFINE_PER_CPU(call_single_data_t, cpuidle_coupled_poke_cb); /* * The cpuidle_coupled_poke_pending mask is used to avoid calling - * __smp_call_function_single with the per cpu call_single_data struct already + * __smp_call_function_single with the per cpu call_single_data_t struct already * in use. This prevents a deadlock where two cpus are waiting for each others - * call_single_data struct to be available + * call_single_data_t struct to be available */ static cpumask_t cpuidle_coupled_poke_pending; @@ -339,7 +339,7 @@ static void cpuidle_coupled_handle_poke(void *info) */ static void cpuidle_coupled_poke(int cpu) { - struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); + call_single_data_t *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending)) smp_call_function_single_async(cpu, csd); @@ -651,7 +651,7 @@ int cpuidle_coupled_register_device(struct cpuidle_device *dev) { int cpu; struct cpuidle_device *other_dev; - struct call_single_data *csd; + call_single_data_t *csd; struct cpuidle_coupled *coupled; if (cpumask_empty(&dev->coupled_cpus)) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 48572b157222..a36216bd2a84 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -28,6 +28,7 @@ #include <linux/debugfs.h> #include <linux/sort.h> +#include <linux/sched/mm.h> #include "intel_drv.h" static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node) @@ -4305,7 +4306,7 @@ i915_drop_caches_set(void *data, u64 val) mutex_unlock(&dev->struct_mutex); } - lockdep_set_current_reclaim_state(GFP_KERNEL); + fs_reclaim_acquire(GFP_KERNEL); if (val & DROP_BOUND) i915_gem_shrink(dev_priv, LONG_MAX, I915_SHRINK_BOUND); @@ -4314,7 +4315,7 @@ i915_drop_caches_set(void *data, u64 val) if (val & DROP_SHRINK_ALL) i915_gem_shrink_all(dev_priv); - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(GFP_KERNEL); if (val & DROP_FREED) { synchronize_rcu(); diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 51583ae4b1eb..120b6e537b28 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -2468,7 +2468,7 @@ static void liquidio_napi_drv_callback(void *arg) if (OCTEON_CN23XX_PF(oct) || droq->cpu_id == this_cpu) { napi_schedule_irqoff(&droq->napi); } else { - struct call_single_data *csd = &droq->csd; + call_single_data_t *csd = &droq->csd; csd->func = napi_schedule_wrapper; csd->info = &droq->napi; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h index 6efd139b894d..f91bc84d1719 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.h @@ -328,7 +328,7 @@ struct octeon_droq { u32 cpu_id; - struct call_single_data csd; + call_single_data_t csd; }; #define OCT_DROQ_SIZE (sizeof(struct octeon_droq)) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 3d424a51cabb..f0fd3adb1693 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -446,14 +446,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); - smp_mb__before_spinlock(); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { inode_unlock(inode); return PTR_ERR(realfile); } - od->upperfile = realfile; + smp_store_release(&od->upperfile, realfile); } else { /* somebody has beaten us to it */ if (!IS_ERR(realfile)) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index b0d5897bc4e6..886085b47c75 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, goto out; WRITE_ONCE(uwq->waken, true); /* - * The implicit smp_mb__before_spinlock in try_to_wake_up() - * renders uwq->waken visible to other CPUs before the task is - * waken. + * The Program-Order guarantees provided by the scheduler + * ensure uwq->waken is visible before the task is woken. */ ret = wake_up_state(wq->private, mode); - if (ret) + if (ret) { /* * Wake only once, autoremove behavior. * - * After the effect of list_del_init is visible to the - * other CPUs, the waitqueue may disappear from under - * us, see the !list_empty_careful() in - * handle_userfault(). try_to_wake_up() has an - * implicit smp_mb__before_spinlock, and the - * wq->private is read before calling the extern - * function "wake_up_state" (which in turns calls - * try_to_wake_up). While the spin_lock;spin_unlock; - * wouldn't be enough, the smp_mb__before_spinlock is - * enough to avoid an explicit smp_mb() here. + * After the effect of list_del_init is visible to the other + * CPUs, the waitqueue may disappear from under us, see the + * !list_empty_careful() in handle_userfault(). + * + * try_to_wake_up() has an implicit smp_mb(), and the + * wq->private is read before calling the extern function + * "wake_up_state" (which in turns calls try_to_wake_up). */ list_del_init(&wq->entry); + } out: return ret; } diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h index dad68bf46c77..8d28eb010d0d 100644 --- a/include/asm-generic/atomic64.h +++ b/include/asm-generic/atomic64.h @@ -21,6 +21,8 @@ typedef struct { extern long long atomic64_read(const atomic64_t *v); extern void atomic64_set(atomic64_t *v, long long i); +#define atomic64_set_release(v, i) atomic64_set((v), (i)) + #define ATOMIC64_OP(op) \ extern void atomic64_##op(long long a, atomic64_t *v); diff --git a/include/asm-generic/futex.h b/include/asm-generic/futex.h index bf2d34c9d804..f0d8b1c51343 100644 --- a/include/asm-generic/futex.h +++ b/include/asm-generic/futex.h @@ -13,7 +13,7 @@ */ /** - * futex_atomic_op_inuser() - Atomic arithmetic operation with constant + * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant * argument and comparison of the previous * futex value with another constant. * @@ -25,18 +25,11 @@ * <0 - On error */ static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval, ret; u32 tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - preempt_disable(); pagefault_disable(); @@ -74,17 +67,9 @@ out_pagefault_enable: pagefault_enable(); preempt_enable(); - if (ret == 0) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (ret == 0) + *oval = oldval; + return ret; } @@ -126,18 +111,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, #else static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -153,17 +129,9 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/include/linux/atomic.h b/include/linux/atomic.h index c56be7410130..40d6bfec0e0d 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -38,6 +38,9 @@ * Besides, if an arch has a special barrier for acquire/release, it could * implement its own __atomic_op_* and use the same framework for building * variants + * + * If an architecture overrides __atomic_op_acquire() it will probably want + * to define smp_mb__after_spinlock(). */ #ifndef __atomic_op_acquire #define __atomic_op_acquire(op, args...) \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2a5d52fa90f5..4b99b13c7e68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -134,7 +134,7 @@ typedef __u32 __bitwise req_flags_t; struct request { struct list_head queuelist; union { - struct call_single_data csd; + call_single_data_t csd; u64 fifo_time; }; diff --git a/include/linux/completion.h b/include/linux/completion.h index 5d5aaae3af43..cae5400022a3 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -9,6 +9,9 @@ */ #include <linux/wait.h> +#ifdef CONFIG_LOCKDEP_COMPLETIONS +#include <linux/lockdep.h> +#endif /* * struct completion - structure used to maintain state for a "completion" @@ -25,13 +28,53 @@ struct completion { unsigned int done; wait_queue_head_t wait; +#ifdef CONFIG_LOCKDEP_COMPLETIONS + struct lockdep_map_cross map; +#endif }; +#ifdef CONFIG_LOCKDEP_COMPLETIONS +static inline void complete_acquire(struct completion *x) +{ + lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_); +} + +static inline void complete_release(struct completion *x) +{ + lock_release((struct lockdep_map *)&x->map, 0, _RET_IP_); +} + +static inline void complete_release_commit(struct completion *x) +{ + lock_commit_crosslock((struct lockdep_map *)&x->map); +} + +#define init_completion(x) \ +do { \ + static struct lock_class_key __key; \ + lockdep_init_map_crosslock((struct lockdep_map *)&(x)->map, \ + "(complete)" #x, \ + &__key, 0); \ + __init_completion(x); \ +} while (0) +#else +#define init_completion(x) __init_completion(x) +static inline void complete_acquire(struct completion *x) {} +static inline void complete_release(struct completion *x) {} +static inline void complete_release_commit(struct completion *x) {} +#endif + +#ifdef CONFIG_LOCKDEP_COMPLETIONS +#define COMPLETION_INITIALIZER(work) \ + { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ + STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) } +#else #define COMPLETION_INITIALIZER(work) \ { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } +#endif #define COMPLETION_INITIALIZER_ONSTACK(work) \ - ({ init_completion(&work); work; }) + (*({ init_completion(&work); &work; })) /** * DECLARE_COMPLETION - declare and initialize a completion structure @@ -70,7 +113,7 @@ struct completion { * This inline function will initialize a dynamically created completion * structure. */ -static inline void init_completion(struct completion *x) +static inline void __init_completion(struct completion *x) { x->done = 0; init_waitqueue_head(&x->wait); diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 898cfe2eeb42..e74655d941b7 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -37,12 +37,6 @@ static inline bool cpusets_enabled(void) return static_branch_unlikely(&cpusets_enabled_key); } -static inline int nr_cpusets(void) -{ - /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key.key) + 1; -} - static inline void cpuset_inc(void) { static_branch_inc(&cpusets_pre_enable_key); diff --git a/include/linux/futex.h b/include/linux/futex.h index 7c5b694864cd..f36bfd26f998 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -54,7 +54,6 @@ union futex_key { #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); -extern void exit_pi_state_list(struct task_struct *curr); #ifdef CONFIG_HAVE_FUTEX_CMPXCHG #define futex_cmpxchg_enabled 1 #else @@ -64,8 +63,14 @@ extern int futex_cmpxchg_enabled; static inline void exit_robust_list(struct task_struct *curr) { } +#endif + +#ifdef CONFIG_FUTEX_PI +extern void exit_pi_state_list(struct task_struct *curr); +#else static inline void exit_pi_state_list(struct task_struct *curr) { } #endif + #endif diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5dd1272d1ab2..5fdd93bb9300 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -23,10 +23,26 @@ # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) -# define trace_hardirq_enter() do { current->hardirq_context++; } while (0) -# define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) +# define trace_hardirq_enter() \ +do { \ + current->hardirq_context++; \ + crossrelease_hist_start(XHLOCK_HARD); \ +} while (0) +# define trace_hardirq_exit() \ +do { \ + current->hardirq_context--; \ + crossrelease_hist_end(XHLOCK_HARD); \ +} while (0) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ + crossrelease_hist_start(XHLOCK_SOFT); \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ + crossrelease_hist_end(XHLOCK_SOFT); \ +} while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 2afd74b9d844..cd5861651b17 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -163,6 +163,8 @@ extern void jump_label_apply_nops(struct module *mod); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); extern void static_key_disable(struct static_key *key); +extern void static_key_enable_cpuslocked(struct static_key *key); +extern void static_key_disable_cpuslocked(struct static_key *key); /* * We should be using ATOMIC_INIT() for initializing .enabled, but @@ -234,24 +236,29 @@ static inline int jump_label_apply_nops(struct module *mod) static inline void static_key_enable(struct static_key *key) { - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); + STATIC_KEY_CHECK_USE(); - if (!count) - static_key_slow_inc(key); + if (atomic_read(&key->enabled) != 0) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 1); + return; + } + atomic_set(&key->enabled, 1); } static inline void static_key_disable(struct static_key *key) { - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); + STATIC_KEY_CHECK_USE(); - if (count) - static_key_slow_dec(key); + if (atomic_read(&key->enabled) != 1) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 0); + return; + } + atomic_set(&key->enabled, 0); } +#define static_key_enable_cpuslocked(k) static_key_enable((k)) +#define static_key_disable_cpuslocked(k) static_key_disable((k)) + #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } #define STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } @@ -413,8 +420,10 @@ extern bool ____wrong_branch_error(void); * Normal usage; boolean enable/disable. */ -#define static_branch_enable(x) static_key_enable(&(x)->key) -#define static_branch_disable(x) static_key_disable(&(x)->key) +#define static_branch_enable(x) static_key_enable(&(x)->key) +#define static_branch_disable(x) static_key_disable(&(x)->key) +#define static_branch_enable_cpuslocked(x) static_key_enable_cpuslocked(&(x)->key) +#define static_branch_disable_cpuslocked(x) static_key_disable_cpuslocked(&(x)->key) #endif /* __ASSEMBLY__ */ diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index b7f8aced7870..41960fecf783 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -2,11 +2,13 @@ #define _LINUX_KASAN_CHECKS_H #ifdef CONFIG_KASAN -void kasan_check_read(const void *p, unsigned int size); -void kasan_check_write(const void *p, unsigned int size); +void kasan_check_read(const volatile void *p, unsigned int size); +void kasan_check_write(const volatile void *p, unsigned int size); #else -static inline void kasan_check_read(const void *p, unsigned int size) { } -static inline void kasan_check_write(const void *p, unsigned int size) { } +static inline void kasan_check_read(const volatile void *p, unsigned int size) +{ } +static inline void kasan_check_write(const volatile void *p, unsigned int size) +{ } #endif #endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bd6d96cf80b1..6607225d0ea4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -277,6 +277,13 @@ extern int oops_may_print(void); void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; +#ifdef CONFIG_ARCH_HAS_REFCOUNT +void refcount_error_report(struct pt_regs *regs, const char *err); +#else +static inline void refcount_error_report(struct pt_regs *regs, const char *err) +{ } +#endif + /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtol(const char *s, unsigned int base, long *res); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index fffe49f188e6..bfa8e0b0d6f1 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -18,6 +18,8 @@ extern int lock_stat; #define MAX_LOCKDEP_SUBCLASSES 8UL +#include <linux/types.h> + #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> @@ -29,7 +31,7 @@ extern int lock_stat; * We'd rather not expose kernel/lockdep_states.h this wide, but we do need * the total number of states... :-( */ -#define XXX_LOCK_USAGE_STATES (1+3*4) +#define XXX_LOCK_USAGE_STATES (1+2*4) /* * NR_LOCKDEP_CACHING_CLASSES ... Number of classes @@ -155,6 +157,12 @@ struct lockdep_map { int cpu; unsigned long ip; #endif +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + /* + * Whether it's a crosslock. + */ + int cross; +#endif }; static inline void lockdep_copy_map(struct lockdep_map *to, @@ -258,8 +266,95 @@ struct held_lock { unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + /* + * Generation id. + * + * A value of cross_gen_id will be stored when holding this, + * which is globally increased whenever each crosslock is held. + */ + unsigned int gen_id; +#endif +}; + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#define MAX_XHLOCK_TRACE_ENTRIES 5 + +/* + * This is for keeping locks waiting for commit so that true dependencies + * can be added at commit step. + */ +struct hist_lock { + /* + * Id for each entry in the ring buffer. This is used to + * decide whether the ring buffer was overwritten or not. + * + * For example, + * + * |<----------- hist_lock ring buffer size ------->| + * pppppppppppppppppppppiiiiiiiiiiiiiiiiiiiiiiiiiiiii + * wrapped > iiiiiiiiiiiiiiiiiiiiiiiiiii....................... + * + * where 'p' represents an acquisition in process + * context, 'i' represents an acquisition in irq + * context. + * + * In this example, the ring buffer was overwritten by + * acquisitions in irq context, that should be detected on + * rollback or commit. + */ + unsigned int hist_id; + + /* + * Seperate stack_trace data. This will be used at commit step. + */ + struct stack_trace trace; + unsigned long trace_entries[MAX_XHLOCK_TRACE_ENTRIES]; + + /* + * Seperate hlock instance. This will be used at commit step. + * + * TODO: Use a smaller data structure containing only necessary + * data. However, we should make lockdep code able to handle the + * smaller one first. + */ + struct held_lock hlock; +}; + +/* + * To initialize a lock as crosslock, lockdep_init_map_crosslock() should + * be called instead of lockdep_init_map(). + */ +struct cross_lock { + /* + * When more than one acquisition of crosslocks are overlapped, + * we have to perform commit for them based on cross_gen_id of + * the first acquisition, which allows us to add more true + * dependencies. + * + * Moreover, when no acquisition of a crosslock is in progress, + * we should not perform commit because the lock might not exist + * any more, which might cause incorrect memory access. So we + * have to track the number of acquisitions of a crosslock. + */ + int nr_acquire; + + /* + * Seperate hlock instance. This will be used at commit step. + * + * TODO: Use a smaller data structure containing only necessary + * data. However, we should make lockdep code able to handle the + * smaller one first. + */ + struct held_lock hlock; }; +struct lockdep_map_cross { + struct lockdep_map map; + struct cross_lock xlock; +}; +#endif + /* * Initialization, self-test and debugging-output methods: */ @@ -282,13 +377,6 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass); /* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), } - -/* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) @@ -363,10 +451,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock, extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); -extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); -extern void lockdep_clear_current_reclaim_state(void); -extern void lockdep_trace_alloc(gfp_t mask); - struct pin_cookie { unsigned int val; }; #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } @@ -375,7 +459,7 @@ extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); -# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, +# define INIT_LOCKDEP .lockdep_recursion = 0, #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) @@ -416,9 +500,6 @@ static inline void lockdep_on(void) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) -# define lockdep_set_current_reclaim_state(g) do { } while (0) -# define lockdep_clear_current_reclaim_state() do { } while (0) -# define lockdep_trace_alloc(g) do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) @@ -467,6 +548,58 @@ struct pin_cookie { }; #endif /* !LOCKDEP */ +enum xhlock_context_t { + XHLOCK_HARD, + XHLOCK_SOFT, + XHLOCK_CTX_NR, +}; + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +extern void lockdep_init_map_crosslock(struct lockdep_map *lock, + const char *name, + struct lock_class_key *key, + int subclass); +extern void lock_commit_crosslock(struct lockdep_map *lock); + +/* + * What we essencially have to initialize is 'nr_acquire'. Other members + * will be initialized in add_xlock(). + */ +#define STATIC_CROSS_LOCK_INIT() \ + { .nr_acquire = 0,} + +#define STATIC_CROSS_LOCKDEP_MAP_INIT(_name, _key) \ + { .map.name = (_name), .map.key = (void *)(_key), \ + .map.cross = 1, .xlock = STATIC_CROSS_LOCK_INIT(), } + +/* + * To initialize a lockdep_map statically use this macro. + * Note that _name must not be NULL. + */ +#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ + { .name = (_name), .key = (void *)(_key), .cross = 0, } + +extern void crossrelease_hist_start(enum xhlock_context_t c); +extern void crossrelease_hist_end(enum xhlock_context_t c); +extern void lockdep_invariant_state(bool force); +extern void lockdep_init_task(struct task_struct *task); +extern void lockdep_free_task(struct task_struct *task); +#else /* !CROSSRELEASE */ +#define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) +/* + * To initialize a lockdep_map statically use this macro. + * Note that _name must not be NULL. + */ +#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ + { .name = (_name), .key = (void *)(_key), } + +static inline void crossrelease_hist_start(enum xhlock_context_t c) {} +static inline void crossrelease_hist_end(enum xhlock_context_t c) {} +static inline void lockdep_invariant_state(bool force) {} +static inline void lockdep_init_task(struct task_struct *task) {} +static inline void lockdep_free_task(struct task_struct *task) {} +#endif /* CROSSRELEASE */ + #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3cadee0a3508..57378c7cb5f8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -526,26 +526,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, extern void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end); -/* - * Memory barriers to keep this state in sync are graciously provided by - * the page table locks, outside of which no page table modifications happen. - * The barriers are used to ensure the order between tlb_flush_pending updates, - * which happen while the lock is not taken, and the PTE updates, which happen - * while the lock is taken, are serialized. - */ -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - return atomic_read(&mm->tlb_flush_pending) > 0; -} - -/* - * Returns true if there are two above TLB batching threads in parallel. - */ -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) -{ - return atomic_read(&mm->tlb_flush_pending) > 1; -} - static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); @@ -554,27 +534,82 @@ static inline void init_tlb_flush_pending(struct mm_struct *mm) static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); - /* - * Guarantee that the tlb_flush_pending increase does not leak into the - * critical section updating the page tables + * The only time this value is relevant is when there are indeed pages + * to flush. And we'll only flush pages after changing them, which + * requires the PTL. + * + * So the ordering here is: + * + * atomic_inc(&mm->tlb_flush_pending); + * spin_lock(&ptl); + * ... + * set_pte_at(); + * spin_unlock(&ptl); + * + * spin_lock(&ptl) + * mm_tlb_flush_pending(); + * .... + * spin_unlock(&ptl); + * + * flush_tlb_range(); + * atomic_dec(&mm->tlb_flush_pending); + * + * Where the increment if constrained by the PTL unlock, it thus + * ensures that the increment is visible if the PTE modification is + * visible. After all, if there is no PTE modification, nobody cares + * about TLB flushes either. + * + * This very much relies on users (mm_tlb_flush_pending() and + * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and + * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc + * locks (PPC) the unlock of one doesn't order against the lock of + * another PTL. + * + * The decrement is ordered by the flush_tlb_range(), such that + * mm_tlb_flush_pending() will not return false unless all flushes have + * completed. */ - smp_mb__before_spinlock(); } -/* Clearing is done after a TLB flush, which also provides a barrier. */ static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* - * Guarantee that the tlb_flush_pending does not not leak into the - * critical section, since we must order the PTE change and changes to - * the pending TLB flush indication. We could have relied on TLB flush - * as a memory barrier, but this behavior is not clearly documented. + * See inc_tlb_flush_pending(). + * + * This cannot be smp_mb__before_atomic() because smp_mb() simply does + * not order against TLB invalidate completion, which is what we need. + * + * Therefore we must rely on tlb_flush_*() to guarantee order. */ - smp_mb__before_atomic(); atomic_dec(&mm->tlb_flush_pending); } +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * Must be called after having acquired the PTL; orders against that + * PTLs release and therefore ensures that if we observe the modified + * PTE we must also observe the increment from inc_tlb_flush_pending(). + * + * That is, it only guarantees to return true if there is a flush + * pending for _this_ PTL. + */ + return atomic_read(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + /* + * Similar to mm_tlb_flush_pending(), we must have acquired the PTL + * for which there is a TLB flush pending in order to guarantee + * we've seen both that PTE modification and the increment. + * + * (no requirement on actually still holding the PTL, that is irrelevant) + */ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + struct vm_fault; struct vm_special_mapping { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c99ba7914c0a..461bd5757af6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2774,7 +2774,7 @@ struct softnet_data { unsigned int input_queue_head ____cacheline_aligned_in_smp; /* Elements below can be accessed between CPUs for RPS/RFS */ - struct call_single_data csd ____cacheline_aligned_in_smp; + call_single_data_t csd ____cacheline_aligned_in_smp; struct softnet_data *rps_ipi_next; unsigned int cpu; unsigned int input_queue_tail; diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 591792c8e5b0..48b7c9c68c4d 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -53,6 +53,9 @@ extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r); extern __must_check bool refcount_dec_and_test(refcount_t *r); extern void refcount_dec(refcount_t *r); #else +# ifdef CONFIG_ARCH_HAS_REFCOUNT +# include <asm/refcount.h> +# else static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r) { return atomic_add_unless(&r->refs, i, 0); @@ -87,6 +90,7 @@ static inline void refcount_dec(refcount_t *r) { atomic_dec(&r->refs); } +# endif /* !CONFIG_ARCH_HAS_REFCOUNT */ #endif /* CONFIG_REFCOUNT_FULL */ extern __must_check bool refcount_dec_if_one(refcount_t *r); diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index ae0528b834cd..e784761a4443 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -32,6 +32,7 @@ struct rw_semaphore { #define RWSEM_UNLOCKED_VALUE 0x00000000 extern void __down_read(struct rw_semaphore *sem); +extern int __must_check __down_read_killable(struct rw_semaphore *sem); extern int __down_read_trylock(struct rw_semaphore *sem); extern void __down_write(struct rw_semaphore *sem); extern int __must_check __down_write_killable(struct rw_semaphore *sem); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index dd1d14250340..0ad7318ff299 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -44,6 +44,7 @@ struct rw_semaphore { }; extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *); diff --git a/include/linux/sched.h b/include/linux/sched.h index e5fbce866073..9ba42c663fba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -847,7 +847,17 @@ struct task_struct { int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; - gfp_t lockdep_reclaim_gfp; +#endif + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#define MAX_XHLOCKS_NR 64UL + struct hist_lock *xhlocks; /* Crossrelease history locks */ + unsigned int xhlock_idx; + /* For restoring at history boundaries */ + unsigned int xhlock_idx_hist[XHLOCK_CTX_NR]; + unsigned int hist_id; + /* For overwrite check at each context exit */ + unsigned int hist_id_save[XHLOCK_CTX_NR]; #endif #ifdef CONFIG_UBSAN diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2b24a6974847..2b0a281f9d26 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -167,6 +167,14 @@ static inline gfp_t current_gfp_context(gfp_t flags) return flags; } +#ifdef CONFIG_LOCKDEP +extern void fs_reclaim_acquire(gfp_t gfp_mask); +extern void fs_reclaim_release(gfp_t gfp_mask); +#else +static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } +static inline void fs_reclaim_release(gfp_t gfp_mask) { } +#endif + static inline unsigned int memalloc_noio_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_NOIO; diff --git a/include/linux/smp.h b/include/linux/smp.h index 68123c1fe549..98b1fe027fc9 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -14,13 +14,17 @@ #include <linux/llist.h> typedef void (*smp_call_func_t)(void *info); -struct call_single_data { +struct __call_single_data { struct llist_node llist; smp_call_func_t func; void *info; unsigned int flags; }; +/* Use __aligned() to avoid to use 2 cache lines for 1 csd */ +typedef struct __call_single_data call_single_data_t + __aligned(sizeof(struct __call_single_data)); + /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; @@ -48,7 +52,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); -int smp_call_function_single_async(int cpu, struct call_single_data *csd); +int smp_call_function_single_async(int cpu, call_single_data_t *csd); #ifdef CONFIG_SMP diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index ef018a6e4985..69e079c5ff98 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -118,16 +118,39 @@ do { \ #endif /* - * Despite its name it doesn't necessarily has to be a full barrier. - * It should only guarantee that a STORE before the critical section - * can not be reordered with LOADs and STOREs inside this section. - * spin_lock() is the one-way barrier, this LOAD can not escape out - * of the region. So the default implementation simply ensures that - * a STORE can not move into the critical section, smp_wmb() should - * serialize it with another STORE done by spin_lock(). + * This barrier must provide two things: + * + * - it must guarantee a STORE before the spin_lock() is ordered against a + * LOAD after it, see the comments at its two usage sites. + * + * - it must ensure the critical section is RCsc. + * + * The latter is important for cases where we observe values written by other + * CPUs in spin-loops, without barriers, while being subject to scheduling. + * + * CPU0 CPU1 CPU2 + * + * for (;;) { + * if (READ_ONCE(X)) + * break; + * } + * X=1 + * <sched-out> + * <sched-in> + * r = X; + * + * without transitivity it could be that CPU1 observes X!=0 breaks the loop, + * we get migrated and CPU2 sees X==0. + * + * Since most load-store architectures implement ACQUIRE with an smp_mb() after + * the LL/SC loop, they need no further barriers. Similarly all our TSO + * architectures imply an smp_mb() for each atomic instruction and equally don't + * need more. + * + * Architectures that can implement ACQUIRE better need to take care. */ -#ifndef smp_mb__before_spinlock -#define smp_mb__before_spinlock() smp_wmb() +#ifndef smp_mb__after_spinlock +#define smp_mb__after_spinlock() do { } while (0) #endif #ifdef CONFIG_DEBUG_SPINLOCK diff --git a/init/Kconfig b/init/Kconfig index 8514b25db21c..5f0ef850e808 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1275,12 +1275,17 @@ config BASE_FULL config FUTEX bool "Enable futex support" if EXPERT default y - select RT_MUTEXES + imply RT_MUTEXES help Disabling this option will cause the kernel to be built without support for "fast userspace mutexes". The resulting kernel may not run glibc-based applications correctly. +config FUTEX_PI + bool + depends on FUTEX && RT_MUTEXES + default y + config HAVE_FUTEX_CMPXCHG bool depends on FUTEX diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index df403e97b073..2f4039bafebb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -577,6 +577,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } +/* Must be called with cpuset_mutex held. */ +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + /* * generate_sched_domains() * diff --git a/kernel/exit.c b/kernel/exit.c index f9ef3ecc78c1..a35d8a17e01f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -918,6 +918,7 @@ void __noreturn do_exit(long code) exit_rcu(); exit_tasks_rcu_finish(); + lockdep_free_task(tsk); do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/fork.c b/kernel/fork.c index b7e9e57b71ea..dab73d18bc4d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -484,6 +484,8 @@ void __init fork_init(void) cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif + + lockdep_init_task(&init_task); } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -1700,6 +1702,7 @@ static __latent_entropy struct task_struct *copy_process( p->lockdep_depth = 0; /* no locks held yet */ p->curr_chain_key = 0; p->lockdep_recursion = 0; + lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1958,6 +1961,7 @@ bad_fork_cleanup_audit: bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: + lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/futex.c b/kernel/futex.c index f50b434756c1..3d38eaf05492 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -876,6 +876,8 @@ static struct task_struct *futex_find_get_task(pid_t pid) return p; } +#ifdef CONFIG_FUTEX_PI + /* * This task is holding PI mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. @@ -933,6 +935,8 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock_irq(&curr->pi_lock); } +#endif + /* * We need to check the following states: * @@ -1547,6 +1551,45 @@ out: return ret; } +static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +{ + unsigned int op = (encoded_op & 0x70000000) >> 28; + unsigned int cmp = (encoded_op & 0x0f000000) >> 24; + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); + int oldval, ret; + + if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { + if (oparg < 0 || oparg > 31) + return -EINVAL; + oparg = 1 << oparg; + } + + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + return -EFAULT; + + ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); + if (ret) + return ret; + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + return oldval == cmparg; + case FUTEX_OP_CMP_NE: + return oldval != cmparg; + case FUTEX_OP_CMP_LT: + return oldval < cmparg; + case FUTEX_OP_CMP_GE: + return oldval >= cmparg; + case FUTEX_OP_CMP_LE: + return oldval <= cmparg; + case FUTEX_OP_CMP_GT: + return oldval > cmparg; + default: + return -ENOSYS; + } +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: @@ -1800,6 +1843,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + /* + * When PI not supported: return -ENOSYS if requeue_pi is true, + * consequently the compiler knows requeue_pi is always false past + * this point which will optimize away all the conditional code + * further down. + */ + if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) + return -ENOSYS; + if (requeue_pi) { /* * Requeue PI only works on two distinct uaddrs. This @@ -2595,6 +2647,9 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (refill_pi_state_cache()) return -ENOMEM; @@ -2774,6 +2829,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) struct futex_q *top_waiter; int ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + retry: if (get_user(uval, uaddr)) return -EFAULT; @@ -2984,6 +3042,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (!IS_ENABLED(CONFIG_FUTEX_PI)) + return -ENOSYS; + if (uaddr == uaddr2) return -EINVAL; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index d11c506a6ac3..0bf2e8f5244a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,29 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_enable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (!count) - static_key_slow_inc(key); -} -EXPORT_SYMBOL_GPL(static_key_enable); - -void static_key_disable(struct static_key *key) -{ - int count = static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (count) - static_key_slow_dec(key); -} -EXPORT_SYMBOL_GPL(static_key_disable); - -void static_key_slow_inc(struct static_key *key) +static void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -125,24 +103,87 @@ void static_key_slow_inc(struct static_key *key) return; } - cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); - atomic_set(&key->enabled, 1); + /* + * Ensure that if the above cmpxchg loop observes our positive + * value, it must also observe all the text changes. + */ + atomic_set_release(&key->enabled, 1); } else { atomic_inc(&key->enabled); } jump_label_unlock(); +} + +void static_key_slow_inc(struct static_key *key) +{ + cpus_read_lock(); + static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __static_key_slow_dec(struct static_key *key, - unsigned long rate_limit, struct delayed_work *work) +void static_key_enable_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + + if (atomic_read(&key->enabled) > 0) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 1); + return; + } + + jump_label_lock(); + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); + jump_label_update(key); + /* + * See static_key_slow_inc(). + */ + atomic_set_release(&key->enabled, 1); + } + jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); + +void static_key_enable(struct static_key *key) +{ + cpus_read_lock(); + static_key_enable_cpuslocked(key); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_enable); + +void static_key_disable_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(); + + if (atomic_read(&key->enabled) != 1) { + WARN_ON_ONCE(atomic_read(&key->enabled) != 0); + return; + } + + jump_label_lock(); + if (atomic_cmpxchg(&key->enabled, 1, 0)) + jump_label_update(key); + jump_label_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); + +void static_key_disable(struct static_key *key) { cpus_read_lock(); + static_key_disable_cpuslocked(key); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(static_key_disable); + +static void static_key_slow_dec_cpuslocked(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -153,7 +194,6 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); - cpus_read_unlock(); return; } @@ -164,6 +204,14 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); +} + +static void __static_key_slow_dec(struct static_key *key, + unsigned long rate_limit, + struct delayed_work *work) +{ + cpus_read_lock(); + static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7d2499bec5fe..44c8d0d17170 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -58,6 +58,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/lock.h> +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +#include <linux/slab.h> +#endif + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -344,14 +348,12 @@ EXPORT_SYMBOL(lockdep_on); #if VERBOSE # define HARDIRQ_VERBOSE 1 # define SOFTIRQ_VERBOSE 1 -# define RECLAIM_VERBOSE 1 #else # define HARDIRQ_VERBOSE 0 # define SOFTIRQ_VERBOSE 0 -# define RECLAIM_VERBOSE 0 #endif -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE /* * Quick filtering for interesting events: */ @@ -726,6 +728,18 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +static void cross_init(struct lockdep_map *lock, int cross); +static int cross_lock(struct lockdep_map *lock); +static int lock_acquire_crosslock(struct held_lock *hlock); +static int lock_release_crosslock(struct lockdep_map *lock); +#else +static inline void cross_init(struct lockdep_map *lock, int cross) {} +static inline int cross_lock(struct lockdep_map *lock) { return 0; } +static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } +static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } +#endif + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -1125,22 +1139,41 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); + if (cross_lock(tgt->instance)) { + printk(" Possible unsafe locking scenario by crosslock:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk(" unlock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); + } else { + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); + } } /* @@ -1165,7 +1198,12 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - pr_warn("\nbut task is already holding lock:\n"); + + if (cross_lock(check_tgt->instance)) + pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); + else + pr_warn("\nbut task is already holding lock:\n"); + print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); @@ -1183,7 +1221,8 @@ static inline int class_equal(struct lock_list *entry, void *data) static noinline int print_circular_bug(struct lock_list *this, struct lock_list *target, struct held_lock *check_src, - struct held_lock *check_tgt) + struct held_lock *check_tgt, + struct stack_trace *trace) { struct task_struct *curr = current; struct lock_list *parent; @@ -1193,7 +1232,9 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (!save_trace(&this->trace)) + if (cross_lock(check_tgt->instance)) + this->trace = *trace; + else if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1309,6 +1350,19 @@ check_noncircular(struct lock_list *root, struct lock_class *target, return result; } +static noinline int +check_redundant(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_redundant_checks); + + result = __bfs_forwards(root, target, class_equal, target_entry); + + return result; +} + #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of @@ -1784,6 +1838,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; + if (cross_lock(prev->instance)) + continue; + return print_deadlock_bug(curr, prev, next); } return 1; @@ -1813,20 +1870,13 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int *stack_saved) + struct held_lock *next, int distance, struct stack_trace *trace, + int (*save)(struct stack_trace *trace)) { struct lock_list *entry; int ret; struct lock_list this; struct lock_list *uninitialized_var(target_entry); - /* - * Static variable, serialized by the graph_lock(). - * - * We use this static variable to save the stack trace in case - * we call into this function multiple times due to encountering - * trylocks in the held lock stack. - */ - static struct stack_trace trace; /* * Prove that the new <prev> -> <next> dependency would not @@ -1841,7 +1891,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); if (unlikely(!ret)) - return print_circular_bug(&this, target_entry, next, prev); + return print_circular_bug(&this, target_entry, next, prev, trace); else if (unlikely(ret < 0)) return print_bfs_bug(ret); @@ -1870,15 +1920,26 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; - return 2; + return 1; } } - if (!*stack_saved) { - if (!save_trace(&trace)) - return 0; - *stack_saved = 1; + /* + * Is the <prev> -> <next> link redundant? + */ + this.class = hlock_class(prev); + this.parent = NULL; + ret = check_redundant(&this, hlock_class(next), &target_entry); + if (!ret) { + debug_atomic_inc(nr_redundant); + return 2; } + if (ret < 0) + return print_bfs_bug(ret); + + + if (save && !save(trace)) + return 0; /* * Ok, all validations passed, add the new lock @@ -1886,14 +1947,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, - next->acquire_ip, distance, &trace); + next->acquire_ip, distance, trace); if (!ret) return 0; ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, - next->acquire_ip, distance, &trace); + next->acquire_ip, distance, trace); if (!ret) return 0; @@ -1901,8 +1962,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - /* We drop graph lock, so another thread can overwrite trace. */ - *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1910,9 +1969,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, print_lock_name(hlock_class(next)); printk(KERN_CONT "\n"); dump_stack(); - return graph_lock(); + if (!graph_lock()) + return 0; } - return 1; + return 2; } /* @@ -1925,8 +1985,9 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int stack_saved = 0; struct held_lock *hlock; + struct stack_trace trace; + int (*save)(struct stack_trace *trace) = save_trace; /* * Debugging checks. @@ -1947,21 +2008,36 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; /* - * Only non-recursive-read entries get new dependencies - * added: + * Only non-crosslock entries get new dependencies added. + * Crosslock entries will be added by commit later: */ - if (hlock->read != 2 && hlock->check) { - if (!check_prev_add(curr, hlock, next, - distance, &stack_saved)) - return 0; + if (!cross_lock(hlock->instance)) { /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!hlock->trylock) - break; + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, + distance, &trace, save); + if (!ret) + return 0; + + /* + * Stop saving stack_trace if save_trace() was + * called at least once: + */ + if (save && ret == 2) + save = NULL; + + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; + } } depth--; /* @@ -2126,19 +2202,26 @@ static int check_no_collision(struct task_struct *curr, } /* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) + * This is for building a chain between just two different classes, + * instead of adding a new hlock upon current, which is done by + * add_chain_cache(). + * + * This can be called in any context with two classes, while + * add_chain_cache() must be done within the lock owener's context + * since it uses hlock which might be racy in another context. */ -static inline int lookup_chain_cache(struct task_struct *curr, - struct held_lock *hlock, - u64 chain_key) +static inline int add_chain_cache_classes(unsigned int prev, + unsigned int next, + unsigned int irq_context, + u64 chain_key) { - struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - int i, j; + + /* + * Allocate a new chain entry from the static array, and add + * it to the hash: + */ /* * We might need to take the graph lock, ensure we've got IRQs @@ -2147,43 +2230,76 @@ static inline int lookup_chain_cache(struct task_struct *curr, */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; + + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); + dump_stack(); + return 0; + } + + chain = lock_chains + nr_lock_chains++; + chain->chain_key = chain_key; + chain->irq_context = irq_context; + chain->depth = 2; + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = nr_chain_hlocks; + nr_chain_hlocks += chain->depth; + chain_hlocks[chain->base] = prev - 1; + chain_hlocks[chain->base + 1] = next -1; + } +#ifdef CONFIG_DEBUG_LOCKDEP /* - * We can walk it lock-free, because entries only get added - * to the hash: + * Important for check_no_collision(). */ - hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { -cache_hit: - debug_atomic_inc(chain_lookup_hits); - if (!check_no_collision(curr, hlock, chain)) - return 0; - - if (very_verbose(class)) - printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, - class->key, class->name); + else { + if (!debug_locks_off_graph_unlock()) return 0; - } + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); + dump_stack(); + return 0; } - if (very_verbose(class)) - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, class->key, class->name); +#endif + + hlist_add_head_rcu(&chain->entry, hash_head); + debug_atomic_inc(chain_lookup_misses); + inc_chains(); + + return 1; +} + +/* + * Adds a dependency chain into chain hashtable. And must be called with + * graph_lock held. + * + * Return 0 if fail, and graph_lock is released. + * Return 1 if succeed, with graph_lock held. + */ +static inline int add_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + int i, j; + /* * Allocate a new chain entry from the static array, and add * it to the hash: */ - if (!graph_lock()) - return 0; + /* - * We have to walk the chain again locked - to avoid duplicates: + * We might need to take the graph lock, ensure we've got IRQs + * disabled to make this an IRQ-safe lock.. for recursion reasons + * lockdep won't complain about its own locking errors. */ - hlist_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { - graph_unlock(); - goto cache_hit; - } - } + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2235,6 +2351,78 @@ cache_hit: return 1; } +/* + * Look up a dependency chain. + */ +static inline struct lock_chain *lookup_chain_cache(u64 chain_key) +{ + struct hlist_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + + /* + * We can walk it lock-free, because entries only get added + * to the hash: + */ + hlist_for_each_entry_rcu(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { + debug_atomic_inc(chain_lookup_hits); + return chain; + } + } + return NULL; +} + +/* + * If the key is not present yet in dependency chain cache then + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) + */ +static inline int lookup_chain_cache_add(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct lock_chain *chain = lookup_chain_cache(chain_key); + + if (chain) { +cache_hit: + if (!check_no_collision(curr, hlock, chain)) + return 0; + + if (very_verbose(class)) { + printk("\nhash chain already cached, key: " + "%016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, + class->key, class->name); + } + + return 0; + } + + if (very_verbose(class)) { + printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, class->key, class->name); + } + + if (!graph_lock()) + return 0; + + /* + * We have to walk the chain again locked - to avoid duplicates: + */ + chain = lookup_chain_cache(chain_key); + if (chain) { + graph_unlock(); + goto cache_hit; + } + + if (!add_chain_cache(curr, hlock, chain_key)) + return 0; + + return 1; +} + static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, struct held_lock *hlock, int chain_head, u64 chain_key) { @@ -2245,11 +2433,11 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * * We look up the chain_key and do the O(N^2) check and update of * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires + * (If lookup_chain_cache_add() return with 1 it acquires * graph_lock for us) */ if (!hlock->trylock && hlock->check && - lookup_chain_cache(curr, hlock, chain_key)) { + lookup_chain_cache_add(curr, hlock, chain_key)) { /* * Check whether last held lock: * @@ -2277,14 +2465,17 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * Add dependency only if this lock is not the head * of the chain, and if it's not a secondary read-lock: */ - if (!chain_head && ret != 2) + if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) return 0; + } + graph_unlock(); - } else - /* after lookup_chain_cache(): */ + } else { + /* after lookup_chain_cache_add(): */ if (unlikely(!debug_locks)) return 0; + } return 1; } @@ -2567,14 +2758,6 @@ static int SOFTIRQ_verbose(struct lock_class *class) return 0; } -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE - return class_filter(class); -#endif - return 0; -} - #define STRICT_READ_CHECKS 1 static int (*state_verbose_f[])(struct lock_class *class) = { @@ -2870,57 +3053,6 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - gfp_mask = current_gfp_context(gfp_mask); - - /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) - return; - - /* this guy won't enter reclaim */ - if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) - return; - - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) - return; - - /* - * Oi! Can't be having __GFP_FS allocations with IRQs disabled. - */ - if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) - return; - - /* Disable lockdep if explicitly requested */ - if (gfp_mask & __GFP_NOLOCKDEP) - return; - - mark_held_locks(curr, RECLAIM_FS); -} - -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lockdep_trace_alloc(gfp_mask, flags); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) { /* @@ -2966,22 +3098,6 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } - /* - * We reuse the irq context infrastructure more broadly as a general - * context checking code. This tests GFP_FS recursion (a lock taken - * during reclaim for a GFP_FS allocation is held over a GFP_FS - * allocation). - */ - if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { - if (hlock->read) { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) - return 0; - } - } - return 1; } @@ -3040,10 +3156,6 @@ static inline int separate_irq_context(struct task_struct *curr, return 0; } -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} - #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ /* @@ -3116,7 +3228,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, +static void __lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { int i; @@ -3174,8 +3286,25 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } + +void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 0); + __lockdep_init_map(lock, name, key, subclass); +} EXPORT_SYMBOL_GPL(lockdep_init_map); +#ifdef CONFIG_LOCKDEP_CROSSRELEASE +void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + cross_init(lock, 1); + __lockdep_init_map(lock, name, key, subclass); +} +EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); +#endif + struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3231,6 +3360,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; + int ret; if (unlikely(!debug_locks)) return 0; @@ -3279,7 +3409,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - if (depth) { + /* TODO: nest_lock is not implemented for crosslock yet. */ + if (depth && !cross_lock(lock)) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3367,6 +3498,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; + ret = lock_acquire_crosslock(hlock); + /* + * 2 means normal acquire operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3604,11 +3743,19 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int i; + int ret, i; if (unlikely(!debug_locks)) return 0; + ret = lock_release_crosslock(lock); + /* + * 2 means normal release operations are needed. Otherwise, it's + * ok just to return with '0:fail, 1:success'. + */ + if (ret != 2) + return ret; + depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -3952,18 +4099,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) } EXPORT_SYMBOL_GPL(lock_unpin_lock); -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ - current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); -} -EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); - -void lockdep_clear_current_reclaim_state(void) -{ - current->lockdep_reclaim_gfp = 0; -} -EXPORT_SYMBOL_GPL(lockdep_clear_current_reclaim_state); - #ifdef CONFIG_LOCK_STAT static int print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, @@ -4484,6 +4619,12 @@ asmlinkage __visible void lockdep_sys_exit(void) curr->comm, curr->pid); lockdep_print_held_locks(curr); } + + /* + * The lock history for each syscall should be independent. So wipe the + * slate clean on return to userspace. + */ + lockdep_invariant_state(false); } void lockdep_rcu_suspicious(const char *file, const int line, const char *s) @@ -4532,3 +4673,488 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); + +#ifdef CONFIG_LOCKDEP_CROSSRELEASE + +/* + * Crossrelease works by recording a lock history for each thread and + * connecting those historic locks that were taken after the + * wait_for_completion() in the complete() context. + * + * Task-A Task-B + * + * mutex_lock(&A); + * mutex_unlock(&A); + * + * wait_for_completion(&C); + * lock_acquire_crosslock(); + * atomic_inc_return(&cross_gen_id); + * | + * | mutex_lock(&B); + * | mutex_unlock(&B); + * | + * | complete(&C); + * `-- lock_commit_crosslock(); + * + * Which will then add a dependency between B and C. + */ + +#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) + +/* + * Whenever a crosslock is held, cross_gen_id will be increased. + */ +static atomic_t cross_gen_id; /* Can be wrapped */ + +/* + * Make an entry of the ring buffer invalid. + */ +static inline void invalidate_xhlock(struct hist_lock *xhlock) +{ + /* + * Normally, xhlock->hlock.instance must be !NULL. + */ + xhlock->hlock.instance = NULL; +} + +/* + * Lock history stacks; we have 2 nested lock history stacks: + * + * HARD(IRQ) + * SOFT(IRQ) + * + * The thing is that once we complete a HARD/SOFT IRQ the future task locks + * should not depend on any of the locks observed while running the IRQ. So + * what we do is rewind the history buffer and erase all our knowledge of that + * temporal event. + */ + +void crossrelease_hist_start(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (!cur->xhlocks) + return; + + cur->xhlock_idx_hist[c] = cur->xhlock_idx; + cur->hist_id_save[c] = cur->hist_id; +} + +void crossrelease_hist_end(enum xhlock_context_t c) +{ + struct task_struct *cur = current; + + if (cur->xhlocks) { + unsigned int idx = cur->xhlock_idx_hist[c]; + struct hist_lock *h = &xhlock(idx); + + cur->xhlock_idx = idx; + + /* Check if the ring was overwritten. */ + if (h->hist_id != cur->hist_id_save[c]) + invalidate_xhlock(h); + } +} + +/* + * lockdep_invariant_state() is used to annotate independence inside a task, to + * make one task look like multiple independent 'tasks'. + * + * Take for instance workqueues; each work is independent of the last. The + * completion of a future work does not depend on the completion of a past work + * (in general). Therefore we must not carry that (lock) dependency across + * works. + * + * This is true for many things; pretty much all kthreads fall into this + * pattern, where they have an invariant state and future completions do not + * depend on past completions. Its just that since they all have the 'same' + * form -- the kthread does the same over and over -- it doesn't typically + * matter. + * + * The same is true for system-calls, once a system call is completed (we've + * returned to userspace) the next system call does not depend on the lock + * history of the previous system call. + * + * They key property for independence, this invariant state, is that it must be + * a point where we hold no locks and have no history. Because if we were to + * hold locks, the restore at _end() would not necessarily recover it's history + * entry. Similarly, independence per-definition means it does not depend on + * prior state. + */ +void lockdep_invariant_state(bool force) +{ + /* + * We call this at an invariant point, no current state, no history. + * Verify the former, enforce the latter. + */ + WARN_ON_ONCE(!force && current->lockdep_depth); + invalidate_xhlock(&xhlock(current->xhlock_idx)); +} + +static int cross_lock(struct lockdep_map *lock) +{ + return lock ? lock->cross : 0; +} + +/* + * This is needed to decide the relationship between wrapable variables. + */ +static inline int before(unsigned int a, unsigned int b) +{ + return (int)(a - b) < 0; +} + +static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) +{ + return hlock_class(&xhlock->hlock); +} + +static inline struct lock_class *xlock_class(struct cross_lock *xlock) +{ + return hlock_class(&xlock->hlock); +} + +/* + * Should we check a dependency with previous one? + */ +static inline int depend_before(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check && !hlock->trylock; +} + +/* + * Should we check a dependency with next one? + */ +static inline int depend_after(struct held_lock *hlock) +{ + return hlock->read != 2 && hlock->check; +} + +/* + * Check if the xhlock is valid, which would be false if, + * + * 1. Has not used after initializaion yet. + * 2. Got invalidated. + * + * Remind hist_lock is implemented as a ring buffer. + */ +static inline int xhlock_valid(struct hist_lock *xhlock) +{ + /* + * xhlock->hlock.instance must be !NULL. + */ + return !!xhlock->hlock.instance; +} + +/* + * Record a hist_lock entry. + * + * Irq disable is only required. + */ +static void add_xhlock(struct held_lock *hlock) +{ + unsigned int idx = ++current->xhlock_idx; + struct hist_lock *xhlock = &xhlock(idx); + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * This can be done locklessly because they are all task-local + * state, we must however ensure IRQs are disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); +#endif + + /* Initialize hist_lock's members */ + xhlock->hlock = *hlock; + xhlock->hist_id = ++current->hist_id; + + xhlock->trace.nr_entries = 0; + xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; + xhlock->trace.entries = xhlock->trace_entries; + xhlock->trace.skip = 3; + save_stack_trace(&xhlock->trace); +} + +static inline int same_context_xhlock(struct hist_lock *xhlock) +{ + return xhlock->hlock.irq_context == task_irq_context(current); +} + +/* + * This should be lockless as far as possible because this would be + * called very frequently. + */ +static void check_add_xhlock(struct held_lock *hlock) +{ + /* + * Record a hist_lock, only in case that acquisitions ahead + * could depend on the held_lock. For example, if the held_lock + * is trylock then acquisitions ahead never depends on that. + * In that case, we don't need to record it. Just return. + */ + if (!current->xhlocks || !depend_before(hlock)) + return; + + add_xhlock(hlock); +} + +/* + * For crosslock. + */ +static int add_xlock(struct held_lock *hlock) +{ + struct cross_lock *xlock; + unsigned int gen_id; + + if (!graph_lock()) + return 0; + + xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; + + /* + * When acquisitions for a crosslock are overlapped, we use + * nr_acquire to perform commit for them, based on cross_gen_id + * of the first acquisition, which allows to add additional + * dependencies. + * + * Moreover, when no acquisition of a crosslock is in progress, + * we should not perform commit because the lock might not exist + * any more, which might cause incorrect memory access. So we + * have to track the number of acquisitions of a crosslock. + * + * depend_after() is necessary to initialize only the first + * valid xlock so that the xlock can be used on its commit. + */ + if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) + goto unlock; + + gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); + xlock->hlock = *hlock; + xlock->hlock.gen_id = gen_id; +unlock: + graph_unlock(); + return 1; +} + +/* + * Called for both normal and crosslock acquires. Normal locks will be + * pushed on the hist_lock queue. Cross locks will record state and + * stop regular lock_acquire() to avoid being placed on the held_lock + * stack. + * + * Return: 0 - failure; + * 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_acquire_crosslock(struct held_lock *hlock) +{ + /* + * CONTEXT 1 CONTEXT 2 + * --------- --------- + * lock A (cross) + * X = atomic_inc_return(&cross_gen_id) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Y = atomic_read_acquire(&cross_gen_id) + * lock B + * + * atomic_read_acquire() is for ordering between A and B, + * IOW, A happens before B, when CONTEXT 2 see Y >= X. + * + * Pairs with atomic_inc_return() in add_xlock(). + */ + hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); + + if (cross_lock(hlock->instance)) + return add_xlock(hlock); + + check_add_xhlock(hlock); + return 2; +} + +static int copy_trace(struct stack_trace *trace) +{ + unsigned long *buf = stack_trace + nr_stack_trace_entries; + unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + unsigned int nr = min(max_nr, trace->nr_entries); + + trace->nr_entries = nr; + memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); + trace->entries = buf; + nr_stack_trace_entries += nr; + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); + dump_stack(); + + return 0; + } + + return 1; +} + +static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) +{ + unsigned int xid, pid; + u64 chain_key; + + xid = xlock_class(xlock) - lock_classes; + chain_key = iterate_chain_key((u64)0, xid); + pid = xhlock_class(xhlock) - lock_classes; + chain_key = iterate_chain_key(chain_key, pid); + + if (lookup_chain_cache(chain_key)) + return 1; + + if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, + chain_key)) + return 0; + + if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, + &xhlock->trace, copy_trace)) + return 0; + + return 1; +} + +static void commit_xhlocks(struct cross_lock *xlock) +{ + unsigned int cur = current->xhlock_idx; + unsigned int prev_hist_id = xhlock(cur).hist_id; + unsigned int i; + + if (!graph_lock()) + return; + + if (xlock->nr_acquire) { + for (i = 0; i < MAX_XHLOCKS_NR; i++) { + struct hist_lock *xhlock = &xhlock(cur - i); + + if (!xhlock_valid(xhlock)) + break; + + if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) + break; + + if (!same_context_xhlock(xhlock)) + break; + + /* + * Filter out the cases where the ring buffer was + * overwritten and the current entry has a bigger + * hist_id than the previous one, which is impossible + * otherwise: + */ + if (unlikely(before(prev_hist_id, xhlock->hist_id))) + break; + + prev_hist_id = xhlock->hist_id; + + /* + * commit_xhlock() returns 0 with graph_lock already + * released if fail. + */ + if (!commit_xhlock(xlock, xhlock)) + return; + } + } + + graph_unlock(); +} + +void lock_commit_crosslock(struct lockdep_map *lock) +{ + struct cross_lock *xlock; + unsigned long flags; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (!current->xhlocks) + return; + + /* + * Do commit hist_locks with the cross_lock, only in case that + * the cross_lock could depend on acquisitions after that. + * + * For example, if the cross_lock does not have the 'check' flag + * then we don't need to check dependencies and commit for that. + * Just skip it. In that case, of course, the cross_lock does + * not depend on acquisitions ahead, either. + * + * WARNING: Don't do that in add_xlock() in advance. When an + * acquisition context is different from the commit context, + * invalid(skipped) cross_lock might be accessed. + */ + if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + xlock = &((struct lockdep_map_cross *)lock)->xlock; + commit_xhlocks(xlock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_commit_crosslock); + +/* + * Return: 0 - failure; + * 1 - crosslock, done; + * 2 - normal lock, continue to held_lock[] ops. + */ +static int lock_release_crosslock(struct lockdep_map *lock) +{ + if (cross_lock(lock)) { + if (!graph_lock()) + return 0; + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; + graph_unlock(); + return 1; + } + return 2; +} + +static void cross_init(struct lockdep_map *lock, int cross) +{ + if (cross) + ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; + + lock->cross = cross; + + /* + * Crossrelease assumes that the ring buffer size of xhlocks + * is aligned with power of 2. So force it on build. + */ + BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); +} + +void lockdep_init_task(struct task_struct *task) +{ + int i; + + task->xhlock_idx = UINT_MAX; + task->hist_id = 0; + + for (i = 0; i < XHLOCK_CTX_NR; i++) { + task->xhlock_idx_hist[i] = UINT_MAX; + task->hist_id_save[i] = 0; + } + + task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, + GFP_KERNEL); +} + +void lockdep_free_task(struct task_struct *task) +{ + if (task->xhlocks) { + void *tmp = task->xhlocks; + /* Diable crossrelease for current */ + task->xhlocks = NULL; + kfree(tmp); + } +} +#endif diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c08fbd2f5ba9..1da4669d57a7 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -143,6 +143,8 @@ struct lockdep_stats { int redundant_softirqs_on; int redundant_softirqs_off; int nr_unused_locks; + int nr_redundant_checks; + int nr_redundant; int nr_cyclic_checks; int nr_cyclic_check_recursions; int nr_find_usage_forwards_checks; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 6d1fcc786081..68d9e267ccd4 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -201,6 +201,10 @@ static void lockdep_stats_debug_show(struct seq_file *m) debug_atomic_read(chain_lookup_hits)); seq_printf(m, " cyclic checks: %11llu\n", debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " redundant checks: %11llu\n", + debug_atomic_read(nr_redundant_checks)); + seq_printf(m, " redundant links: %11llu\n", + debug_atomic_read(nr_redundant)); seq_printf(m, " find-mask forwards checks: %11llu\n", debug_atomic_read(nr_find_usage_forwards_checks)); seq_printf(m, " find-mask backwards checks: %11llu\n", diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..35ca09f2ed0b 100644 --- a/kernel/locking/lockdep_states.h +++ b/kernel/locking/lockdep_states.h @@ -6,4 +6,3 @@ */ LOCKDEP_STATE(HARDIRQ) LOCKDEP_STATE(SOFTIRQ) -LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index a3167941093b..a74ee6abd039 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -109,6 +109,19 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + WRITE_ONCE(prev->next, node); /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 4ccfcaae5b89..43555681c40b 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -72,7 +72,7 @@ static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) struct __qspinlock *l = (void *)lock; if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { + (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) { qstat_inc(qstat_pv_lock_stealing, true); return true; } @@ -101,16 +101,16 @@ static __always_inline void clear_pending(struct qspinlock *lock) /* * The pending bit check in pv_queued_spin_steal_lock() isn't a memory - * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock - * just to be sure that it will get it. + * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the + * lock just to be sure that it will get it. */ static __always_inline int trylock_clear_pending(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; return !READ_ONCE(l->locked) && - (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) - == _Q_PENDING_VAL); + (cmpxchg_acquire(&l->locked_pending, _Q_PENDING_VAL, + _Q_LOCKED_VAL) == _Q_PENDING_VAL); } #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) @@ -138,7 +138,7 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) */ old = val; new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; - val = atomic_cmpxchg(&lock->val, old, new); + val = atomic_cmpxchg_acquire(&lock->val, old, new); if (val == old) return 1; @@ -362,8 +362,18 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * observe its next->locked value and advance itself. * * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + * + * The write to next->locked in arch_mcs_spin_unlock_contended() + * must be ordered before the read of pn->state in the cmpxchg() + * below for the code to work correctly. To guarantee full ordering + * irrespective of the success or failure of the cmpxchg(), + * a relaxed version with explicit barrier is used. The control + * dependency will order the reading of pn->state before any + * subsequent writes. */ - if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + smp_mb__before_atomic(); + if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed) + != vcpu_halted) return; /* diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 72ad45a9a794..8d039b928d61 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,6 +40,9 @@ struct rt_mutex_waiter { /* * Various helpers to access the waiters-tree: */ + +#ifdef CONFIG_RT_MUTEXES + static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters); @@ -69,6 +72,32 @@ task_top_pi_waiter(struct task_struct *p) pi_tree_entry); } +#else + +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return false; +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + return NULL; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return false; +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return NULL; +} + +#endif + /* * lock->owner state tracking: */ diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 20819df98125..0848634c5512 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -126,7 +126,7 @@ __rwsem_wake_one_writer(struct rw_semaphore *sem) /* * get a read lock on the semaphore */ -void __sched __down_read(struct rw_semaphore *sem) +int __sched __down_read_common(struct rw_semaphore *sem, int state) { struct rwsem_waiter waiter; unsigned long flags; @@ -140,8 +140,6 @@ void __sched __down_read(struct rw_semaphore *sem) goto out; } - set_current_state(TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ waiter.task = current; waiter.type = RWSEM_WAITING_FOR_READ; @@ -149,20 +147,41 @@ void __sched __down_read(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - /* wait to be given the lock */ for (;;) { if (!waiter.task) break; + if (signal_pending_state(state, current)) + goto out_nolock; + set_current_state(state); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); schedule(); - set_current_state(TASK_UNINTERRUPTIBLE); + raw_spin_lock_irqsave(&sem->wait_lock, flags); } - __set_current_state(TASK_RUNNING); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); out: - ; + return 0; + +out_nolock: + /* + * We didn't take the lock, so that there is a writer, which + * is owner or the first waiter of the sem. If it's a waiter, + * it will be woken by current owner. Not need to wake anybody. + */ + list_del(&waiter.list); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + return -EINTR; +} + +void __sched __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_read_killable(struct rw_semaphore *sem) +{ + return __down_read_common(sem, TASK_KILLABLE); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 34e727f18e49..02f660666ab8 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -221,8 +221,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, /* * Wait for the read lock to be granted */ -__visible -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; @@ -255,17 +255,44 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* wait to be given the lock */ while (true) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(state); if (!waiter.task) break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } schedule(); } __set_current_state(TASK_RUNNING); return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(rwsem_down_read_failed); +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + /* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the diff --git a/kernel/panic.c b/kernel/panic.c index a58932b41700..bdd18afa19a4 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -26,6 +26,7 @@ #include <linux/nmi.h> #include <linux/console.h> #include <linux/bug.h> +#include <linux/ratelimit.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -601,6 +602,17 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif +#ifdef CONFIG_ARCH_HAS_REFCOUNT +void refcount_error_report(struct pt_regs *regs, const char *err) +{ + WARN_RATELIMIT(1, "refcount_t %s at %pB in %s[%d], uid/euid: %u/%u\n", + err, (void *)instruction_pointer(regs), + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid()), + from_kuid_munged(&init_user_ns, current_euid())); +} +#endif + core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 5d9131aa846f..cc873075c3bd 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -32,6 +32,12 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); + + /* + * Perform commit of crossrelease here. + */ + complete_release_commit(x); + if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); @@ -99,9 +105,14 @@ __wait_for_common(struct completion *x, { might_sleep(); + complete_acquire(x); + spin_lock_irq(&x->wait.lock); timeout = do_wait_for_common(x, action, timeout, state); spin_unlock_irq(&x->wait.lock); + + complete_release(x); + return timeout; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1fcd96cf432..6d2c7ff9ba98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1972,8 +1972,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with mb() in * set_current_state() the waiting thread does. */ - smp_mb__before_spinlock(); raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); if (!(p->state & state)) goto out; @@ -3296,8 +3296,8 @@ static void __sched notrace __schedule(bool preempt) * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ - smp_mb__before_spinlock(); rq_lock(rq, &rf); + smp_mb__after_spinlock(); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 25e5cb1107f3..ab1c7f5409a0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -769,7 +769,7 @@ struct rq { #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; - struct call_single_data hrtick_csd; + call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; #endif diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 3d5610dcce11..2227e183e202 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -33,9 +33,6 @@ void swake_up(struct swait_queue_head *q) { unsigned long flags; - if (!swait_active(q)) - return; - raw_spin_lock_irqsave(&q->lock, flags); swake_up_locked(q); raw_spin_unlock_irqrestore(&q->lock, flags); @@ -51,9 +48,6 @@ void swake_up_all(struct swait_queue_head *q) struct swait_queue *curr; LIST_HEAD(tmp); - if (!swait_active(q)) - return; - raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { diff --git a/kernel/smp.c b/kernel/smp.c index 3061483cb3ad..81cfca9b4cc3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -28,7 +28,7 @@ enum { }; struct call_function_data { - struct call_single_data __percpu *csd; + call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; @@ -51,7 +51,7 @@ int smpcfd_prepare_cpu(unsigned int cpu) free_cpumask_var(cfd->cpumask); return -ENOMEM; } - cfd->csd = alloc_percpu(struct call_single_data); + cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); @@ -103,12 +103,12 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static __always_inline void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } -static __always_inline void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -116,12 +116,12 @@ static __always_inline void csd_lock(struct call_single_data *csd) /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other - * fields of the specified call_single_data structure: + * fields of the specified call_single_data_t structure: */ smp_wmb(); } -static __always_inline void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -131,14 +131,14 @@ static __always_inline void csd_unlock(struct call_single_data *csd) smp_store_release(&csd->flags, 0); } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); /* - * Insert a previously allocated call_single_data element + * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, struct call_single_data *csd, +static int generic_exec_single(int cpu, call_single_data_t *csd, smp_call_func_t func, void *info) { if (cpu == smp_processor_id()) { @@ -210,7 +210,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) { struct llist_head *head; struct llist_node *entry; - struct call_single_data *csd, *csd_next; + call_single_data_t *csd, *csd_next; static bool warned; WARN_ON(!irqs_disabled()); @@ -268,8 +268,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { - struct call_single_data *csd; - struct call_single_data csd_stack = { .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS }; + call_single_data_t *csd; + call_single_data_t csd_stack = { + .flags = CSD_FLAG_LOCK | CSD_FLAG_SYNCHRONOUS, + }; int this_cpu; int err; @@ -321,7 +323,7 @@ EXPORT_SYMBOL(smp_call_function_single); * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; @@ -444,7 +446,7 @@ void smp_call_function_many(const struct cpumask *mask, cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { - struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); + call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); csd_lock(csd); if (wait) @@ -460,7 +462,7 @@ void smp_call_function_many(const struct cpumask *mask, if (wait) { for_each_cpu(cpu, cfd->cpumask) { - struct call_single_data *csd; + call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); diff --git a/kernel/up.c b/kernel/up.c index ee81ac9af4ca..42c46bf3e0a5 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -23,7 +23,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { unsigned long flags; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca937b0c3a96..ab3c0dc8c7ed 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2091,8 +2091,30 @@ __acquires(&pool->lock) spin_unlock_irq(&pool->lock); - lock_map_acquire_read(&pwq->wq->lockdep_map); + lock_map_acquire(&pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); + /* + * Strictly speaking we should mark the invariant state without holding + * any locks, that is, before these two lock_map_acquire()'s. + * + * However, that would result in: + * + * A(W1) + * WFC(C) + * A(W1) + * C(C) + * + * Which would create W1->C->W1 dependencies, even though there is no + * actual deadlock possible. There are two solutions, using a + * read-recursive acquire on the work(queue) 'locks', but this will then + * hit the lockdep limitation on recursive locks, or simply discard + * these locks. + * + * AFAICT there is no possible deadlock scenario between the + * flush_work() and complete() primitives (except for single-threaded + * workqueues), so hiding them isn't a problem. + */ + lockdep_invariant_state(true); trace_workqueue_execute_start(work); worker->current_func(work); /* @@ -2474,7 +2496,16 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, */ INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); - init_completion(&barr->done); + + /* + * Explicitly init the crosslock for wq_barrier::done, make its lock + * key a subkey of the corresponding work. As a result we won't + * build a dependency between wq_barrier::done and unrelated work. + */ + lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map, + "(complete)wq_barr::done", + target->lockdep_map.key, 1); + __init_completion(&barr->done); barr->task = current; /* @@ -2815,16 +2846,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) spin_unlock_irq(&pool->lock); /* - * If @max_active is 1 or rescuer is in use, flushing another work - * item on the same workqueue may lead to deadlock. Make sure the - * flusher is not running on the same workqueue by verifying write - * access. + * Force a lock recursion deadlock when using flush_work() inside a + * single-threaded or rescuer equipped workqueue. + * + * For single threaded workqueues the deadlock happens when the work + * is after the work issuing the flush_work(). For rescuer equipped + * workqueues the deadlock happens when the rescuer stalls, blocking + * forward progress. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) { lock_map_acquire(&pwq->wq->lockdep_map); - else - lock_map_acquire_read(&pwq->wq->lockdep_map); - lock_map_release(&pwq->wq->lockdep_map); + lock_map_release(&pwq->wq->lockdep_map); + } return true; already_gone: diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2a9a8759752b..7396f5044397 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1091,6 +1091,8 @@ config PROVE_LOCKING select DEBUG_MUTEXES select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC + select LOCKDEP_CROSSRELEASE + select LOCKDEP_COMPLETIONS select TRACE_IRQFLAGS default n help @@ -1160,6 +1162,22 @@ config LOCK_STAT CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. (CONFIG_LOCKDEP defines "acquire" and "release" events.) +config LOCKDEP_CROSSRELEASE + bool + help + This makes lockdep work for crosslock which is a lock allowed to + be released in a different context from the acquisition context. + Normally a lock must be released in the context acquiring the lock. + However, relexing this constraint helps synchronization primitives + such as page locks or completions can use the lock correctness + detector, lockdep. + +config LOCKDEP_COMPLETIONS + bool + help + A deadlock caused by wait_for_completion() and complete() can be + detected by lockdep using crossrelease feature. + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 6f2b135dc5e8..cd0b5c964bd0 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -363,6 +363,103 @@ static void rsem_AA3(void) } /* + * read_lock(A) + * spin_lock(B) + * spin_lock(B) + * write_lock(A) + */ +static void rlock_ABBA1(void) +{ + RL(X1); + L(Y1); + U(Y1); + RU(X1); + + L(Y1); + WL(X1); + WU(X1); + U(Y1); // should fail +} + +static void rwsem_ABBA1(void) +{ + RSL(X1); + ML(Y1); + MU(Y1); + RSU(X1); + + ML(Y1); + WSL(X1); + WSU(X1); + MU(Y1); // should fail +} + +/* + * read_lock(A) + * spin_lock(B) + * spin_lock(B) + * read_lock(A) + */ +static void rlock_ABBA2(void) +{ + RL(X1); + L(Y1); + U(Y1); + RU(X1); + + L(Y1); + RL(X1); + RU(X1); + U(Y1); // should NOT fail +} + +static void rwsem_ABBA2(void) +{ + RSL(X1); + ML(Y1); + MU(Y1); + RSU(X1); + + ML(Y1); + RSL(X1); + RSU(X1); + MU(Y1); // should fail +} + + +/* + * write_lock(A) + * spin_lock(B) + * spin_lock(B) + * write_lock(A) + */ +static void rlock_ABBA3(void) +{ + WL(X1); + L(Y1); + U(Y1); + WU(X1); + + L(Y1); + WL(X1); + WU(X1); + U(Y1); // should fail +} + +static void rwsem_ABBA3(void) +{ + WSL(X1); + ML(Y1); + MU(Y1); + WSU(X1); + + ML(Y1); + WSL(X1); + WSU(X1); + MU(Y1); // should fail +} + +/* * ABBA deadlock: */ @@ -1056,8 +1153,6 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) if (debug_locks != expected) { unexpected_testcase_failures++; pr_cont("FAILED|"); - - dump_stack(); } else { testcase_successes++; pr_cont(" ok |"); @@ -1933,6 +2028,30 @@ void locking_selftest(void) dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM); pr_cont("\n"); + print_testname("mixed read-lock/lock-write ABBA"); + pr_cont(" |"); + dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK); + /* + * Lockdep does indeed fail here, but there's nothing we can do about + * that now. Don't kill lockdep for it. + */ + unexpected_testcase_failures--; + + pr_cont(" |"); + dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM); + + print_testname("mixed read-lock/lock-read ABBA"); + pr_cont(" |"); + dotest(rlock_ABBA2, SUCCESS, LOCKTYPE_RWLOCK); + pr_cont(" |"); + dotest(rwsem_ABBA2, FAILURE, LOCKTYPE_RWSEM); + + print_testname("mixed write-lock/lock-write ABBA"); + pr_cont(" |"); + dotest(rlock_ABBA3, FAILURE, LOCKTYPE_RWLOCK); + pr_cont(" |"); + dotest(rwsem_ABBA3, FAILURE, LOCKTYPE_RWSEM); + printk(" --------------------------------------------------------------------------\n"); /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 90731e3b7e58..3644ff918434 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1510,8 +1510,15 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) } /* - * The page_table_lock above provides a memory barrier - * with change_protection_range. + * Since we took the NUMA fault, we must have observed the !accessible + * bit. Make sure all other CPUs agree with that, to avoid them + * modifying the page we're about to migrate. + * + * Must be done under PTL such that we'll observe the relevant + * inc_tlb_flush_pending(). + * + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant */ if (mm_tlb_flush_pending(vma->vm_mm)) flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); @@ -1521,6 +1528,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) * and access rights restored. */ spin_unlock(vmf->ptl); + migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index ca11bc4ce205..6f319fb81718 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } -void kasan_check_read(const void *p, unsigned int size) +void kasan_check_read(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(kasan_check_read); -void kasan_check_write(const void *p, unsigned int size) +void kasan_check_write(const volatile void *p, unsigned int size) { check_memory_region((unsigned long)p, size, true, _RET_IP_); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1423da8dd16f..9327a940e373 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include <linux/kthread.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> +#include <linux/lockdep.h> #include <linux/nmi.h> #include <asm/sections.h> @@ -3513,6 +3514,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla } #endif /* CONFIG_COMPACTION */ +#ifdef CONFIG_LOCKDEP +struct lockdep_map __fs_reclaim_map = + STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); + +static bool __need_fs_reclaim(gfp_t gfp_mask) +{ + gfp_mask = current_gfp_context(gfp_mask); + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* this guy won't enter reclaim */ + if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return false; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return false; + + if (gfp_mask & __GFP_NOLOCKDEP) + return false; + + return true; +} + +void fs_reclaim_acquire(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_acquire(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_acquire); + +void fs_reclaim_release(gfp_t gfp_mask) +{ + if (__need_fs_reclaim(gfp_mask)) + lock_map_release(&__fs_reclaim_map); +} +EXPORT_SYMBOL_GPL(fs_reclaim_release); +#endif + /* Perform direct synchronous page reclaim */ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, @@ -3527,7 +3569,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(gfp_mask); + fs_reclaim_acquire(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3535,7 +3577,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -4064,7 +4106,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - lockdep_trace_alloc(gfp_mask); + fs_reclaim_acquire(gfp_mask); + fs_reclaim_release(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); diff --git a/mm/slab.h b/mm/slab.h index 6885e1192ec5..073362816acc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -43,6 +43,7 @@ struct kmem_cache { #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/random.h> +#include <linux/sched/mm.h> /* * State of the slab allocator. @@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s, flags)) diff --git a/mm/slob.c b/mm/slob.c index 1bae78d71096..a8bd6fa11a66 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp &= gfp_allowed_mask; - lockdep_trace_alloc(gfp); + fs_reclaim_acquire(gfp); + fs_reclaim_release(gfp); if (size < PAGE_SIZE - align) { if (!size) @@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); + fs_reclaim_acquire(flags); + fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); diff --git a/mm/vmscan.c b/mm/vmscan.c index a1af041930a6..f957afe900ec 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3525,8 +3525,6 @@ static int kswapd(void *p) }; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - lockdep_set_current_reclaim_state(GFP_KERNEL); - if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; @@ -3585,14 +3583,15 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); + fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); current->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); return 0; } @@ -3655,14 +3654,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned int noreclaim_flag; noreclaim_flag = memalloc_noreclaim_save(); - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - lockdep_clear_current_reclaim_state(); + fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; @@ -3847,7 +3846,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(sc.gfp_mask); + fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3862,9 +3861,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; + fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); - lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 62344804baae..38e795e0c4bf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1810,8 +1810,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static struct static_key udp_encap_needed __read_mostly; void udp_encap_enable(void) { - if (!static_key_enabled(&udp_encap_needed)) - static_key_slow_inc(&udp_encap_needed); + static_key_enable(&udp_encap_needed); } EXPORT_SYMBOL(udp_encap_enable); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d6886228e1d0..56030d45823a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -575,8 +575,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb, static struct static_key udpv6_encap_needed __read_mostly; void udpv6_encap_enable(void) { - if (!static_key_enabled(&udpv6_encap_needed)) - static_key_slow_inc(&udpv6_encap_needed); + static_key_enable(&udpv6_encap_needed); } EXPORT_SYMBOL(udpv6_encap_enable); |