summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/RCU/00-INDEX2
-rw-r--r--Documentation/RCU/checklist.txt12
-rw-r--r--Documentation/RCU/rcu_dereference.txt371
-rw-r--r--Documentation/RCU/stallwarn.txt2
-rw-r--r--Documentation/RCU/whatisRCU.txt55
-rw-r--r--include/linux/rcupdate.h5
-rw-r--r--kernel/rcu/tiny_plugin.h8
-rw-r--r--kernel/rcu/tree.c249
-rw-r--r--kernel/rcu/tree.h3
-rw-r--r--kernel/rcu/tree_plugin.h120
-rw-r--r--kernel/softirq.c4
11 files changed, 626 insertions, 205 deletions
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index fa57139f50bf..f773a264ae02 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -12,6 +12,8 @@ lockdep-splat.txt
- RCU Lockdep splats explained.
NMI-RCU.txt
- Using RCU to Protect Dynamic NMI Handlers
+rcu_dereference.txt
+ - Proper care and feeding of return values from rcu_dereference()
rcubarrier.txt
- RCU and Unloadable Modules
rculist_nulls.txt
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 9d10d1db16a5..877947130ebe 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -114,12 +114,16 @@ over a rather long period of time, but improvements are always welcome!
http://www.openvms.compaq.com/wizard/wiz_2637.html
The rcu_dereference() primitive is also an excellent
- documentation aid, letting the person reading the code
- know exactly which pointers are protected by RCU.
+ documentation aid, letting the person reading the
+ code know exactly which pointers are protected by RCU.
Please note that compilers can also reorder code, and
they are becoming increasingly aggressive about doing
- just that. The rcu_dereference() primitive therefore
- also prevents destructive compiler optimizations.
+ just that. The rcu_dereference() primitive therefore also
+ prevents destructive compiler optimizations. However,
+ with a bit of devious creativity, it is possible to
+ mishandle the return value from rcu_dereference().
+ Please see rcu_dereference.txt in this directory for
+ more information.
The rcu_dereference() primitive is used by the
various "_rcu()" list-traversal primitives, such
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
new file mode 100644
index 000000000000..ceb05da5a5ac
--- /dev/null
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -0,0 +1,371 @@
+PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference()
+
+Most of the time, you can use values from rcu_dereference() or one of
+the similar primitives without worries. Dereferencing (prefix "*"),
+field selection ("->"), assignment ("="), address-of ("&"), addition and
+subtraction of constants, and casts all work quite naturally and safely.
+
+It is nevertheless possible to get into trouble with other operations.
+Follow these rules to keep your RCU code working properly:
+
+o You must use one of the rcu_dereference() family of primitives
+ to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU
+ will complain. Worse yet, your code can see random memory-corruption
+ bugs due to games that compilers and DEC Alpha can play.
+ Without one of the rcu_dereference() primitives, compilers
+ can reload the value, and won't your code have fun with two
+ different values for a single pointer! Without rcu_dereference(),
+ DEC Alpha can load a pointer, dereference that pointer, and
+ return data preceding initialization that preceded the store of
+ the pointer.
+
+ In addition, the volatile cast in rcu_dereference() prevents the
+ compiler from deducing the resulting pointer value. Please see
+ the section entitled "EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH"
+ for an example where the compiler can in fact deduce the exact
+ value of the pointer, and thus cause misordering.
+
+o Do not use single-element RCU-protected arrays. The compiler
+ is within its right to assume that the value of an index into
+ such an array must necessarily evaluate to zero. The compiler
+ could then substitute the constant zero for the computation, so
+ that the array index no longer depended on the value returned
+ by rcu_dereference(). If the array index no longer depends
+ on rcu_dereference(), then both the compiler and the CPU
+ are within their rights to order the array access before the
+ rcu_dereference(), which can cause the array access to return
+ garbage.
+
+o Avoid cancellation when using the "+" and "-" infix arithmetic
+ operators. For example, for a given variable "x", avoid
+ "(x-x)". There are similar arithmetic pitfalls from other
+ arithmetic operatiors, such as "(x*0)", "(x/(x+1))" or "(x%1)".
+ The compiler is within its rights to substitute zero for all of
+ these expressions, so that subsequent accesses no longer depend
+ on the rcu_dereference(), again possibly resulting in bugs due
+ to misordering.
+
+ Of course, if "p" is a pointer from rcu_dereference(), and "a"
+ and "b" are integers that happen to be equal, the expression
+ "p+a-b" is safe because its value still necessarily depends on
+ the rcu_dereference(), thus maintaining proper ordering.
+
+o Avoid all-zero operands to the bitwise "&" operator, and
+ similarly avoid all-ones operands to the bitwise "|" operator.
+ If the compiler is able to deduce the value of such operands,
+ it is within its rights to substitute the corresponding constant
+ for the bitwise operation. Once again, this causes subsequent
+ accesses to no longer depend on the rcu_dereference(), causing
+ bugs due to misordering.
+
+ Please note that single-bit operands to bitwise "&" can also
+ be dangerous. At this point, the compiler knows that the
+ resulting value can only take on one of two possible values.
+ Therefore, a very small amount of additional information will
+ allow the compiler to deduce the exact value, which again can
+ result in misordering.
+
+o If you are using RCU to protect JITed functions, so that the
+ "()" function-invocation operator is applied to a value obtained
+ (directly or indirectly) from rcu_dereference(), you may need to
+ interact directly with the hardware to flush instruction caches.
+ This issue arises on some systems when a newly JITed function is
+ using the same memory that was used by an earlier JITed function.
+
+o Do not use the results from the boolean "&&" and "||" when
+ dereferencing. For example, the following (rather improbable)
+ code is buggy:
+
+ int a[2];
+ int index;
+ int force_zero_index = 1;
+
+ ...
+
+ r1 = rcu_dereference(i1)
+ r2 = a[r1 && force_zero_index]; /* BUGGY!!! */
+
+ The reason this is buggy is that "&&" and "||" are often compiled
+ using branches. While weak-memory machines such as ARM or PowerPC
+ do order stores after such branches, they can speculate loads,
+ which can result in misordering bugs.
+
+o Do not use the results from relational operators ("==", "!=",
+ ">", ">=", "<", or "<=") when dereferencing. For example,
+ the following (quite strange) code is buggy:
+
+ int a[2];
+ int index;
+ int flip_index = 0;
+
+ ...
+
+ r1 = rcu_dereference(i1)
+ r2 = a[r1 != flip_index]; /* BUGGY!!! */
+
+ As before, the reason this is buggy is that relational operators
+ are often compiled using branches. And as before, although
+ weak-memory machines such as ARM or PowerPC do order stores
+ after such branches, but can speculate loads, which can again
+ result in misordering bugs.
+
+o Be very careful about comparing pointers obtained from
+ rcu_dereference() against non-NULL values. As Linus Torvalds
+ explained, if the two pointers are equal, the compiler could
+ substitute the pointer you are comparing against for the pointer
+ obtained from rcu_dereference(). For example:
+
+ p = rcu_dereference(gp);
+ if (p == &default_struct)
+ do_default(p->a);
+
+ Because the compiler now knows that the value of "p" is exactly
+ the address of the variable "default_struct", it is free to
+ transform this code into the following:
+
+ p = rcu_dereference(gp);
+ if (p == &default_struct)
+ do_default(default_struct.a);
+
+ On ARM and Power hardware, the load from "default_struct.a"
+ can now be speculated, such that it might happen before the
+ rcu_dereference(). This could result in bugs due to misordering.
+
+ However, comparisons are OK in the following cases:
+
+ o The comparison was against the NULL pointer. If the
+ compiler knows that the pointer is NULL, you had better
+ not be dereferencing it anyway. If the comparison is
+ non-equal, the compiler is none the wiser. Therefore,
+ it is safe to compare pointers from rcu_dereference()
+ against NULL pointers.
+
+ o The pointer is never dereferenced after being compared.
+ Since there are no subsequent dereferences, the compiler
+ cannot use anything it learned from the comparison
+ to reorder the non-existent subsequent dereferences.
+ This sort of comparison occurs frequently when scanning
+ RCU-protected circular linked lists.
+
+ o The comparison is against a pointer that references memory
+ that was initialized "a long time ago." The reason
+ this is safe is that even if misordering occurs, the
+ misordering will not affect the accesses that follow
+ the comparison. So exactly how long ago is "a long
+ time ago"? Here are some possibilities:
+
+ o Compile time.
+
+ o Boot time.
+
+ o Module-init time for module code.
+
+ o Prior to kthread creation for kthread code.
+
+ o During some prior acquisition of the lock that
+ we now hold.
+
+ o Before mod_timer() time for a timer handler.
+
+ There are many other possibilities involving the Linux
+ kernel's wide array of primitives that cause code to
+ be invoked at a later time.
+
+ o The pointer being compared against also came from
+ rcu_dereference(). In this case, both pointers depend
+ on one rcu_dereference() or another, so you get proper
+ ordering either way.
+
+ That said, this situation can make certain RCU usage
+ bugs more likely to happen. Which can be a good thing,
+ at least if they happen during testing. An example
+ of such an RCU usage bug is shown in the section titled
+ "EXAMPLE OF AMPLIFIED RCU-USAGE BUG".
+
+ o All of the accesses following the comparison are stores,
+ so that a control dependency preserves the needed ordering.
+ That said, it is easy to get control dependencies wrong.
+ Please see the "CONTROL DEPENDENCIES" section of
+ Documentation/memory-barriers.txt for more details.
+
+ o The pointers are not equal -and- the compiler does
+ not have enough information to deduce the value of the
+ pointer. Note that the volatile cast in rcu_dereference()
+ will normally prevent the compiler from knowing too much.
+
+o Disable any value-speculation optimizations that your compiler
+ might provide, especially if you are making use of feedback-based
+ optimizations that take data collected from prior runs. Such
+ value-speculation optimizations reorder operations by design.
+
+ There is one exception to this rule: Value-speculation
+ optimizations that leverage the branch-prediction hardware are
+ safe on strongly ordered systems (such as x86), but not on weakly
+ ordered systems (such as ARM or Power). Choose your compiler
+ command-line options wisely!
+
+
+EXAMPLE OF AMPLIFIED RCU-USAGE BUG
+
+Because updaters can run concurrently with RCU readers, RCU readers can
+see stale and/or inconsistent values. If RCU readers need fresh or
+consistent values, which they sometimes do, they need to take proper
+precautions. To see this, consider the following code fragment:
+
+ struct foo {
+ int a;
+ int b;
+ int c;
+ };
+ struct foo *gp1;
+ struct foo *gp2;
+
+ void updater(void)
+ {
+ struct foo *p;
+
+ p = kmalloc(...);
+ if (p == NULL)
+ deal_with_it();
+ p->a = 42; /* Each field in its own cache line. */
+ p->b = 43;
+ p->c = 44;
+ rcu_assign_pointer(gp1, p);
+ p->b = 143;
+ p->c = 144;
+ rcu_assign_pointer(gp2, p);
+ }
+
+ void reader(void)
+ {
+ struct foo *p;
+ struct foo *q;
+ int r1, r2;
+
+ p = rcu_dereference(gp2);
+ if (p == NULL)
+ return;
+ r1 = p->b; /* Guaranteed to get 143. */
+ q = rcu_dereference(gp1); /* Guaranteed non-NULL. */
+ if (p == q) {
+ /* The compiler decides that q->c is same as p->c. */
+ r2 = p->c; /* Could get 44 on weakly order system. */
+ }
+ do_something_with(r1, r2);
+ }
+
+You might be surprised that the outcome (r1 == 143 && r2 == 44) is possible,
+but you should not be. After all, the updater might have been invoked
+a second time between the time reader() loaded into "r1" and the time
+that it loaded into "r2". The fact that this same result can occur due
+to some reordering from the compiler and CPUs is beside the point.
+
+But suppose that the reader needs a consistent view?
+
+Then one approach is to use locking, for example, as follows:
+
+ struct foo {
+ int a;
+ int b;
+ int c;
+ spinlock_t lock;
+ };
+ struct foo *gp1;
+ struct foo *gp2;
+
+ void updater(void)
+ {
+ struct foo *p;
+
+ p = kmalloc(...);
+ if (p == NULL)
+ deal_with_it();
+ spin_lock(&p->lock);
+ p->a = 42; /* Each field in its own cache line. */
+ p->b = 43;
+ p->c = 44;
+ spin_unlock(&p->lock);
+ rcu_assign_pointer(gp1, p);
+ spin_lock(&p->lock);
+ p->b = 143;
+ p->c = 144;
+ spin_unlock(&p->lock);
+ rcu_assign_pointer(gp2, p);
+ }
+
+ void reader(void)
+ {
+ struct foo *p;
+ struct foo *q;
+ int r1, r2;
+
+ p = rcu_dereference(gp2);
+ if (p == NULL)
+ return;
+ spin_lock(&p->lock);
+ r1 = p->b; /* Guaranteed to get 143. */
+ q = rcu_dereference(gp1); /* Guaranteed non-NULL. */
+ if (p == q) {
+ /* The compiler decides that q->c is same as p->c. */
+ r2 = p->c; /* Locking guarantees r2 == 144. */
+ }
+ spin_unlock(&p->lock);
+ do_something_with(r1, r2);
+ }
+
+As always, use the right tool for the job!
+
+
+EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH
+
+If a pointer obtained from rcu_dereference() compares not-equal to some
+other pointer, the compiler normally has no clue what the value of the
+first pointer might be. This lack of knowledge prevents the compiler
+from carrying out optimizations that otherwise might destroy the ordering
+guarantees that RCU depends on. And the volatile cast in rcu_dereference()
+should prevent the compiler from guessing the value.
+
+But without rcu_dereference(), the compiler knows more than you might
+expect. Consider the following code fragment:
+
+ struct foo {
+ int a;
+ int b;
+ };
+ static struct foo variable1;
+ static struct foo variable2;
+ static struct foo *gp = &variable1;
+
+ void updater(void)
+ {
+ initialize_foo(&variable2);
+ rcu_assign_pointer(gp, &variable2);
+ /*
+ * The above is the only store to gp in this translation unit,
+ * and the address of gp is not exported in any way.
+ */
+ }
+
+ int reader(void)
+ {
+ struct foo *p;
+
+ p = gp;
+ barrier();
+ if (p == &variable1)
+ return p->a; /* Must be variable1.a. */
+ else
+ return p->b; /* Must be variable2.b. */
+ }
+
+Because the compiler can see all stores to "gp", it knows that the only
+possible values of "gp" are "variable1" on the one hand and "variable2"
+on the other. The comparison in reader() therefore tells the compiler
+the exact value of "p" even in the not-equals case. This allows the
+compiler to make the return values independent of the load from "gp",
+in turn destroying the ordering between this load and the loads of the
+return values. This can result in "p->b" returning pre-initialization
+garbage values.
+
+In short, rcu_dereference() is -not- optional when you are going to
+dereference the resulting pointer.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 6f3a0057548e..68fe3ad27015 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -24,7 +24,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT
timing of the next warning for the current stall.
Stall-warning messages may be enabled and disabled completely via
- /sys/module/rcutree/parameters/rcu_cpu_stall_suppress.
+ /sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
CONFIG_RCU_CPU_STALL_VERBOSE
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 0f0fb7c432c2..49b8551a3b68 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -326,11 +326,11 @@ used as follows:
a. synchronize_rcu() rcu_read_lock() / rcu_read_unlock()
call_rcu() rcu_dereference()
-b. call_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh()
- rcu_dereference_bh()
+b. synchronize_rcu_bh() rcu_read_lock_bh() / rcu_read_unlock_bh()
+ call_rcu_bh() rcu_dereference_bh()
c. synchronize_sched() rcu_read_lock_sched() / rcu_read_unlock_sched()
- preempt_disable() / preempt_enable()
+ call_rcu_sched() preempt_disable() / preempt_enable()
local_irq_save() / local_irq_restore()
hardirq enter / hardirq exit
NMI enter / NMI exit
@@ -794,10 +794,22 @@ in docbook. Here is the list, by category.
RCU list traversal:
+ list_entry_rcu
+ list_first_entry_rcu
+ list_next_rcu
list_for_each_entry_rcu
+ list_for_each_entry_continue_rcu
+ hlist_first_rcu
+ hlist_next_rcu
+ hlist_pprev_rcu
hlist_for_each_entry_rcu
+ hlist_for_each_entry_rcu_bh
+ hlist_for_each_entry_continue_rcu
+ hlist_for_each_entry_continue_rcu_bh
+ hlist_nulls_first_rcu
hlist_nulls_for_each_entry_rcu
- list_for_each_entry_continue_rcu
+ hlist_bl_first_rcu
+ hlist_bl_for_each_entry_rcu
RCU pointer/list update:
@@ -806,28 +818,38 @@ RCU pointer/list update:
list_add_tail_rcu
list_del_rcu
list_replace_rcu
- hlist_del_rcu
hlist_add_after_rcu
hlist_add_before_rcu
hlist_add_head_rcu
+ hlist_del_rcu
+ hlist_del_init_rcu
hlist_replace_rcu
list_splice_init_rcu()
+ hlist_nulls_del_init_rcu
+ hlist_nulls_del_rcu
+ hlist_nulls_add_head_rcu
+ hlist_bl_add_head_rcu
+ hlist_bl_del_init_rcu
+ hlist_bl_del_rcu
+ hlist_bl_set_first_rcu
RCU: Critical sections Grace period Barrier
rcu_read_lock synchronize_net rcu_barrier
rcu_read_unlock synchronize_rcu
rcu_dereference synchronize_rcu_expedited
- call_rcu
- kfree_rcu
-
+ rcu_read_lock_held call_rcu
+ rcu_dereference_check kfree_rcu
+ rcu_dereference_protected
bh: Critical sections Grace period Barrier
rcu_read_lock_bh call_rcu_bh rcu_barrier_bh
rcu_read_unlock_bh synchronize_rcu_bh
rcu_dereference_bh synchronize_rcu_bh_expedited
-
+ rcu_dereference_bh_check
+ rcu_dereference_bh_protected
+ rcu_read_lock_bh_held
sched: Critical sections Grace period Barrier
@@ -835,7 +857,12 @@ sched: Critical sections Grace period Barrier
rcu_read_unlock_sched call_rcu_sched
[preempt_disable] synchronize_sched_expedited
[and friends]
+ rcu_read_lock_sched_notrace
+ rcu_read_unlock_sched_notrace
rcu_dereference_sched
+ rcu_dereference_sched_check
+ rcu_dereference_sched_protected
+ rcu_read_lock_sched_held
SRCU: Critical sections Grace period Barrier
@@ -843,6 +870,8 @@ SRCU: Critical sections Grace period Barrier
srcu_read_lock synchronize_srcu srcu_barrier
srcu_read_unlock call_srcu
srcu_dereference synchronize_srcu_expedited
+ srcu_dereference_check
+ srcu_read_lock_held
SRCU: Initialization/cleanup
init_srcu_struct
@@ -850,9 +879,13 @@ SRCU: Initialization/cleanup
All: lockdep-checked RCU-protected pointer access
- rcu_dereference_check
- rcu_dereference_protected
+ rcu_access_index
rcu_access_pointer
+ rcu_dereference_index_check
+ rcu_dereference_raw
+ rcu_lockdep_assert
+ rcu_sleep_check
+ RCU_NONIDLE
See the comment headers in the source code (or the docbook generated
from them) for more information.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 97cc8d6679b4..9ccd644c1234 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -383,7 +383,7 @@ extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;
-extern int debug_lockdep_rcu_enabled(void);
+int debug_lockdep_rcu_enabled(void);
/**
* rcu_read_lock_held() - might we be in RCU read-side critical section?
@@ -1004,6 +1004,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
* pointers, but you must use rcu_assign_pointer() to initialize the
* external-to-structure pointer -after- you have completely initialized
* the reader-accessible portions of the linked structure.
+ *
+ * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no
+ * ordering guarantees for either the CPU or the compiler.
*/
#define RCU_INIT_POINTER(p, v) \
do { \
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 431528520562..858c56569127 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
return;
rcp->ticks_this_gp++;
j = jiffies;
- js = rcp->jiffies_stall;
+ js = ACCESS_ONCE(rcp->jiffies_stall);
if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
@@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
dump_stack();
}
if (*rcp->curtail && ULONG_CMP_GE(j, js))
- rcp->jiffies_stall = jiffies +
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
else if (ULONG_CMP_GE(j, js))
- rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
}
static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
{
rcp->ticks_this_gp = 0;
rcp->gp_start = jiffies;
- rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+ ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
}
static void check_cpu_stalls(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 93e64381aa2a..3bbe48939e47 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
-static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp,
int (*f)(struct rcu_data *rsp, bool *isidle,
@@ -271,6 +271,15 @@ long rcu_batches_completed_bh(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
/*
+ * Force a quiescent state.
+ */
+void rcu_force_quiescent_state(void)
+{
+ force_quiescent_state(rcu_state);
+}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
+
+/*
* Force a quiescent state for RCU BH.
*/
void rcu_bh_force_quiescent_state(void)
@@ -372,6 +381,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
}
/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+ return &rsp->node[0];
+}
+
+/*
+ * Is there any need for future grace periods?
+ * Interrupts must be disabled. If the caller does not hold the root
+ * rnp_node structure's ->lock, the results are advisory only.
+ */
+static int rcu_future_needs_gp(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
+ int *fp = &rnp->need_future_gp[idx];
+
+ return ACCESS_ONCE(*fp);
+}
+
+/*
* Does the current CPU require a not-yet-started grace period?
* The caller must have disabled interrupts to prevent races with
* normal callback registry.
@@ -383,7 +414,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
if (rcu_gp_in_progress(rsp))
return 0; /* No, a grace period is already in progress. */
- if (rcu_nocb_needs_gp(rsp))
+ if (rcu_future_needs_gp(rsp))
return 1; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
return 0; /* No, this is a no-CBs (or offline) CPU. */
@@ -398,14 +429,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Return the root node of the specified rcu_state structure.
- */
-static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
-{
- return &rsp->node[0];
-}
-
-/*
* rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
*
* If the new value of the ->dynticks_nesting counter now is zero,
@@ -806,7 +829,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
{
rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
rcu_sysidle_check_cpu(rdp, isidle, maxj);
- return (rdp->dynticks_snap & 0x1) == 0;
+ if ((rdp->dynticks_snap & 0x1) == 0) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
+ return 1;
+ } else {
+ return 0;
+ }
}
/*
@@ -899,7 +927,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
rsp->gp_start = j;
smp_wmb(); /* Record start time before stall time. */
j1 = rcu_jiffies_till_stall_check();
- rsp->jiffies_stall = j + j1;
+ ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
rsp->jiffies_resched = j + j1 / 2;
}
@@ -938,12 +966,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
/* Only let one CPU complain about others per time interval. */
raw_spin_lock_irqsave(&rnp->lock, flags);
- delta = jiffies - rsp->jiffies_stall;
+ delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
- rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -980,9 +1008,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
- pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
+ pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start),
- rsp->gpnum, rsp->completed, totqlen);
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected == 0)
pr_err("INFO: Stall ended before state dump start\n");
else if (!trigger_all_cpu_backtrace())
@@ -995,12 +1023,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
force_quiescent_state(rsp); /* Kick them all. */
}
-/*
- * This function really isn't for public consumption, but RCU is special in
- * that context switches can allow the state machine to make progress.
- */
-extern void resched_cpu(int cpu);
-
static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
@@ -1019,14 +1041,15 @@ static void print_cpu_stall(struct rcu_state *rsp)
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
- pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
- jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
+ pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
+ jiffies - rsp->gp_start,
+ (long)rsp->gpnum, (long)rsp->completed, totqlen);
if (!trigger_all_cpu_backtrace())
dump_stack();
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
- rsp->jiffies_stall = jiffies +
+ if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
3 * rcu_jiffies_till_stall_check() + 3;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1110,7 +1133,7 @@ void rcu_cpu_stall_reset(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
+ ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
}
/*
@@ -1171,15 +1194,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
/*
* Start some future grace period, as needed to handle newly arrived
* callbacks. The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field.
+ * rcu_node structure's ->need_future_gp field. Returns true if there
+ * is reason to awaken the grace-period kthread.
*
* The caller must hold the specified rcu_node structure's ->lock.
*/
-static unsigned long __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+static bool __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long *c_out)
{
unsigned long c;
int i;
+ bool ret = false;
struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
/*
@@ -1190,7 +1216,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
if (rnp->need_future_gp[c & 0x1]) {
trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
- return c;
+ goto out;
}
/*
@@ -1204,7 +1230,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
- return c;
+ goto out;
}
/*
@@ -1245,12 +1271,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
} else {
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
- rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
}
unlock_out:
if (rnp != rnp_root)
raw_spin_unlock(&rnp_root->lock);
- return c;
+out:
+ if (c_out != NULL)
+ *c_out = c;
+ return ret;
}
/*
@@ -1274,25 +1303,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
}
/*
+ * Awaken the grace-period kthread for the specified flavor of RCU.
+ * Don't do a self-awaken, and don't bother awakening when there is
+ * nothing for the grace-period kthread to do (as in several CPUs
+ * raced to awaken, and we lost), and finally don't try to awaken
+ * a kthread that has not yet been created.
+ */
+static void rcu_gp_kthread_wake(struct rcu_state *rsp)
+{
+ if (current == rsp->gp_kthread ||
+ !ACCESS_ONCE(rsp->gp_flags) ||
+ !rsp->gp_kthread)
+ return;
+ wake_up(&rsp->gp_wq);
+}
+
+/*
* If there is room, assign a ->completed number to any callbacks on
* this CPU that have not already been assigned. Also accelerate any
* callbacks that were previously assigned a ->completed number that has
* since proven to be too conservative, which can happen if callbacks get
* assigned a ->completed number while RCU is idle, but with reference to
* a non-root rcu_node structure. This function is idempotent, so it does
- * not hurt to call it repeatedly.
+ * not hurt to call it repeatedly. Returns an flag saying that we should
+ * awaken the RCU grace-period kthread.
*
* The caller must hold rnp->lock with interrupts disabled.
*/
-static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
unsigned long c;
int i;
+ bool ret;
/* If the CPU has no callbacks, nothing to do. */
if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return;
+ return false;
/*
* Starting from the sublist containing the callbacks most
@@ -1321,7 +1368,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* be grouped into.
*/
if (++i >= RCU_NEXT_TAIL)
- return;
+ return false;
/*
* Assign all subsequent callbacks' ->completed number to the next
@@ -1333,13 +1380,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->nxtcompleted[i] = c;
}
/* Record any needed additional grace periods. */
- rcu_start_future_gp(rnp, rdp);
+ ret = rcu_start_future_gp(rnp, rdp, NULL);
/* Trace depending on how much we were able to accelerate. */
if (!*rdp->nxttail[RCU_WAIT_TAIL])
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
else
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
+ return ret;
}
/*
@@ -1348,17 +1396,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
* assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
* sublist. This function is idempotent, so it does not hurt to
* invoke it repeatedly. As long as it is not invoked -too- often...
+ * Returns true if the RCU grace-period kthread needs to be awakened.
*
* The caller must hold rnp->lock with interrupts disabled.
*/
-static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
int i, j;
/* If the CPU has no callbacks, nothing to do. */
if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
- return;
+ return false;
/*
* Find all callbacks whose ->completed numbers indicate that they
@@ -1382,26 +1431,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
}
/* Classify any remaining callbacks. */
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ return rcu_accelerate_cbs(rsp, rnp, rdp);
}
/*
* Update CPU-local rcu_data state to record the beginnings and ends of
* grace periods. The caller must hold the ->lock of the leaf rcu_node
* structure corresponding to the current CPU, and must have irqs disabled.
+ * Returns true if the grace-period kthread needs to be awakened.
*/
-static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
+ bool ret;
+
/* Handle the ends of any preceding grace periods first. */
if (rdp->completed == rnp->completed) {
/* No grace period end, so just accelerate recent callbacks. */
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ ret = rcu_accelerate_cbs(rsp, rnp, rdp);
} else {
/* Advance callbacks. */
- rcu_advance_cbs(rsp, rnp, rdp);
+ ret = rcu_advance_cbs(rsp, rnp, rdp);
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
@@ -1420,11 +1473,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
}
+ return ret;
}
static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
+ bool needwake;
struct rcu_node *rnp;
local_irq_save(flags);
@@ -1436,8 +1491,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
return;
}
smp_mb__after_unlock_lock();
- __note_gp_changes(rsp, rnp, rdp);
+ needwake = __note_gp_changes(rsp, rnp, rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
/*
@@ -1451,12 +1508,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
rcu_bind_gp_kthread();
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- if (rsp->gp_flags == 0) {
+ if (!ACCESS_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
return 0;
}
- rsp->gp_flags = 0; /* Clear all flags: New grace period. */
+ ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
/*
@@ -1501,7 +1558,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
WARN_ON_ONCE(rnp->completed != rsp->completed);
ACCESS_ONCE(rnp->completed) = rsp->completed;
if (rnp == rdp->mynode)
- __note_gp_changes(rsp, rnp, rdp);
+ (void)__note_gp_changes(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
rnp->level, rnp->grplo,
@@ -1549,7 +1606,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq(&rnp->lock);
smp_mb__after_unlock_lock();
- rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
raw_spin_unlock_irq(&rnp->lock);
}
return fqs_state;
@@ -1561,6 +1618,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
+ bool needgp = false;
int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1596,7 +1654,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
ACCESS_ONCE(rnp->completed) = rsp->gpnum;
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
- __note_gp_changes(rsp, rnp, rdp);
+ needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
@@ -1612,9 +1670,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
- rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
- if (cpu_needs_another_gp(rsp, rdp)) {
- rsp->gp_flags = RCU_GP_FLAG_INIT;
+ /* Advance CBs to reduce false positives below. */
+ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
+ if (needgp || cpu_needs_another_gp(rsp, rdp)) {
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
trace_rcu_grace_period(rsp->name,
ACCESS_ONCE(rsp->gpnum),
TPS("newreq"));
@@ -1715,14 +1774,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
}
-static void rsp_wakeup(struct irq_work *work)
-{
- struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
-
- /* Wake up rcu_gp_kthread() to start the grace period. */
- wake_up(&rsp->gp_wq);
-}
-
/*
* Start a new RCU grace period if warranted, re-initializing the hierarchy
* in preparation for detecting the next grace period. The caller must hold
@@ -1731,8 +1782,10 @@ static void rsp_wakeup(struct irq_work *work)
* Note that it is legal for a dying CPU (which is marked as offline) to
* invoke this function. This can happen when the dying CPU reports its
* quiescent state.
+ *
+ * Returns true if the grace-period kthread must be awakened.
*/
-static void
+static bool
rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp)
{
@@ -1743,20 +1796,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
* or a grace period is already in progress.
* Either way, don't start a new grace period.
*/
- return;
+ return false;
}
- rsp->gp_flags = RCU_GP_FLAG_INIT;
+ ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
TPS("newreq"));
/*
* We can't do wakeups while holding the rnp->lock, as that
* could cause possible deadlocks with the rq->lock. Defer
- * the wakeup to interrupt context. And don't bother waking
- * up the running kthread.
+ * the wakeup to our caller.
*/
- if (current != rsp->gp_kthread)
- irq_work_queue(&rsp->wakeup_work);
+ return true;
}
/*
@@ -1765,12 +1816,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
* is invoked indirectly from rcu_advance_cbs(), which would result in
* endless recursion -- or would do so if it wasn't for the self-deadlock
* that is encountered beforehand.
+ *
+ * Returns true if the grace-period kthread needs to be awakened.
*/
-static void
-rcu_start_gp(struct rcu_state *rsp)
+static bool rcu_start_gp(struct rcu_state *rsp)
{
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
+ bool ret = false;
/*
* If there is no grace period in progress right now, any
@@ -1780,8 +1833,9 @@ rcu_start_gp(struct rcu_state *rsp)
* resulting in pointless grace periods. So, advance callbacks
* then start the grace period!
*/
- rcu_advance_cbs(rsp, rnp, rdp);
- rcu_start_gp_advanced(rsp, rnp, rdp);
+ ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
+ ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
+ return ret;
}
/*
@@ -1870,6 +1924,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
+ bool needwake;
struct rcu_node *rnp;
rnp = rdp->mynode;
@@ -1898,9 +1953,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
}
@@ -2001,7 +2058,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
{
int i;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
/* No-CBs CPUs are handled specially. */
if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2370,7 +2427,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
- rsp->gp_flags |= RCU_GP_FLAG_FQS;
+ ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
}
@@ -2384,7 +2441,8 @@ static void
__rcu_process_callbacks(struct rcu_state *rsp)
{
unsigned long flags;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ bool needwake;
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
WARN_ON_ONCE(rdp->beenonline == 0);
@@ -2395,8 +2453,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)
local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
- rcu_start_gp(rsp);
+ needwake = rcu_start_gp(rsp);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
} else {
local_irq_restore(flags);
}
@@ -2454,6 +2514,8 @@ static void invoke_rcu_core(void)
static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
struct rcu_head *head, unsigned long flags)
{
+ bool needwake;
+
/*
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
@@ -2483,8 +2545,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
raw_spin_lock(&rnp_root->lock);
smp_mb__after_unlock_lock();
- rcu_start_gp(rsp);
+ needwake = rcu_start_gp(rsp);
raw_spin_unlock(&rnp_root->lock);
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
} else {
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
@@ -2587,6 +2651,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
EXPORT_SYMBOL_GPL(call_rcu_bh);
/*
+ * Queue an RCU callback for lazy invocation after a grace period.
+ * This will likely be later named something like "call_rcu_lazy()",
+ * but this change will require some way of tagging the lazy RCU
+ * callbacks in the list of pending callbacks. Until then, this
+ * function may only be called from __kfree_rcu().
+ */
+void kfree_call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu))
+{
+ __call_rcu(head, func, rcu_state, -1, 1);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu);
+
+/*
* Because a context switch is a grace period for RCU-sched and RCU-bh,
* any blocking grace-period wait automatically implies a grace period
* if there is only one CPU online at any point time during execution
@@ -3038,7 +3116,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
static void rcu_barrier_func(void *type)
{
struct rcu_state *rsp = type;
- struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
+ struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
_rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
atomic_inc(&rsp->barrier_cpu_count);
@@ -3210,7 +3288,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
* that this CPU cannot possibly have any RCU callbacks in flight yet.
*/
static void
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
unsigned long mask;
@@ -3223,7 +3301,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
rdp->beenonline = 1; /* We have now been online. */
- rdp->preemptible = preemptible;
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
@@ -3267,8 +3344,7 @@ static void rcu_prepare_cpu(int cpu)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- rcu_init_percpu_data(cpu, rsp,
- strcmp(rsp->name, "rcu_preempt") == 0);
+ rcu_init_percpu_data(cpu, rsp);
}
/*
@@ -3452,8 +3528,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->qsmaskinit = 0;
rnp->grplo = j * cpustride;
rnp->grphi = (j + 1) * cpustride - 1;
- if (rnp->grphi >= NR_CPUS)
- rnp->grphi = NR_CPUS - 1;
+ if (rnp->grphi >= nr_cpu_ids)
+ rnp->grphi = nr_cpu_ids - 1;
if (i == 0) {
rnp->grpnum = 0;
rnp->grpmask = 0;
@@ -3472,7 +3548,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rsp->rda = rda;
init_waitqueue_head(&rsp->gp_wq);
- init_irq_work(&rsp->wakeup_work, rsp_wakeup);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index c2fd1e722879..bf2c1e669691 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -252,7 +252,6 @@ struct rcu_data {
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
- bool preemptible; /* Preemptible RCU? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -463,7 +462,6 @@ struct rcu_state {
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
struct list_head flavors; /* List of RCU flavors. */
- struct irq_work wakeup_work; /* Postponed wakeups */
};
/* Values for rcu_state structure's gp_flags field. */
@@ -553,7 +551,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
-static int rcu_nocb_needs_gp(struct rcu_state *rsp);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 962d1d589929..2e579c38bd91 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -149,15 +149,6 @@ long rcu_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed);
/*
- * Force a quiescent state for preemptible RCU.
- */
-void rcu_force_quiescent_state(void)
-{
- force_quiescent_state(&rcu_preempt_state);
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
@@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu);
-/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
- */
-void kfree_call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_preempt_state, -1, 1);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
-
/**
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -991,16 +968,6 @@ long rcu_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed);
/*
- * Force a quiescent state for RCU, which, because there is no preemptible
- * RCU, becomes the same as rcu-sched.
- */
-void rcu_force_quiescent_state(void)
-{
- rcu_sched_force_quiescent_state();
-}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-
-/*
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
@@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu)
}
/*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
- *
- * Because there is no preemptible RCU, we use RCU-sched instead.
- */
-void kfree_call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
-{
- __call_rcu(head, func, &rcu_sched_state, -1, 1);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
-
-/*
* Wait for an rcu-preempt grace period, but make it happen quickly.
* But because preemptible RCU does not exist, map to rcu-sched.
*/
@@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
static void rcu_prepare_for_idle(int cpu)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
+ bool needwake;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
struct rcu_node *rnp;
@@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu)
rnp = rdp->mynode;
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
smp_mb__after_unlock_lock();
- rcu_accelerate_cbs(rsp, rnp, rdp);
+ needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ if (needwake)
+ rcu_gp_kthread_wake(rsp);
}
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
@@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused)
struct rcu_data *rdp;
for_each_rcu_flavor(rsp) {
- rdp = __this_cpu_ptr(rsp->rda);
+ rdp = raw_cpu_ptr(rsp->rda);
if (rdp->qlen_lazy != 0) {
atomic_inc(&oom_callback_count);
rsp->call(&rdp->oom_head, rcu_oom_callback);
@@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
+ raw_cpu_inc(rsp->rda->ticks_this_gp);
}
#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg)
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
- * Do any no-CBs CPUs need another grace period?
- *
- * Interrupts must be disabled. If the caller does not hold the root
- * rnp_node structure's ->lock, the results are advisory only.
- */
-static int rcu_nocb_needs_gp(struct rcu_state *rsp)
-{
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
-}
-
-/*
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
@@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
}
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-/* Is the specified CPU a no-CPUs CPU? */
+/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
if (have_rcu_nocb_mask)
@@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
unsigned long c;
bool d;
unsigned long flags;
+ bool needwake;
struct rcu_node *rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- c = rcu_start_future_gp(rnp, rdp);
+ needwake = rcu_start_future_gp(rnp, rdp, &c);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (needwake)
+ rcu_gp_kthread_wake(rdp->rsp);
/*
* Wait for the grace period. Do so interruptibly to avoid messing
@@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static int rcu_nocb_needs_gp(struct rcu_state *rsp)
-{
- return 0;
-}
-
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
}
@@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
}
/*
- * Bind the grace-period kthread for the sysidle flavor of RCU to the
- * timekeeping CPU.
- */
-static void rcu_bind_gp_kthread(void)
-{
- int cpu = ACCESS_ONCE(tick_do_timer_cpu);
-
- if (cpu < 0 || cpu >= nr_cpu_ids)
- return;
- if (raw_smp_processor_id() != cpu)
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
-}
-
-/*
* Return a delay in jiffies based on the number of CPUs, rcu_node
* leaf fanout, and jiffies tick rate. The idea is to allow larger
* systems more time to transition to full-idle state in order to
@@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j)
static void rcu_sysidle_cancel(void)
{
smp_mb();
- ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
+ if (full_sysidle_state > RCU_SYSIDLE_SHORT)
+ ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
}
/*
@@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
return false;
}
-static void rcu_bind_gp_kthread(void)
-{
-}
-
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj)
{
@@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
#endif /* #ifdef CONFIG_NO_HZ_FULL */
return 0;
}
+
+/*
+ * Bind the grace-period kthread for the sysidle flavor of RCU to the
+ * timekeeping CPU.
+ */
+static void rcu_bind_gp_kthread(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+
+ if (cpu < 0 || cpu >= nr_cpu_ids)
+ return;
+ if (raw_smp_processor_id() != cpu)
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b50990a5bea0..b9b2d4906848 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -232,7 +232,6 @@ asmlinkage void __do_softirq(void)
bool in_hardirq;
__u32 pending;
int softirq_bit;
- int cpu;
/*
* Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -247,7 +246,6 @@ asmlinkage void __do_softirq(void)
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
- cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
@@ -276,11 +274,11 @@ restart:
prev_count, preempt_count());
preempt_count_set(prev_count);
}
- rcu_bh_qs(cpu);
h++;
pending >>= softirq_bit;
}
+ rcu_bh_qs(smp_processor_id());
local_irq_disable();
pending = local_softirq_pending();