From b8c17e6664c461e4aed545a943304c3b32dd309c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Nov 2016 14:25:21 -0800 Subject: rcu: Maintain special bits at bottom of ->dynticks counter Currently, IPIs are used to force other CPUs to invalidate their TLBs in response to a kernel virtual-memory mapping change. This works, but degrades both battery lifetime (for idle CPUs) and real-time response (for nohz_full CPUs), and in addition results in unnecessary IPIs because CPUs executing in usermode are unaffected by stale kernel mappings. It would be better to cause a CPU executing in usermode to wait until it is entering kernel mode to do the flush, first to avoid interrupting usermode tasks and second to handle multiple flush requests with a single flush in the case of a long-running user task. This commit therefore reserves a bit at the bottom of the ->dynticks counter, which is checked upon exit from extended quiescent states. If it is set, it is cleared and then a new rcu_eqs_special_exit() macro is invoked, which, if not supplied, is an empty single-pass do-while loop. If this bottom bit is set on -entry- to an extended quiescent state, then a WARN_ON_ONCE() triggers. This bottom bit may be set using a new rcu_eqs_special_set() function, which returns true if the bit was set, or false if the CPU turned out to not be in an extended quiescent state. Please note that this function refuses to set the bit for a non-nohz_full CPU when that CPU is executing in usermode because usermode execution is tracked by RCU as a dyntick-idle extended quiescent state only for nohz_full CPUs. Reported-by: Andy Lutomirski Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b452953e21c8..6c9d941e3962 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) return 0; } +static inline bool rcu_eqs_special_set(int cpu) +{ + return false; /* Never flag non-existent other CPUs! */ +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; -- cgit v1.2.3 From 77e5849688670280b173bb9e0544e9da7b2acc36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Jan 2017 13:32:50 -0800 Subject: rcu: Make arch select smp_mb__after_unlock_lock() strength The definition of smp_mb__after_unlock_lock() is currently smp_mb() for CONFIG_PPC and a no-op otherwise. It would be better to instead provide an architecture-selectable Kconfig option, and select the strength of smp_mb__after_unlock_lock() based on that option. This commit therefore creates ARCH_WEAK_RELEASE_ACQUIRE, has PPC select it, and bases the definition of smp_mb__after_unlock_lock() on this new ARCH_WEAK_RELEASE_ACQUIRE Kconfig option. Reported-by: Ingo Molnar Signed-off-by: Paul E.
McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: Boqun Feng Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman Cc: Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974..e6146d0074f8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1127,11 +1127,11 @@ do { \ * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ -#ifdef CONFIG_PPC +#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ +#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ +#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #endif /* __LINUX_RCUPDATE_H */ -- cgit v1.2.3 From 900b1028ec388e50c98200641ae4274794c807cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Feb 2017 14:32:54 -0800 Subject: srcu: Allow SRCU to access rcu_scheduler_active This is primarily a code-movement commit in preparation for allowing SRCU to handle early-boot SRCU grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6c9d941e3962..5219be250f00 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -217,14 +217,14 @@ static inline void exit_rcu(void) { } -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) extern int rcu_scheduler_active __read_mostly; void rcu_scheduler_starting(void); -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ static inline void rcu_scheduler_starting(void) { } -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) -- cgit v1.2.3 From c2a8ec0778b2ca0d360ba9b5cac7fcd5ddfe798f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Mar 2017 15:31:55 -0800 Subject: srcu: Move to state-based grace-period sequencing The current SRCU grace-period processing might never reach the last portion of srcu_advance_batches(). This is OK given the current implementation, as the first portion, up to the try_check_zero() following the srcu_flip() is sufficient to drive grace periods forward. However, it has the unfortunate side-effect of making it impossible to determine when a given grace period has ended, and it will be necessary to efficiently trace ends of grace periods in order to efficiently handle per-CPU SRCU callback lists. This commit therefore adds states to the SRCU grace-period processing, so that the end of a given SRCU grace period is marked by the transition to the SRCU_STATE_DONE state. Signed-off-by: Paul E. 
McKenney --- include/linux/srcu.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a598cf3ac70c..f149a685896c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -48,7 +48,7 @@ struct srcu_struct { unsigned long completed; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ - bool running; + int srcu_state; /* callbacks just queued */ struct rcu_batch batch_queue; /* callbacks try to do the first check_zero */ @@ -62,6 +62,12 @@ struct srcu_struct { #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; +/* Values for -> state variable. */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 +#define SRCU_STATE_DONE 3 + #ifdef CONFIG_DEBUG_LOCK_ALLOC int __init_srcu_struct(struct srcu_struct *sp, const char *name, @@ -89,7 +95,7 @@ void process_srcu(struct work_struct *work); .completed = -300, \ .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .running = false, \ + .srcu_state = SRCU_STATE_IDLE, \ .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ -- cgit v1.2.3 From ac367c1c621b75689f6d5cd8301d364ba2c9f292 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 11 Mar 2017 07:14:06 -0800 Subject: srcu: Add grace-period sequence numbers This commit adds grace-period sequence numbers, which will be used to handle mid-boot grace periods and per-CPU callback lists. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f149a685896c..047ac8c28a4e 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -46,6 +46,7 @@ struct rcu_batch { struct srcu_struct { unsigned long completed; + unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ int srcu_state; -- cgit v1.2.3 From 8660b7d8a545227fd9ee80508aa82528ea9947d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Mar 2017 16:48:18 -0700 Subject: srcu: Use rcu_segcblist to track SRCU callbacks This commit switches SRCU from custom-built callback queues to the new rcu_segcblist structure. This change associates grace-period sequence numbers with groups of callbacks, which will be needed for efficient processing of per-CPU callbacks. Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 678 ++++++++++++++++++++++++++++++++++++++++++ include/linux/srcu.h | 24 +- 2 files changed, 683 insertions(+), 19 deletions(-) create mode 100644 include/linux/rcu_segcblist.h (limited to 'include') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h new file mode 100644 index 000000000000..74b1e7243955 --- /dev/null +++ b/include/linux/rcu_segcblist.h @@ -0,0 +1,678 @@ +/* + * RCU segmented callback lists + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#ifndef __KERNEL_RCU_SEGCBLIST_H +#define __KERNEL_RCU_SEGCBLIST_H + +/* Simple unsegmented callback lists. */ +struct rcu_cblist { + struct rcu_head *head; + struct rcu_head **tail; + long len; + long len_lazy; +}; + +#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } + +/* Initialize simple callback list. */ +static inline void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* Is simple callback list empty? */ +static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) +{ + return !rclp->head; +} + +/* Return number of callbacks in simple callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return rclp->len; +} + +/* Return number of lazy callbacks in simple callback list. */ +static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) +{ + return rclp->len_lazy; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(rcu_cblist_empty(rclp)); + return rclp->tail; +} + +/* Complicated segmented callback lists. ;-) */ + +/* + * Index values for segments in rcu_segcblist structure. + * + * The segments are as follows: + * + * [head, *tails[RCU_DONE_TAIL]): + * Callbacks whose grace period has elapsed, and thus can be invoked. + * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): + * Callbacks waiting for the current GP from the current CPU's viewpoint. 
+ * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): + * Callbacks that arrived before the next GP started, again from + * the current CPU's viewpoint. These can be handled by the next GP. + * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): + * Callbacks that might have arrived after the next GP started. + * There is some uncertainty as to when a given GP starts and + * ends, but a CPU knows the exact times if it is the one starting + * or ending the GP. Other CPUs know that the previous GP ends + * before the next one starts. + * + * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also + * empty. + * + * The ->gp_seq[] array contains the grace-period number at which the + * corresponding segment of callbacks will be ready to invoke. A given + * element of this array is meaningful only when the corresponding segment + * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks + * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have + * not yet been assigned a grace-period number). + */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_CBLIST_NSEGS 4 + +struct rcu_segcblist { + struct rcu_head *head; + struct rcu_head **tails[RCU_CBLIST_NSEGS]; + unsigned long gp_seq[RCU_CBLIST_NSEGS]; + long len; + long len_lazy; +}; + +#define RCU_SEGCBLIST_INITIALIZER(n) \ +{ \ + .head = NULL, \ + .tails[RCU_DONE_TAIL] = &n.head, \ + .tails[RCU_WAIT_TAIL] = &n.head, \ + .tails[RCU_NEXT_READY_TAIL] = &n.head, \ + .tails[RCU_NEXT_TAIL] = &n.head, \ +} + +/* + * Initialize an rcu_segcblist structure. + */ +static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? 
+ */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +static inline struct rcu_head * +rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +static inline struct rcu_head * +rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. 
+ */ +static inline struct rcu_head * +rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? + */ +static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +static inline void +rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. 
*/ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Move the entire contents of the specified rcu_segcblist structure, + * counts, callbacks, and all, to the specified rcu_cblist structure. + * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? + * @@@ Memory barrier needed? (Not if only used at boot time...) + */ +static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rcu_segcblist_extract_done_cbs(rsclp, rclp); + rcu_segcblist_extract_pend_cbs(rsclp, rclp); + rcu_segcblist_extract_count(rsclp, rclp); +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. 
*/ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number at which new callbacks would become + * ready to invoke. + */ +static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. 
+ */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} + +#endif /* __KERNEL_RCU_SEGCBLIST_H */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 047ac8c28a4e..ad154a7bc114 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -22,7 +22,7 @@ * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -32,31 +32,20 @@ #include #include #include +#include struct srcu_array { unsigned long lock_count[2]; unsigned long unlock_count[2]; }; -struct rcu_batch { - struct rcu_head *head, **tail; -}; - -#define RCU_BATCH_INIT(name) { NULL, &(name.head) } - struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + spinlock_t queue_lock; /* protect ->srcu_cblist, ->srcu_state */ int srcu_state; - /* callbacks just queued */ - struct rcu_batch batch_queue; - /* callbacks try to do the first check_zero */ - struct rcu_batch batch_check0; - /* callbacks done with the first check_zero and the flip */ - struct rcu_batch batch_check1; - struct rcu_batch batch_done; + struct rcu_segcblist srcu_cblist; struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -97,10 +86,7 @@ void process_srcu(struct work_struct *work); .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ .srcu_state = SRCU_STATE_IDLE, \ - .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ - .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ - .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ - .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ __SRCU_DEP_MAP_INIT(name) \ } -- cgit v1.2.3 From f2425b4efb0c69e77c0b9666b605ae4a1ecaae47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 12:42:30 -0700 Subject: srcu: Move combining-tree definitions for SRCU's benefit This commit moves the C preprocessor code that defines the default shape of the rcu_node combining tree to a new include/linux/rcu_node_tree.h file as a first step towards enabling SRCU to create its own combining tree, which in turn enables SRCU to implement per-CPU callback handling, thus avoiding contention on the lock currently guarding the single list of callbacks. Note that users of SRCU still need to know the size of the srcu_struct structure, hence include/linux rather than kernel/rcu. This commit is code-movement only. Signed-off-by: Paul E. McKenney --- include/linux/rcu_node_tree.h | 102 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 include/linux/rcu_node_tree.h (limited to 'include') diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h new file mode 100644 index 000000000000..b7eb97096b1c --- /dev/null +++ b/include/linux/rcu_node_tree.h @@ -0,0 +1,102 @@ +/* + * RCU node combining tree definitions. 
These are used to compute + * global attributes while avoiding common-case global contention. A key + * property that these computations rely on is a tournament-style approach + * where only one of the tasks contending a lower level in the tree need + * advance to the next higher level. If properly configured, this allows + * unlimited scalability while maintaining a constant level of contention + * on the root node. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Author: Paul E. McKenney + */ + +#ifndef __LINUX_RCU_NODE_TREE_H +#define __LINUX_RCU_NODE_TREE_H + +/* + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. + */ + +#ifdef CONFIG_RCU_FANOUT +#define RCU_FANOUT CONFIG_RCU_FANOUT +#else /* #ifdef CONFIG_RCU_FANOUT */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT 64 +# else +# define RCU_FANOUT 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT */ + +#ifdef CONFIG_RCU_FANOUT_LEAF +#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ +#define RCU_FANOUT_LEAF 16 +#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ + +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 +# define RCU_NUM_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +#elif NR_CPUS <= RCU_FANOUT_2 +# define RCU_NUM_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +#elif NR_CPUS <= RCU_FANOUT_3 +# define RCU_NUM_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +#elif NR_CPUS <= RCU_FANOUT_4 +# define RCU_NUM_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define 
NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ + +extern int rcu_num_lvls; +extern int rcu_num_nodes; + +#endif /* __LINUX_RCU_NODE_TREE_H */ -- cgit v1.2.3 From 2b34c43cc1671c59bad6dd1682ae3ee4f0919eb7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 14:29:53 -0700 Subject: srcu: Move rcu_init_levelspread() to rcu_tree_node.h This commit moves the rcu_init_levelspread() function from kernel/rcu/tree.c to kernel/rcu/rcu.h so that SRCU can access it. This is another step towards enabling SRCU to create its own combining tree. This commit is code-movement only, give or take knock-on adjustments. Signed-off-by: Paul E. McKenney --- include/linux/rcu_node_tree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h index b7eb97096b1c..4b766b61e1a0 100644 --- a/include/linux/rcu_node_tree.h +++ b/include/linux/rcu_node_tree.h @@ -96,7 +96,4 @@ # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -extern int rcu_num_lvls; -extern int rcu_num_nodes; - #endif /* __LINUX_RCU_NODE_TREE_H */ -- cgit v1.2.3 From 80a7956fe36c2ee40c6ff12c77926d267802b7c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Mar 2017 15:26:18 -0700 Subject: srcu: Merge ->srcu_state into ->srcu_gp_seq Updating ->srcu_state and ->srcu_gp_seq will lead to extremely complex race conditions given multiple callback queues, so this commit takes advantage of the two-bit state now available in rcu_seq counters to store the state in the bottom two bits of ->srcu_gp_seq. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ad154a7bc114..e7dbc01b61a1 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -43,8 +43,7 @@ struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist, ->srcu_state */ - int srcu_state; + spinlock_t queue_lock; /* protect ->srcu_cblist */ struct rcu_segcblist srcu_cblist; struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -56,7 +55,6 @@ struct srcu_struct { #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define SRCU_STATE_DONE 3 #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -85,7 +83,6 @@ void process_srcu(struct work_struct *work); .completed = -300, \ .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_state = SRCU_STATE_IDLE, \ .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ __SRCU_DEP_MAP_INIT(name) \ -- cgit v1.2.3 From f60d231a87c5c9f23f10e69996f396d46f5bf901 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 24 Mar 2017 13:46:33 -0700 Subject: srcu: Crude control of expedited grace periods SRCU's implementation of expedited grace periods has always assumed that the SRCU instance is idle when the expedited request arrives. This commit improves this a bit by maintaining a count of the number of outstanding expedited requests, thus allowing prior non-expedited grace periods accommodate these requests by shifting to expedited mode. However, any non-expedited wait already in progress will still wait for the full duration. Improved control of expedited grace periods is planned, but one step at a time. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e7dbc01b61a1..73a1b6296224 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -42,6 +42,7 @@ struct srcu_array { struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; + atomic_t srcu_exp_cnt; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->srcu_cblist */ struct rcu_segcblist srcu_cblist; -- cgit v1.2.3 From d8be81735aa89413b333de488251f0e64e2be591 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Mar 2017 09:59:38 -0700 Subject: srcu: Create a tiny SRCU In response to automated complaints about modifications to SRCU increasing its size, this commit creates a tiny SRCU that is used in SMP=n && PREEMPT=n builds. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 69 +++++------------------------------- include/linux/srcutiny.h | 81 ++++++++++++++++++++++++++++++++++++++++++ include/linux/srcutree.h | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 61 deletions(-) create mode 100644 include/linux/srcutiny.h create mode 100644 include/linux/srcutree.h (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 73a1b6296224..907f09b14eda 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -34,28 +34,7 @@ #include #include -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; -}; - -struct srcu_struct { - unsigned long completed; - unsigned long srcu_gp_seq; - atomic_t srcu_exp_cnt; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist */ - struct rcu_segcblist srcu_cblist; - struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -}; - -/* Values for -> state variable. */ -#define SRCU_STATE_IDLE 0 -#define SRCU_STATE_SCAN1 1 -#define SRCU_STATE_SCAN2 2 +struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -77,42 +56,13 @@ int init_srcu_struct(struct srcu_struct *sp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -void process_srcu(struct work_struct *work); - -#define __SRCU_STRUCT_INIT(name) \ - { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ - __SRCU_DEP_MAP_INIT(name) \ - } - -/* - * Define and initialize a srcu struct at build time. - * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. 
- * - * Note that although DEFINE_STATIC_SRCU() hides the name from other - * files, the per-CPU variable rules nevertheless require that the - * chosen name be globally unique. These rules also prohibit use of - * DEFINE_STATIC_SRCU() within a function. If these rules are too - * restrictive, declare the srcu_struct manually. For example, in - * each file: - * - * static struct srcu_struct my_srcu; - * - * Then, before the first use of each my_srcu, manually initialize it: - * - * init_srcu_struct(&my_srcu); - * - * See include/linux/percpu-defs.h for the rules on per-CPU variables. - */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#ifdef CONFIG_TINY_SRCU +#include +#elif defined(CONFIG_TREE_SRCU) +#include +#else +#error "Unknown SRCU implementation specified to kernel configuration" +#endif /** * call_srcu() - Queue a callback for invocation after an SRCU grace period @@ -138,9 +88,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); -void synchronize_srcu_expedited(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h new file mode 100644 index 000000000000..4f284e4f4d8c --- /dev/null +++ b/include/linux/srcutiny.h @@ -0,0 +1,81 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TINY_H +#define _LINUX_SRCU_TINY_H + +#include + +struct srcu_struct { + int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + struct swait_queue_head srcu_wq; + /* Last srcu_read_unlock() wakes GP. */ + unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ + struct rcu_segcblist srcu_cblist; + /* Pending SRCU callbacks. */ + int srcu_idx; /* Current reader array element. */ + bool srcu_gp_running; /* GP workqueue running? */ + bool srcu_gp_waiting; /* GP waiting for readers? */ + struct work_struct srcu_work; /* For driving grace periods. 
*/ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void srcu_drive_gp(struct work_struct *wp); + +#define __SRCU_STRUCT_INIT(name) \ +{ \ + .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ + .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ + __SRCU_DEP_MAP_INIT(name) \ +} + +/* + * This odd _STATIC_ arrangement is needed for API compatibility with + * Tree SRCU, which needs some per-CPU data. + */ +#define DEFINE_SRCU(name) \ + struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_STATIC_SRCU(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name) + +void synchronize_srcu(struct srcu_struct *sp); + +static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +#endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h new file mode 100644 index 000000000000..f2b3bd6c6bc2 --- /dev/null +++ b/include/linux/srcutree.h @@ -0,0 +1,91 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tree variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TREE_H +#define _LINUX_SRCU_TREE_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct srcu_struct { + unsigned long completed; + unsigned long srcu_gp_seq; + atomic_t srcu_exp_cnt; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->srcu_cblist */ + struct rcu_segcblist srcu_cblist; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +/* Values for -> state variable. */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. 
If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. + */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif -- cgit v1.2.3 From dad81a2026841b5e2651aab58a7398c13cc05847 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Mar 2017 17:23:44 -0700 Subject: srcu: Introduce CLASSIC_SRCU Kconfig option The TREE_SRCU rewrite is large and a bit on the non-simple side, so this commit helps reduce risk by allowing the old v4.11 SRCU algorithm to be selected using a new CLASSIC_SRCU Kconfig option that depends on RCU_EXPERT. The default is to use the new TREE_SRCU and TINY_SRCU algorithms, in order to help get these the testing that they need. However, if your users do not require the update-side scalability that is to be provided by TREE_SRCU, select RCU_EXPERT and then CLASSIC_SRCU to revert back to the old classic SRCU algorithm. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 2 + include/linux/srcuclassic.h | 101 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 include/linux/srcuclassic.h (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 907f09b14eda..167ad8831aaf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -60,6 +60,8 @@ int init_srcu_struct(struct srcu_struct *sp); #include #elif defined(CONFIG_TREE_SRCU) #include +#elif defined(CONFIG_CLASSIC_SRCU) +#include #else #error "Unknown SRCU implementation specified to kernel configuration" #endif diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h new file mode 100644 index 000000000000..41cf99930f34 --- /dev/null +++ b/include/linux/srcuclassic.h @@ -0,0 +1,101 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * classic v4.11 variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_CLASSIC_H +#define _LINUX_SRCU_CLASSIC_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct rcu_batch { + struct rcu_head *head, **tail; +}; + +#define RCU_BATCH_INIT(name) { NULL, &(name.head) } + +struct srcu_struct { + unsigned long completed; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + bool running; + /* callbacks just queued */ + struct rcu_batch batch_queue; + /* callbacks try to do the first check_zero */ + struct rcu_batch batch_check0; + /* callbacks done with the first check_zero and the flip */ + struct rcu_batch batch_check1; + struct rcu_batch batch_done; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .running = false, \ + .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ + .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ + .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ + .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. + */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif -- cgit v1.2.3 From 5f0d5a3ae7cff0d7fa943c199c3a2e44f23e1fac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jan 2017 02:53:44 -0800 Subject: mm: Rename SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU A group of Linux kernel hackers reported chasing a bug that resulted from their assumption that SLAB_DESTROY_BY_RCU provided an existence guarantee, that is, that no block from such a slab would be reallocated during an RCU read-side critical section. Of course, that is not the case. Instead, SLAB_DESTROY_BY_RCU only prevents freeing of an entire slab of blocks. However, there is a phrase for this, namely "type safety". This commit therefore renames SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU in order to avoid future instances of this sort of confusion. Signed-off-by: Paul E. 
McKenney Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Acked-by: Johannes Weiner Acked-by: Vlastimil Babka [ paulmck: Add comments mentioning the old name, as requested by Eric Dumazet, in order to help people familiar with the old name find the new one. ] Acked-by: David Rientjes --- include/linux/dma-fence.h | 4 ++-- include/linux/slab.h | 6 ++++-- include/net/sock.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..a5195a7d6f77 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence) * * Function returns NULL if no refcount could be obtained, or the fence. * This function handles acquiring a reference to a fence that may be - * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), + * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU), * so long as the caller is using RCU on the pointer to the fence. * * An alternative mechanism is to employ a seqlock to protect a bunch of @@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) * have successfully acquire a reference to it. If it no * longer matches, we are holding a reference to some other * reallocated pointer. This is possible if the allocator - * is using a freelist like SLAB_DESTROY_BY_RCU where the + * is using a freelist like SLAB_TYPESAFE_BY_RCU where the * fence remains valid for the RCU grace period, but it * may be reallocated. When using such allocators, we are * responsible for ensuring the reference we get is to diff --git a/include/linux/slab.h b/include/linux/slab.h index 3c37a8c51921..04a7f7993e67 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -28,7 +28,7 @@ #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ /* - * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! + * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() @@ -61,8 +61,10 @@ * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. + * + * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ diff --git a/include/net/sock.h b/include/net/sock.h index 5e5997654db6..59cdccaa30e7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -993,7 +993,7 @@ struct smc_hashinfo; struct module; /* - * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) -- cgit v1.2.3 From 468d01bec544286bb5283f012b95b5b84636565b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 2 Feb 2017 11:40:15 -0800 Subject: types: Update obsolete callback_head comment The comment header for callback_head (and thus for rcu_head) states that the bottom two bits of a pointer to these structures must be zero. This is obsolete: The new requirement is that only the bottom bit need be zero. This commit therefore updates this comment. Signed-off-by: Paul E. McKenney --- include/linux/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/types.h b/include/linux/types.h index 1e7bd24848fc..258099a4ed82 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -209,7 +209,7 @@ struct ustat { * naturally due ABI requirements, but some architectures (like CRIS) have * weird ABI and we need to ask it explicitly. * - * The alignment is required to guarantee that bits 0 and 1 of @next will be + * The alignment is required to guarantee that bit 0 of @next will be * clear under normal conditions -- as long as we use call_rcu(), * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. * -- cgit v1.2.3 From 48ac34666ff76843d8743db1cc78b303759916f1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 27 Feb 2017 21:14:19 +0200 Subject: hlist_add_tail_rcu disable sparse warning sparse is unhappy about this code in hlist_add_tail_rcu: struct hlist_node *i, *last = NULL; for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) last = i; This is because hlist_first_rcu and hlist_next_rcu return __rcu pointers. It's a false positive - it's a write side primitive and so does not need to be called in a read side critical section. The following trivial patch disables the warning without changing the behaviour in any way. Note: __hlist_for_each_rcu would also remove the warning but it would be confusing since it calls rcu_dereference and is designed to run in the rcu read side critical section. Signed-off-by: Michael S. Tsirkin Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4f7a9561b8c4..b1fd8bf85fdc 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, { struct hlist_node *i, *last = NULL; - for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) + /* Note: write side code, so rcu accessors are not needed. */ + for (i = h->first; i; i = i->next) last = i; if (last) { -- cgit v1.2.3 From 6ade8694f471d847500c7cec152cc15171cef5d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Apr 2017 17:30:06 -0700 Subject: kvm: Move srcu_struct fields to end of struct kvm Parallelizing SRCU callback handling will increase the size of srcu_struct, which will move the kvm structure's kvm_arch field out of reach of powerpc's current assembly code, which will result in the following sort of build error: arch/powerpc/kvm/book3s_hv_rmhandlers.S:617: Error: operand out of range (0x000000000000b328 is not between 0xffffffffffff8000 and 0x0000000000007fff) This commit moves the srcu_struct fields in the kvm structure to follow the kvm_arch field, which will allow powerpc's assembly code to continue to be able to reach the kvm_arch field. Reported-by: Stephen Rothwell Reported-by: Michael Ellerman Reported-by: kbuild test robot Suggested-by: Paolo Bonzini Signed-off-by: Paul E.
McKenney Tested-by: Michael Ellerman Acked-by: Paolo Bonzini [ paulmck: Moved this commit to precede SRCU callback parallelization, and reworded the commit log into future tense, all in the name of bisectability. ] --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..96c8e29c6442 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -375,8 +375,6 @@ struct kvm { struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; - struct srcu_struct srcu; - struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; /* @@ -429,6 +427,8 @@ struct kvm { struct list_head devices; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; + struct srcu_struct srcu; + struct srcu_struct irq_srcu; }; #define kvm_err(fmt, ...) \ -- cgit v1.2.3 From da915ad5cf25b5f5d358dd3670c3378d8ae8c03e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Apr 2017 09:01:53 -0700 Subject: srcu: Parallelize callback handling Peter Zijlstra proposed using SRCU to reduce mmap_sem contention [1,2], however, there are workloads that could result in a high volume of concurrent invocations of call_srcu(), which with current SRCU would result in excessive lock contention on the srcu_struct structure's ->queue_lock, which protects SRCU's callback lists. This commit therefore moves SRCU to per-CPU callback lists, thus greatly reducing contention. Because a given SRCU instance no longer has a single centralized callback list, starting grace periods and invoking callbacks are both more complex than in the single-list Classic SRCU implementation. Starting grace periods and handling callbacks are now handled using an srcu_node tree that is in some ways similar to the rcu_node trees used by RCU-bh, RCU-preempt, and RCU-sched (for example, the srcu_node tree shape is controlled by exactly the same Kconfig options and boot parameters that control the shape of the rcu_node tree). In addition, the old per-CPU srcu_array structure is now named srcu_data and contains an rcu_segcblist structure named ->srcu_cblist for its callbacks (and a spinlock to protect this). The srcu_struct gets an srcu_gp_seq that is used to associate callback segments with the corresponding completion-time grace-period number. These completion-time grace-period numbers are propagated up the srcu_node tree so that the grace-period workqueue handler can determine whether additional grace periods are needed on the one hand and where to look for callbacks that are ready to be invoked. The srcu_barrier() function must now wait on all instances of the per-CPU ->srcu_cblist. Because each ->srcu_cblist is protected by ->lock, srcu_barrier() can remotely add the needed callbacks. In theory, it could also remotely start grace periods, but in practice doing so is complex and racy. And interestingly enough, it is never necessary for srcu_barrier() to start a grace period because srcu_barrier() only enqueues a callback when a callback is already present--and it turns out that a grace period has to have already been started for this pre-existing callback. Furthermore, it is only the callback that srcu_barrier() needs to wait on, not any particular grace period. 
Therefore, a new rcu_segcblist_entrain() function enqueues the srcu_barrier() function's callback into the same segment occupied by the last pre-existing callback in the list. The special case where all the pre-existing callbacks are on a different list (because they are in the process of being invoked) is handled by enqueuing srcu_barrier()'s callback into the RCU_DONE_TAIL segment, relying on the done-callbacks check that takes place after all callbacks are invoked. Note that the readers use the same algorithm as before. Note that there is a separate srcu_idx that tells the readers what counter to increment. This unfortunately cannot be combined with srcu_gp_seq because they need to be incremented at different times. This commit introduces some ugly #ifdefs in rcutorture. These will go away when I feel good enough about Tree SRCU to ditch Classic SRCU. Some crude performance comparisons, courtesy of a quickly hacked rcuperf asynchronous-grace-period capability:

Callback Queuing Overhead
-------------------------
    # CPUS    Classic SRCU    Tree SRCU
    ------    ------------    ---------
         2        0.349 us     0.342 us
        16        31.66 us       0.4 us
        41       ---------      0.417 us

The times are the 90th percentiles, a statistic that was chosen to reject the overheads of the occasional srcu_barrier() call needed to avoid OOMing the test machine. The rcuperf test hangs when running Classic SRCU at 41 CPUs, hence the line of dashes. Despite the hacks to both the rcuperf code and the statistics, this is a convincing demonstration of Tree SRCU's performance and scalability advantages. [1] https://lwn.net/Articles/309030/ [2] https://patchwork.kernel.org/patch/5108281/ Signed-off-by: Paul E. McKenney [ paulmck: Fix initialization if synchronize_srcu_expedited() called first. ] --- include/linux/rcu_segcblist.h | 42 ++++++++++++++++++++--- include/linux/srcutree.h | 80 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 102 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 74b1e7243955..ced8f313fd05 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -401,6 +401,37 @@ static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; } +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + /* * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure.
This function @@ -537,7 +568,8 @@ static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; /* * Find all callbacks whose ->gp_seq numbers indicate that they @@ -582,8 +614,9 @@ static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, * them to complete at the end of the earlier grace period. * * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number at which new callbacks would become - * ready to invoke. + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. */ static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) @@ -591,7 +624,8 @@ static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, int i; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; /* * Find the segment preceding the oldest segment of callbacks diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index f2b3bd6c6bc2..0400e211aa44 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -24,25 +24,75 @@ #ifndef _LINUX_SRCU_TREE_H #define _LINUX_SRCU_TREE_H -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; +#include +#include + +struct srcu_node; +struct srcu_struct; + +/* + * Per-CPU structure feeding into leaf srcu_node, similar in function + * to rcu_node. + */ +struct srcu_data { + /* Read-side state. */ + unsigned long srcu_lock_count[2]; /* Locks per CPU. */ + unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ + + /* Update-side state. */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ + unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + bool srcu_cblist_invoking; /* Invoking these CBs? */ + struct delayed_work work; /* Context for CB invoking. */ + struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct srcu_node *mynode; /* Leaf srcu_node. */ + int cpu; + struct srcu_struct *sp; }; +/* + * Node in SRCU combining tree, similar in function to rcu_data. + */ +struct srcu_node { + spinlock_t lock; + unsigned long srcu_have_cbs[4]; /* GP seq for children */ + /* having CBs, but only */ + /* is > ->srcu_gq_seq. */ + struct srcu_node *srcu_parent; /* Next up in tree. */ + int grplo; /* Least CPU for node. */ + int grphi; /* Biggest CPU for node. */ +}; + +/* + * Per-SRCU-domain structure, similar in function to rcu_state. + */ struct srcu_struct { - unsigned long completed; - unsigned long srcu_gp_seq; - atomic_t srcu_exp_cnt; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist */ - struct rcu_segcblist srcu_cblist; + struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *level[RCU_NUM_LVLS + 1]; + /* First node at each level. */ + struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ + spinlock_t gp_lock; /* protect ->srcu_cblist */ + struct mutex srcu_gp_mutex; /* Serialize GP work. */ + unsigned int srcu_idx; /* Current rdr array element. */ + unsigned long srcu_gp_seq; /* Grace-period seq #. 
*/ + unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ + atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ + struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ + struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ + struct completion srcu_barrier_completion; + /* Awaken barrier rq at end. */ + atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ + /* callback for the barrier */ + /* operation. */ struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; -/* Values for -> state variable. */ +/* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 @@ -51,11 +101,9 @@ void process_srcu(struct work_struct *work); #define __SRCU_STRUCT_INIT(name) \ { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + .sda = &name##_srcu_data, \ + .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ + .srcu_gp_seq_needed = 0 - 1, \ __SRCU_DEP_MAP_INIT(name) \ } @@ -79,7 +127,7 @@ void process_srcu(struct work_struct *work); * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) -- cgit v1.2.3 From bcbfdd01dce5556a952fae84ef16fd0f12525e7b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Apr 2017 15:50:41 -0700 Subject: rcu: Make non-preemptive schedule be Tasks RCU quiescent state Currently, a call to schedule() acts as a Tasks RCU quiescent state only if a context switch actually takes place. However, just the call to schedule() guarantees that the calling task has moved off of whatever tracing trampoline that it might have been one previously. This commit therefore plumbs schedule()'s "preempt" parameter into rcu_note_context_switch(), which then records the Tasks RCU quiescent state, but only if this call to schedule() was -not- due to a preemption. To avoid adding overhead to the common-case context-switch path, this commit hides the rcu_note_context_switch() check under an existing non-common-case check. Suggested-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 11 ++++++++--- include/linux/rcutiny.h | 13 +++++++++---- include/linux/rcutree.h | 5 +++-- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e6146d0074f8..f531b29207da 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -363,15 +363,20 @@ static inline void rcu_init_nohz(void) #ifdef CONFIG_TASKS_RCU #define TASKS_RCU(x) x extern struct srcu_struct tasks_rcu_exit_srcu; -#define rcu_note_voluntary_context_switch(t) \ +#define rcu_note_voluntary_context_switch_lite(t) \ do { \ - rcu_all_qs(); \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) +#define rcu_note_voluntary_context_switch(t) \ + do { \ + rcu_all_qs(); \ + rcu_note_voluntary_context_switch_lite(t); \ + } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define rcu_note_voluntary_context_switch_lite(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5219be250f00..74d9c3a1feee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } -static inline void rcu_note_context_switch(void) -{ - rcu_sched_qs(); -} +#define rcu_note_context_switch(preempt) \ + do { \ + rcu_sched_qs(); \ + rcu_note_voluntary_context_switch_lite(current); \ + } while (0) /* * Take advantage of the fact that there is only one CPU, which @@ -242,6 +243,10 @@ static inline bool rcu_is_watching(void) #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_request_urgent_qs_task(struct task_struct *t) +{ +} + static inline void rcu_all_qs(void) { barrier(); /* Avoid RCU read-side critical sections leaking across. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 63a4e4cf40a5..0bacb6b2af69 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,7 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -void rcu_note_context_switch(void); +void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(u64 basem, u64 *nextevt); void rcu_cpu_stall_reset(void); @@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void); */ static inline void rcu_virt_note_context_switch(int cpu) { - rcu_note_context_switch(); + rcu_note_context_switch(false); } void synchronize_rcu_bh(void); @@ -108,6 +108,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_request_urgent_qs_task(struct task_struct *t); void rcu_all_qs(void); -- cgit v1.2.3
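
For reference, all three SRCU implementations selectable above (CLASSIC_SRCU, TINY_SRCU, and TREE_SRCU) export the same reader and updater API, so the DEFINE_STATIC_SRCU() machinery that appears repeatedly in these patches is exercised the same way regardless of which Kconfig option is chosen. The following is a minimal usage sketch and is not taken from any of the patches in this series: the names foo, global_foo, foo_mutex, read_foo(), and update_foo() are hypothetical, and only the SRCU/RCU primitives themselves are existing kernel APIs.

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/srcu.h>

/* Hypothetical SRCU-protected structure. */
struct foo {
        int value;
};

/* Statically allocated SRCU domain; no init_srcu_struct() call needed. */
DEFINE_STATIC_SRCU(my_srcu);

static struct foo __rcu *global_foo;
static DEFINE_MUTEX(foo_mutex);         /* Serializes updaters. */

/* Reader: may block inside the SRCU read-side critical section. */
static int read_foo(void)
{
        struct foo *p;
        int idx, val = -1;

        idx = srcu_read_lock(&my_srcu);
        p = srcu_dereference(global_foo, &my_srcu);
        if (p)
                val = p->value;
        srcu_read_unlock(&my_srcu, idx);
        return val;
}

/* Updater: publish a new version, wait for readers, then free the old one. */
static int update_foo(int value)
{
        struct foo *newp, *oldp;

        newp = kmalloc(sizeof(*newp), GFP_KERNEL);
        if (!newp)
                return -ENOMEM;
        newp->value = value;

        mutex_lock(&foo_mutex);
        oldp = rcu_dereference_protected(global_foo,
                                         lockdep_is_held(&foo_mutex));
        rcu_assign_pointer(global_foo, newp);
        mutex_unlock(&foo_mutex);

        synchronize_srcu(&my_srcu);     /* Wait for pre-existing readers. */
        kfree(oldp);
        return 0;
}

An updater that cannot block could instead hand the old structure to call_srcu(); under the Tree SRCU implementation introduced above, such callbacks are queued on the per-CPU ->srcu_cblist and invoked once the grace period recorded via ->srcu_gp_seq has completed.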