34 files changed, 1704 insertions, 744 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index da1525ec4c87..d819dc77fe65 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 	unsigned long flags;
 	int i;
 
+	spin_lock_irqsave(&bfqd->lock, flags);
+
 	if (!entity) /* root group */
-		return;
+		goto put_async_queues;
 
-	spin_lock_irqsave(&bfqd->lock, flags);
 	/*
 	 * Empty all service_trees belonging to this group before
 	 * deactivating the group itself.
@@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 	}
 
 	__bfq_deactivate_entity(entity, false);
+
+put_async_queues:
 	bfq_put_async_queues(bfqd, bfqg);
 
 	spin_unlock_irqrestore(&bfqd->lock, flags);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index bcb6d21baf12..aeca22d91101 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10;
 /* Default timeout values, in jiffies, approximating CFQ defaults. */
 const int bfq_timeout = HZ / 8;
 
+/*
+ * Time limit for merging (see comments in bfq_setup_cooperator). Set
+ * to the slowest value that, in our tests, proved to be effective in
+ * removing false positives, while not causing true positives to miss
+ * queue merging.
+ *
+ * As can be deduced from the low time limit below, queue merging, if
+ * successful, happens at the very beginning of the I/O of the involved
+ * cooperating processes, as a consequence of the arrival of the very
+ * first requests from each cooperator.  After that, there is very
+ * little chance to find cooperators.
+ */
+static const unsigned long bfq_merge_time_limit = HZ/10;
+
 static struct kmem_cache *bfq_pool;
 
 /* Below this threshold (in ns), we consider thinktime immediate. */
@@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool;
 #define BFQQ_SEEK_THR		(sector_t)(8 * 100)
 #define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
 #define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
-#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)
+#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)
 
 /* Min number of samples required to perform peak-rate update */
 #define BFQ_RATE_MIN_SAMPLES	32
@@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool;
  * interactive applications automatically, using the following formula:
  * duration = (R / r) * T, where r is the peak rate of the device, and
  * R and T are two reference parameters.
- * In particular, R is the peak rate of the reference device (see below),
- * and T is a reference time: given the systems that are likely to be
- * installed on the reference device according to its speed class, T is
- * about the maximum time needed, under BFQ and while reading two files in
- * parallel, to load typical large applications on these systems.
- * In practice, the slower/faster the device at hand is, the more/less it
- * takes to load applications with respect to the reference device.
- * Accordingly, the longer/shorter BFQ grants weight raising to interactive
- * applications.
+ * In particular, R is the peak rate of the reference device (see
+ * below), and T is a reference time: given the systems that are
+ * likely to be installed on the reference device according to its
+ * speed class, T is about the maximum time needed, under BFQ and
+ * while reading two files in parallel, to load typical large
+ * applications on these systems (see the comments on
+ * max_service_from_wr below, for more details on how T is obtained).
+ * In practice, the slower/faster the device at hand is, the more/less
+ * it takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to
+ * interactive applications.
  *
  * BFQ uses four different reference pairs (R, T), depending on:
  * . whether the device is rotational or non-rotational;
@@ -240,6 +256,60 @@ static int T_slow[2];
 static int T_fast[2];
 static int device_speed_thresh[2];
 
+/*
+ * BFQ uses the above-detailed, time-based weight-raising mechanism to
+ * privilege interactive tasks. This mechanism is vulnerable to the
+ * following false positives: I/O-bound applications that will go on
+ * doing I/O for much longer than the duration of weight
+ * raising. These applications have basically no benefit from being
+ * weight-raised at the beginning of their I/O. On the opposite end,
+ * while being weight-raised, these applications
+ * a) unjustly steal throughput from applications that may actually need
+ * low latency;
+ * b) make BFQ uselessly perform device idling; device idling results
+ * in loss of device throughput with most flash-based storage, and may
+ * increase latencies when used purposelessly.
+ *
+ * BFQ tries to reduce these problems, by adopting the following
+ * countermeasure. To introduce this countermeasure, we need first to
+ * finish explaining how the duration of weight-raising for
+ * interactive tasks is computed.
+ *
+ * For a bfq_queue deemed as interactive, the duration of weight
+ * raising is dynamically adjusted, as a function of the estimated
+ * peak rate of the device, so as to be equal to the time needed to
+ * execute the 'largest' interactive task we benchmarked so far. By
+ * largest task, we mean the task for which each involved process has
+ * to do more I/O than for any of the other tasks we benchmarked. This
+ * reference interactive task is the start-up of LibreOffice Writer,
+ * and in this task each process/bfq_queue needs to have at most ~110K
+ * sectors transferred.
+ *
+ * This last piece of information enables BFQ to reduce the actual
+ * duration of weight-raising for at least one class of I/O-bound
+ * applications: those doing sequential or quasi-sequential I/O. An
+ * example is file copy. In fact, once started, the main I/O-bound
+ * processes of these applications usually consume the above 110K
+ * sectors in much less time than the processes of an application that
+ * is starting, because these I/O-bound processes will greedily devote
+ * almost all their CPU cycles only to their target,
+ * throughput-friendly I/O operations. This is even more true if BFQ
+ * happens to be underestimating the device peak rate, and thus
+ * overestimating the duration of weight raising. But, according to
+ * our measurements, once transferred 110K sectors, these processes
+ * have no right to be weight-raised any longer.
+ *
+ * Based on the last consideration, BFQ ends weight-raising for a
+ * bfq_queue if the latter happens to have received an amount of
+ * service at least equal to the following constant. The constant is
+ * set to slightly more than 110K, to have a minimum safety margin.
+ *
+ * This early ending of weight-raising reduces the amount of time
+ * during which interactive false positives cause the two problems
+ * described at the beginning of these comments.
+ */
+static const unsigned long max_service_from_wr = 120000;
+
 #define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])
 #define RQ_BFQQ(rq)		((rq)->elv.priv[1])
 
@@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
 	}
 }
 
+/*
+ * Cache bt->sb.shift in bfqd->sb_shift and refill bfqd->word_depths;
+ * see the comments on bfq_limit_depth for the purpose of these depths.
+ */
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
+{
+	bfqd->sb_shift = bt->sb.shift;
+
+	/*
+	 * In-word depths if no bfq_queue is being weight-raised:
+	 * leaving 25% of tags only for sync reads.
+	 *
+	 * In the next formulas, right-shift the value
+	 * (1U<<bfqd->sb_shift), instead of computing directly
+	 * (1U<<(bfqd->sb_shift - something)), to be robust against
+	 * any possible value of bfqd->sb_shift, without having to
+	 * limit 'something'.
+	 */
+	/* no more than 50% of tags for async I/O */
+	bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
+	/*
+	 * no more than 75% of tags for sync writes (25% extra tags
+	 * w.r.t. async I/O, to prevent async I/O from starving sync
+	 * writes)
+	 */
+	bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
+
+	/*
+	 * In-word depths in case some bfq_queue is being weight-
+	 * raised: leaving ~63% of tags for sync reads. This is the
+	 * highest percentage for which, in our tests, application
+	 * start-up times didn't suffer from any regression due to tag
+	 * shortage.
+	 */
+	/* no more than ~18% of tags for async I/O */
+	bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
+	/* no more than ~37% of tags for sync writes (~20% extra tags) */
+	bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
+}
+
+/*
+ * Async I/O can easily starve sync I/O (both sync reads and sync
+ * writes), by consuming all tags. Similarly, storms of sync writes,
+ * such as those that sync(2) may trigger, can starve sync reads.
+ * Limit depths of async I/O and sync writes, via data->shallow_depth,
+ * so as to counter both problems; sync reads are not limited here.
+ */
+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+{
+	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+	struct bfq_data *bfqd = data->q->elevator->elevator_data;
+	struct sbitmap_queue *bt;
+
+	if (op_is_sync(op) && !op_is_write(op))
+		return;
+
+	if (data->flags & BLK_MQ_REQ_RESERVED) {
+		if (unlikely(!tags->nr_reserved_tags)) {
+			WARN_ON_ONCE(1);
+			return;
+		}
+		bt = &tags->breserved_tags;
+	} else
+		bt = &tags->bitmap_tags;
+
+	if (unlikely(bfqd->sb_shift != bt->sb.shift))
+		bfq_update_depths(bfqd, bt);
+
+	data->shallow_depth =
+		bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+
+	bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
+			__func__, bfqd->wr_busy_queues, op_is_sync(op),
+			data->shallow_depth);
+}
+
 static struct bfq_queue *
 bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
 		     sector_t sector, struct rb_node **ret_parent,
@@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
 	return bfqq;
 }
 
+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
+{
+	return bfqq->service_from_backlogged > 0 &&
+		time_is_before_jiffies(bfqq->first_IO_time +
+				       bfq_merge_time_limit);
+}
+
 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	struct rb_node **p, *parent;
@@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		bfqq->pos_root = NULL;
 	}
 
+	/*
+	 * bfqq cannot be merged any longer (see comments in
+	 * bfq_setup_cooperator): no point in adding bfqq into the
+	 * position tree.
+	 */
+	if (bfq_too_late_for_merging(bfqq))
+		return;
+
 	if (bfq_class_idle(bfqq))
 		return;
 	if (!bfqq->next_rq)
@@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
 	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
 		/* start a weight-raising period */
 		if (interactive) {
+			bfqq->service_from_wr = 0;
 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
 		} else {
@@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q,
 			rb_erase(&bfqq->pos_node, bfqq->pos_root);
 			bfqq->pos_root = NULL;
 		}
+	} else {
+		bfq_pos_tree_add_move(bfqd, bfqq);
 	}
 
 	if (rq->cmd_flags & REQ_META)
@@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
 static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
 					struct bfq_queue *new_bfqq)
 {
+	if (bfq_too_late_for_merging(new_bfqq))
+		return false;
+
 	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
 	    (bfqq->ioprio_class != new_bfqq->ioprio_class))
 		return false;
@@ -1957,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
 }
 
 /*
- * If this function returns true, then bfqq cannot be merged. The idea
- * is that true cooperation happens very early after processes start
- * to do I/O. Usually, late cooperations are just accidental false
- * positives. In case bfqq is weight-raised, such false positives
- * would evidently degrade latency guarantees for bfqq.
- */
-static bool wr_from_too_long(struct bfq_queue *bfqq)
-{
-	return bfqq->wr_coeff > 1 &&
-		time_is_before_jiffies(bfqq->last_wr_start_finish +
-				       msecs_to_jiffies(100));
-}
-
-/*
  * Attempt to schedule a merge of bfqq with the currently in-service
  * queue or with a close queue among the scheduled queues.  Return
  * NULL if no merge was scheduled, a pointer to the shared bfq_queue
@@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq)
  * to maintain. Besides, in such a critical condition as an out of memory,
  * the benefits of queue merging may be little relevant, or even negligible.
  *
- * Weight-raised queues can be merged only if their weight-raising
- * period has just started. In fact cooperating processes are usually
- * started together. Thus, with this filter we avoid false positives
- * that would jeopardize low-latency guarantees.
- *
  * WARNING: queue merging may impair fairness among non-weight raised
  * queues, for at least two reasons: 1) the original weight of a
  * merged queue may change during the merged state, 2) even being the
@@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 {
 	struct bfq_queue *in_service_bfqq, *new_bfqq;
 
+	/*
+	 * Prevent bfqq from being merged if it has been created too
+	 * long ago. The idea is that true cooperating processes, and
+	 * thus their associated bfq_queues, are supposed to be
+	 * created shortly after each other. This is the case, e.g.,
+	 * for KVM/QEMU and dump I/O threads. Based on this
+	 * assumption, the following filtering greatly reduces the
+	 * probability that two non-cooperating processes, which just
+	 * happen to do close I/O for some short time interval, have
+	 * their queues merged by mistake.
+	 */
+	if (bfq_too_late_for_merging(bfqq))
+		return NULL;
+
 	if (bfqq->new_bfqq)
 		return bfqq->new_bfqq;
 
-	if (!io_struct ||
-	    wr_from_too_long(bfqq) ||
-	    unlikely(bfqq == &bfqd->oom_bfqq))
+	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
 		return NULL;
 
 	/* If there is only one backlogged queue, don't search. */
@@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	in_service_bfqq = bfqd->in_service_queue;
 
-	if (!in_service_bfqq || in_service_bfqq == bfqq
-	    || wr_from_too_long(in_service_bfqq) ||
-	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
-		goto check_scheduled;
-
-	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+	if (in_service_bfqq && in_service_bfqq != bfqq &&
+	    likely(in_service_bfqq != &bfqd->oom_bfqq) &&
+	    bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
 	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
 	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
 		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
@@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	 * queues. The only thing we need is that the bio/request is not
 	 * NULL, as we need it to establish whether a cooperator exists.
 	 */
-check_scheduled:
 	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
 			bfq_io_struct_pos(io_struct, request));
 
-	if (new_bfqq && !wr_from_too_long(new_bfqq) &&
-	    likely(new_bfqq != &bfqd->oom_bfqq) &&
+	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
 	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
 		return bfq_setup_merge(bfqq, new_bfqq);
 
@@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
 	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
 	if (unlikely(bfq_bfqq_just_created(bfqq) &&
-		     !bfq_bfqq_in_large_burst(bfqq))) {
+		     !bfq_bfqq_in_large_burst(bfqq) &&
+		     bfqq->bfqd->low_latency)) {
 		/*
 		 * bfqq being merged right after being created: bfqq
 		 * would have deserved interactive weight raising, but
@@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  * whereas soft_rt_next_start is set to infinity for applications that do
  * not.
  *
- * Unfortunately, even a greedy application may happen to behave in an
- * isochronous way if the CPU load is high. In fact, the application may
- * stop issuing requests while the CPUs are busy serving other processes,
- * then restart, then stop again for a while, and so on. In addition, if
- * the disk achieves a low enough throughput with the request pattern
- * issued by the application (e.g., because the request pattern is random
- * and/or the device is slow), then the application may meet the above
- * bandwidth requirement too. To prevent such a greedy application to be
- * deemed as soft real-time, a further rule is used in the computation of
- * soft_rt_next_start: soft_rt_next_start must be higher than the current
- * time plus the maximum time for which the arrival of a request is waited
- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
- * This filters out greedy applications, as the latter issue instead their
- * next request as soon as possible after the last one has been completed
- * (in contrast, when a batch of requests is completed, a soft real-time
- * application spends some time processing data).
+ * Unfortunately, even a greedy (i.e., I/O-bound) application may
+ * happen to meet, occasionally or systematically, both the above
+ * bandwidth and isochrony requirements. This may happen at least in
+ * the following circumstances. First, if the CPU load is high. The
+ * application may stop issuing requests while the CPUs are busy
+ * serving other processes, then restart, then stop again for a while,
+ * and so on. The other circumstances are related to the storage
+ * device: the storage device is highly loaded or reaches a low-enough
+ * throughput with the I/O of the application (e.g., because the I/O
+ * is random and/or the device is slow). In all these cases, the
+ * I/O of the application may be simply slowed down enough to meet
+ * the bandwidth and isochrony requirements. To reduce the probability
+ * that greedy applications are deemed as soft real-time in these
+ * corner cases, a further rule is used in the computation of
+ * soft_rt_next_start: the return value of this function is forced to
+ * be higher than the maximum between the following two quantities.
  *
- * Unfortunately, the last filter may easily generate false positives if
- * only bfqd->bfq_slice_idle is used as a reference time interval and one
- * or both the following cases occur:
- * 1) HZ is so low that the duration of a jiffy is comparable to or higher
- *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
- *    HZ=100.
+ * (a) Current time plus: (1) the maximum time for which the arrival
+ *     of a request is waited for when a sync queue becomes idle,
+ *     namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We
+ *     postpone for a moment the reason for adding a few extra
+ *     jiffies; we get back to it after next item (b).  Lower-bounding
+ *     the return value of this function with the current time plus
+ *     bfqd->bfq_slice_idle tends to filter out greedy applications,
+ *     because the latter issue their next request as soon as possible
+ *     after the last one has been completed. In contrast, a soft
+ *     real-time application spends some time processing data, after a
+ *     batch of its requests has been completed.
+ *
+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out
+ *     above, greedy applications may happen to meet both the
+ *     bandwidth and isochrony requirements under heavy CPU or
+ *     storage-device load. In more detail, in these scenarios, these
+ *     applications happen, only for limited time periods, to do I/O
+ *     slowly enough to meet all the requirements described so far,
+ *     including the filtering in above item (a). These slow-speed
+ *     time intervals are usually interspersed between other time
+ *     intervals during which these applications do I/O at a very high
+ *     speed. Fortunately, exactly because of the high speed of the
+ *     I/O in the high-speed intervals, the values returned by this
+ *     function happen to be so high, near the end of any such
+ *     high-speed interval, as to be likely to fall *after* the end of
+ *     the low-speed time interval that follows. These high values are
+ *     stored in bfqq->soft_rt_next_start after each invocation of
+ *     this function. As a consequence, if the last value of
+ *     bfqq->soft_rt_next_start is constantly used to lower-bound the
+ *     next value that this function may return, then, from the very
+ *     beginning of a low-speed interval, bfqq->soft_rt_next_start is
+ *     likely to be constantly kept so high that any I/O request
+ *     issued during the low-speed interval is considered as arriving
+ *     too soon for the application to be deemed as soft
+ *     real-time. Then, in the high-speed interval that follows, the
+ *     application will not be deemed as soft real-time, just because
+ *     it will do I/O at a high speed. And so on.
+ *
+ * Getting back to the filtering in item (a), in the following two
+ * cases this filtering might be easily passed by a greedy
+ * application, if the reference quantity was just
+ * bfqd->bfq_slice_idle:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or
+ *    higher than bfqd->bfq_slice_idle. This happens, e.g., on slow
+ *    devices with HZ=100. The time granularity may be so coarse
+ *    that the approximation, in jiffies, of bfqd->bfq_slice_idle
+ *    is rather lower than the exact value.
  * 2) jiffies, instead of increasing at a constant rate, may stop increasing
  *    for a while, then suddenly 'jump' by several units to recover the lost
  *    increments. This seems to happen, e.g., inside virtual machines.
- * To address this issue, we do not use as a reference time interval just
- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
- * particular we add the minimum number of jiffies for which the filter
- * seems to be quite precise also in embedded systems and KVM/QEMU virtual
- * machines.
+ * To address this issue, in the filtering in (a) we do not use as a
+ * reference time interval just bfqd->bfq_slice_idle, but
+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the
+ * minimum number of jiffies for which the filter seems to be quite
+ * precise also in embedded systems and KVM/QEMU virtual machines.
  */
 static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
 						struct bfq_queue *bfqq)
 {
-	return max(bfqq->last_idle_bklogged +
-		   HZ * bfqq->service_from_backlogged /
-		   bfqd->bfq_wr_max_softrt_rate,
-		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
+	return max3(bfqq->soft_rt_next_start,
+		    bfqq->last_idle_bklogged +
+		    HZ * bfqq->service_from_backlogged /
+		    bfqd->bfq_wr_max_softrt_rate,
+		    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
 }
 
 /**
@@ -3000,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
 
 	/*
-	 * Increase service_from_backlogged before next statement,
-	 * because the possible next invocation of
-	 * bfq_bfqq_charge_time would likely inflate
-	 * entity->service. In contrast, service_from_backlogged must
-	 * contain real service, to enable the soft real-time
-	 * heuristic to correctly compute the bandwidth consumed by
-	 * bfqq.
-	 */
-	bfqq->service_from_backlogged += entity->service;
-
-	/*
 	 * As above explained, charge slow (typically seeky) and
 	 * timed-out queues with the time and not the service
 	 * received, to favor sequential workloads.
@@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 				bfqq->entity.prio_changed = 1;
 			}
 		}
+		if (bfqq->wr_coeff > 1 &&
+		    bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
+		    bfqq->service_from_wr > max_service_from_wr) {
+			/* see comments on max_service_from_wr */
+			bfq_bfqq_end_wr(bfqq);
+		}
 	}
 	/*
 	 * To improve latency (for this or other queues), immediately
@@ -3630,20 +3823,22 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		}
 
 		/*
-		 * We exploit the put_rq_private hook to decrement
-		 * rq_in_driver, but put_rq_private will not be
-		 * invoked on this request. So, to avoid unbalance,
-		 * just start this request, without incrementing
-		 * rq_in_driver. As a negative consequence,
-		 * rq_in_driver is deceptively lower than it should be
-		 * while this request is in service. This may cause
-		 * bfq_schedule_dispatch to be invoked uselessly.
+		 * We exploit the bfq_finish_requeue_request hook to
+		 * decrement rq_in_driver, but
+		 * bfq_finish_requeue_request will not be invoked on
+		 * this request. So, to avoid unbalance, just start
+		 * this request, without incrementing rq_in_driver. As
+		 * a negative consequence, rq_in_driver is deceptively
+		 * lower than it should be while this request is in
+		 * service. This may cause bfq_schedule_dispatch to be
+		 * invoked uselessly.
 		 *
 		 * As for implementing an exact solution, the
-		 * put_request hook, if defined, is probably invoked
-		 * also on this request. So, by exploiting this hook,
-		 * we could 1) increment rq_in_driver here, and 2)
-		 * decrement it in put_request. Such a solution would
+		 * bfq_finish_requeue_request hook, if defined, is
+		 * probably invoked also on this request. So, by
+		 * exploiting this hook, we could 1) increment
+		 * rq_in_driver here, and 2) decrement it in
+		 * bfq_finish_requeue_request. Such a solution would
 		 * let the value of the counter be always accurate,
 		 * but it would entail using an extra interface
 		 * function. This cost seems higher than the benefit,
@@ -3689,35 +3884,16 @@ exit:
 	return rq;
 }
 
-static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
-{
-	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
-	struct request *rq;
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-	struct bfq_queue *in_serv_queue, *bfqq;
-	bool waiting_rq, idle_timer_disabled;
-#endif
-
-	spin_lock_irq(&bfqd->lock);
-
 #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-	in_serv_queue = bfqd->in_service_queue;
-	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
-
-	rq = __bfq_dispatch_request(hctx);
-
-	idle_timer_disabled =
-		waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
-
-#else
-	rq = __bfq_dispatch_request(hctx);
-#endif
-	spin_unlock_irq(&bfqd->lock);
+static void bfq_update_dispatch_stats(struct request_queue *q,
+				      struct request *rq,
+				      struct bfq_queue *in_serv_queue,
+				      bool idle_timer_disabled)
+{
+	struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL;
 
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-	bfqq = rq ? RQ_BFQQ(rq) : NULL;
 	if (!idle_timer_disabled && !bfqq)
-		return rq;
+		return;
 
 	/*
 	 * rq and bfqq are guaranteed to exist until this function
@@ -3732,7 +3908,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	 * In addition, the following queue lock guarantees that
 	 * bfqq_group(bfqq) exists as well.
 	 */
-	spin_lock_irq(hctx->queue->queue_lock);
+	spin_lock_irq(q->queue_lock);
 	if (idle_timer_disabled)
 		/*
 		 * Since the idle timer has been disabled,
@@ -3751,9 +3927,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		bfqg_stats_set_start_empty_time(bfqg);
 		bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
 	}
-	spin_unlock_irq(hctx->queue->queue_lock);
+	spin_unlock_irq(q->queue_lock);
+}
+#else
+static inline void bfq_update_dispatch_stats(struct request_queue *q,
+					     struct request *rq,
+					     struct bfq_queue *in_serv_queue,
+					     bool idle_timer_disabled) {}
 #endif
 
+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+	struct request *rq;
+	struct bfq_queue *in_serv_queue;
+	bool waiting_rq, idle_timer_disabled;
+
+	spin_lock_irq(&bfqd->lock);
+
+	in_serv_queue = bfqd->in_service_queue;
+	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
+
+	rq = __bfq_dispatch_request(hctx);
+
+	idle_timer_disabled =
+		waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+
+	spin_unlock_irq(&bfqd->lock);
+
+	bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
+				  idle_timer_disabled);
+
 	return rq;
 }
 
@@ -4002,10 +4206,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqq->split_time = bfq_smallest_from_now();
 
 	/*
-	 * Set to the value for which bfqq will not be deemed as
-	 * soft rt when it becomes backlogged.
+	 * To not forget the possibly high bandwidth consumed by a
+	 * process/queue in the recent past,
+	 * bfq_bfqq_softrt_next_start() returns a value at least equal
+	 * to the current value of bfqq->soft_rt_next_start (see
+	 * comments on bfq_bfqq_softrt_next_start).  Set
+	 * soft_rt_next_start to now, to mean that bfqq has consumed
+	 * no bandwidth so far.
 	 */
-	bfqq->soft_rt_next_start = bfq_greatest_from_now();
+	bfqq->soft_rt_next_start = jiffies;
 
 	/* first request is almost certainly seeky */
 	bfqq->seek_history = 1;
@@ -4276,16 +4485,48 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
 	return idle_timer_disabled;
 }
 
+#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+static void bfq_update_insert_stats(struct request_queue *q,
+				    struct bfq_queue *bfqq,
+				    bool idle_timer_disabled,
+				    unsigned int cmd_flags)
+{
+	if (!bfqq)
+		return;
+
+	/*
+	 * bfqq still exists, because it can disappear only after
+	 * either it is merged with another queue, or the process it
+	 * is associated with exits. But both actions must be taken by
+	 * the same process currently executing this flow of
+	 * instructions.
+	 *
+	 * In addition, the following queue lock guarantees that
+	 * bfqq_group(bfqq) exists as well.
+	 */
+	spin_lock_irq(q->queue_lock);
+	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
+	if (idle_timer_disabled)
+		bfqg_stats_update_idle_time(bfqq_group(bfqq));
+	spin_unlock_irq(q->queue_lock);
+}
+#else
+static inline void bfq_update_insert_stats(struct request_queue *q,
+					   struct bfq_queue *bfqq,
+					   bool idle_timer_disabled,
+					   unsigned int cmd_flags) {}
+#endif
+
+static void bfq_prepare_request(struct request *rq, struct bio *bio);
+
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       bool at_head)
 {
 	struct request_queue *q = hctx->queue;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	bool idle_timer_disabled = false;
 	unsigned int cmd_flags;
-#endif
 
 	spin_lock_irq(&bfqd->lock);
 	if (blk_mq_sched_try_insert_merge(q, rq)) {
@@ -4304,7 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		else
 			list_add_tail(&rq->queuelist, &bfqd->dispatch);
 	} else {
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+		if (WARN_ON_ONCE(!bfqq)) {
+			/*
+			 * This should never happen. Most likely rq is
+			 * a requeued regular request, being
+			 * re-inserted without being first
+			 * re-prepared. Do a prepare, to avoid
+			 * failure.
+			 */
+			bfq_prepare_request(rq, rq->bio);
+			bfqq = RQ_BFQQ(rq);
+		}
+
 		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 		/*
 		 * Update bfqq, because, if a queue merge has occurred
@@ -4312,9 +4564,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		 * redirected into a new queue.
 		 */
 		bfqq = RQ_BFQQ(rq);
-#else
-		__bfq_insert_request(bfqd, rq);
-#endif
 
 		if (rq_mergeable(rq)) {
 			elv_rqhash_add(q, rq);
@@ -4323,35 +4572,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		}
 	}
 
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 	/*
 	 * Cache cmd_flags before releasing scheduler lock, because rq
 	 * may disappear afterwards (for example, because of a request
 	 * merge).
 	 */
 	cmd_flags = rq->cmd_flags;
-#endif
+
 	spin_unlock_irq(&bfqd->lock);
 
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-	if (!bfqq)
-		return;
-	/*
-	 * bfqq still exists, because it can disappear only after
-	 * either it is merged with another queue, or the process it
-	 * is associated with exits. But both actions must be taken by
-	 * the same process currently executing this flow of
-	 * instruction.
-	 *
-	 * In addition, the following queue lock guarantees that
-	 * bfqq_group(bfqq) exists as well.
-	 */
-	spin_lock_irq(q->queue_lock);
-	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
-	if (idle_timer_disabled)
-		bfqg_stats_update_idle_time(bfqq_group(bfqq));
-	spin_unlock_irq(q->queue_lock);
-#endif
+	bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
+				cmd_flags);
 }
 
 static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
@@ -4482,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		bfq_schedule_dispatch(bfqd);
 }
 
-static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
 {
 	bfqq->allocated--;
 
 	bfq_put_queue(bfqq);
 }
 
-static void bfq_finish_request(struct request *rq)
+/*
+ * Handle either a requeue or a finish for rq. The things to do are
+ * the same in both cases: all references to rq are to be dropped. In
+ * particular, rq is considered completed from the point of view of
+ * the scheduler.
+ */
+static void bfq_finish_requeue_request(struct request *rq)
 {
-	struct bfq_queue *bfqq;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd;
 
-	if (!rq->elv.icq)
+	/*
+	 * Requeue and finish hooks are invoked in blk-mq without
+	 * checking whether the involved request is actually still
+	 * referenced in the scheduler. To handle this fact, the
+	 * following two checks make this function exit in case of
+	 * spurious invocations, for which there is nothing to do.
+	 *
+	 * First, check whether rq has nothing to do with an elevator.
+	 */
+	if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
+		return;
+
+	/*
+	 * rq either is not associated with any icq, or is an already
+	 * requeued request that has not (yet) been re-inserted into
+	 * a bfq_queue.
+	 */
+	if (!rq->elv.icq || !bfqq)
 		return;
 
-	bfqq = RQ_BFQQ(rq);
 	bfqd = bfqq->bfqd;
 
 	if (rq->rq_flags & RQF_STARTED)
@@ -4512,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
 		spin_lock_irqsave(&bfqd->lock, flags);
 
 		bfq_completed_request(bfqq, bfqd);
-		bfq_put_rq_priv_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 
 		spin_unlock_irqrestore(&bfqd->lock, flags);
 	} else {
 		/*
 		 * Request rq may be still/already in the scheduler,
-		 * in which case we need to remove it. And we cannot
+		 * in which case we need to remove it (this should
+		 * never happen in case of requeue). And we cannot
 		 * defer such a check and removal, to avoid
 		 * inconsistencies in the time interval from the end
 		 * of this function to the start of the deferred work.
@@ -4533,9 +4787,26 @@ static void bfq_finish_request(struct request *rq)
 			bfqg_stats_update_io_remove(bfqq_group(bfqq),
 						    rq->cmd_flags);
 		}
-		bfq_put_rq_priv_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 	}
 
+	/*
+	 * Reset private fields. In case of a requeue, this allows
+	 * this function to correctly do nothing if it is spuriously
+	 * invoked again on this same request (see the check at the
+	 * beginning of the function). Probably, a better general
+	 * design would be to prevent blk-mq from invoking the requeue
+	 * or finish hooks of an elevator, for a request that is not
+	 * referred to by that elevator.
+	 *
+	 * Resetting the following fields would break the
+	 * request-insertion logic if rq is re-inserted into a bfq
+	 * internal queue, without a re-preparation. Here we assume
+	 * that re-insertions of requeued requests, without
+	 * re-preparation, can happen only for pass_through or at_head
+	 * requests (which are not re-inserted into bfq internal
+	 * queues).
+	 */
 	rq->elv.priv[0] = NULL;
 	rq->elv.priv[1] = NULL;
 }
@@ -4818,6 +5089,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
 	hrtimer_cancel(&bfqd->idle_slice_timer);
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
+	/* release oom-queue reference to root group */
+	bfqg_and_blkg_put(bfqd->root_group);
+
 	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
 #else
 	spin_lock_irq(&bfqd->lock);
@@ -5206,8 +5480,10 @@ static struct elv_fs_entry bfq_attrs[] = {
 
 static struct elevator_type iosched_bfq_mq = {
 	.ops.mq = {
+		.limit_depth		= bfq_limit_depth,
 		.prepare_request	= bfq_prepare_request,
-		.finish_request		= bfq_finish_request,
+		.requeue_request        = bfq_finish_requeue_request,
+		.finish_request		= bfq_finish_requeue_request,
 		.exit_icq		= bfq_exit_icq,
 		.insert_requests	= bfq_insert_requests,
 		.dispatch_request	= bfq_dispatch_request,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 91c4390903a1..350c39ae2896 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -337,6 +337,11 @@ struct bfq_queue {
 	 * last transition from idle to backlogged.
 	 */
 	unsigned long service_from_backlogged;
+	/*
+	 * Cumulative service received from the @bfq_queue since its
+	 * last transition to weight-raised state.
+	 */
+	unsigned long service_from_wr;
 
 	/*
 	 * Value of wr start time when switching to soft rt
@@ -344,6 +349,8 @@ struct bfq_queue {
 	unsigned long wr_start_at_switch_to_srt;
 
 	unsigned long split_time; /* time of last split */
+
+	unsigned long first_IO_time; /* time of first I/O for this queue */
 };
 
 /**
@@ -627,6 +634,18 @@ struct bfq_data {
 	struct bfq_io_cq *bio_bic;
 	/* bfqq associated with the task issuing current bio for merging */
 	struct bfq_queue *bio_bfqq;
+
+	/*
+	 * Cached sbitmap shift, used to compute depth limits in
+	 * bfq_update_depths.
+	 */
+	unsigned int sb_shift;
+
+	/*
+	 * Depth limits used in bfq_limit_depth (see comments on the
+	 * function)
+	 */
+	unsigned int word_depths[2][2];
 };
 
 enum bfqq_state_flags {
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index e495d3f9b4b0..4498c43245e2 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
 	struct bfq_entity *entity = &bfqq->entity;
 	struct bfq_service_tree *st;
 
+	if (!bfqq->service_from_backlogged)
+		bfqq->first_IO_time = jiffies;
+
+	if (bfqq->wr_coeff > 1)
+		bfqq->service_from_wr += served;
+
+	bfqq->service_from_backlogged += served;
 	for_each_entity(entity) {
 		st = bfq_entity_service_tree(entity);
 
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 23b42e8aa03e..9cfdd6c83b5b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 /**
  * __bio_integrity_endio - Integrity I/O completion function
  * @bio:	Protected bio
- * @error:	Pointer to errno
  *
  * Description: Completion for integrity I/O
  *
diff --git a/block/bio.c b/block/bio.c
index 8bfdea58159b..e1708db48258 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_disk = bio_src->bi_disk;
 	bio->bi_partno = bio_src->bi_partno;
 	bio_set_flag(bio, BIO_CLONED);
+	if (bio_flagged(bio_src, BIO_THROTTLED))
+		bio_set_flag(bio, BIO_THROTTLED);
 	bio->bi_opf = bio_src->bi_opf;
 	bio->bi_write_hint = bio_src->bi_write_hint;
 	bio->bi_iter = bio_src->bi_iter;
@@ -969,34 +971,6 @@ void bio_advance(struct bio *bio, unsigned bytes)
 EXPORT_SYMBOL(bio_advance);
 
 /**
- * bio_alloc_pages - allocates a single page for each bvec in a bio
- * @bio: bio to allocate pages for
- * @gfp_mask: flags for allocation
- *
- * Allocates pages up to @bio->bi_vcnt.
- *
- * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
- * freed.
- */
-int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
-{
-	int i;
-	struct bio_vec *bv;
-
-	bio_for_each_segment_all(bv, bio, i) {
-		bv->bv_page = alloc_page(gfp_mask);
-		if (!bv->bv_page) {
-			while (--bv >= bio->bi_io_vec)
-				__free_page(bv->bv_page);
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(bio_alloc_pages);
-
-/**
  * bio_copy_data - copy contents of data buffers from one chain of bios to
  * another
  * @src: source bio list
@@ -1836,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
 	bio_advance(bio, split->bi_iter.bi_size);
 
 	if (bio_flagged(bio, BIO_TRACE_COMPLETION))
-		bio_set_flag(bio, BIO_TRACE_COMPLETION);
+		bio_set_flag(split, BIO_TRACE_COMPLETION);
 
 	return split;
 }
diff --git a/block/blk-core.c b/block/blk-core.c
index b8881750a3ac..2d1a7bbe0634 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
 #include <linux/debugfs.h>
+#include <linux/bpf.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -126,6 +127,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->start_time = jiffies;
 	set_start_time_ns(rq);
 	rq->part = NULL;
+	seqcount_init(&rq->gstate_seq);
+	u64_stats_init(&rq->aborted_gstate_sync);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -143,6 +146,7 @@ static const struct {
 	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
 	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
 	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
+	[BLK_STS_DEV_RESOURCE]	= { -EBUSY,	"device resource" },
 	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },
 
 	/* device mapper special case, should not leak out: */
@@ -562,6 +566,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 	}
 }
 
+void blk_drain_queue(struct request_queue *q)
+{
+	spin_lock_irq(q->queue_lock);
+	__blk_drain_queue(q, true);
+	spin_unlock_irq(q->queue_lock);
+}
+
 /**
  * blk_queue_bypass_start - enter queue bypass mode
  * @q: queue of interest
@@ -689,11 +700,18 @@ void blk_cleanup_queue(struct request_queue *q)
 	 */
 	blk_freeze_queue(q);
 	spin_lock_irq(lock);
-	if (!q->mq_ops)
-		__blk_drain_queue(q, true);
 	queue_flag_set(QUEUE_FLAG_DEAD, q);
 	spin_unlock_irq(lock);
 
+	/*
+	 * Make sure all in-progress dispatches are completed, because
+	 * blk_freeze_queue() can only complete all requests, and
+	 * dispatch may still be in progress since we dispatch requests
+	 * from more than one context.
+	 */
+	if (q->mq_ops)
+		blk_mq_quiesce_queue(q);
+
 	/* for synchronous bio-based driver finish in-flight integrity i/o */
 	blk_flush_integrity();
 
@@ -1641,6 +1659,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 
 	lockdep_assert_held(q->queue_lock);
 
+	blk_req_zone_write_unlock(req);
 	blk_pm_put_request(req);
 
 	elv_completed_request(q, req);
@@ -2050,6 +2069,29 @@ static inline bool should_fail_request(struct hd_struct *part,
 
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
 
+static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+{
+	if (part->policy && op_is_write(bio_op(bio))) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_ERR
+		       "generic_make_request: Trying to write "
+			"to read-only block-device %s (partno %d)\n",
+			bio_devname(bio, b), part->partno);
+		return true;
+	}
+
+	return false;
+}
+
+static noinline int should_fail_bio(struct bio *bio)
+{
+	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+		return -EIO;
+	return 0;
+}
+ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -2058,27 +2100,28 @@ static inline int blk_partition_remap(struct bio *bio)
 	struct hd_struct *p;
 	int ret = 0;
 
+	rcu_read_lock();
+	p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+	if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
+		     bio_check_ro(bio, p))) {
+		ret = -EIO;
+		goto out;
+	}
+
 	/*
 	 * Zone reset does not include bi_size so bio_sectors() is always 0.
 	 * Include a test for the reset op code and perform the remap if needed.
 	 */
-	if (!bio->bi_partno ||
-	    (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
-		return 0;
+	if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
+		goto out;
 
-	rcu_read_lock();
-	p = __disk_get_part(bio->bi_disk, bio->bi_partno);
-	if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
-		bio->bi_iter.bi_sector += p->start_sect;
-		bio->bi_partno = 0;
-		trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
-				bio->bi_iter.bi_sector - p->start_sect);
-	} else {
-		printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
-		ret = -EIO;
-	}
-	rcu_read_unlock();
+	bio->bi_iter.bi_sector += p->start_sect;
+	bio->bi_partno = 0;
+	trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+			      bio->bi_iter.bi_sector - p->start_sect);
 
+out:
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -2137,15 +2180,19 @@ generic_make_request_checks(struct bio *bio)
 	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
 	 * if queue is not a request based queue.
 	 */
-
 	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
 		goto not_supported;
 
-	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+	if (should_fail_bio(bio))
 		goto end_io;
 
-	if (blk_partition_remap(bio))
-		goto end_io;
+	if (!bio->bi_partno) {
+		if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+			goto end_io;
+	} else {
+		if (blk_partition_remap(bio))
+			goto end_io;
+	}
 
 	if (bio_check_eod(bio, nr_sectors))
 		goto end_io;
@@ -2488,8 +2535,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
 		 * bypass a potential scheduler on the bottom device for
 		 * insert.
 		 */
-		blk_mq_request_bypass_insert(rq, true);
-		return BLK_STS_OK;
+		return blk_mq_request_issue_directly(rq);
 	}
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -2841,7 +2887,7 @@ void blk_start_request(struct request *req)
 		wbt_issue(req->q->rq_wb, &req->issue_stat);
 	}
 
-	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+	BUG_ON(blk_rq_is_complete(req));
 	blk_add_timer(req);
 }
 EXPORT_SYMBOL(blk_start_request);
@@ -3246,6 +3292,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 {
 	if (bio_has_data(bio))
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
+	else if (bio_op(bio) == REQ_OP_DISCARD)
+		rq->nr_phys_segments = 1;
 
 	rq->__data_len = bio->bi_iter.bi_size;
 	rq->bio = rq->biotail = bio;
@@ -3410,20 +3458,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
 
-int kblockd_schedule_delayed_work(struct delayed_work *dwork,
-				  unsigned long delay)
-{
-	return queue_delayed_work(kblockd_workqueue, dwork, delay);
-}
-EXPORT_SYMBOL(kblockd_schedule_delayed_work);
-
-int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
-				     unsigned long delay)
-{
-	return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
-}
-EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
-
 /**
  * blk_start_plug - initialize blk_plug and track it inside the task_struct
  * @plug:	The &struct blk_plug that needs to be initialized
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 5c0f3dc446dc..f7b292f12449 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	 * be reused after dying flag is set
 	 */
 	if (q->mq_ops) {
-		blk_mq_sched_insert_request(rq, at_head, true, false, false);
+		blk_mq_sched_insert_request(rq, at_head, true, false);
 		return;
 	}
 
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2bc544ce3d2e..a676084d4740 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	if (!q)
 		return -ENXIO;
 
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
 	if (flags & BLKDEV_DISCARD_SECURE) {
 		if (!blk_queue_secure_erase(q))
 			return -EOPNOTSUPP;
@@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	if (!q)
 		return -ENXIO;
 
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
 	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
 	if ((sector | nr_sects) & bs_mask)
 		return -EINVAL;
@@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 	if (!q)
 		return -ENXIO;
 
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
 	/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
 	max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
 
@@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
 	if (!q)
 		return -ENXIO;
 
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
 	while (nr_sects != 0) {
 		bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
 			       gfp_mask);
diff --git a/block/blk-map.c b/block/blk-map.c
index b21f8e86f120..db9373bd31ac 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -12,22 +12,29 @@
 #include "blk.h"
 
 /*
- * Append a bio to a passthrough request.  Only works can be merged into
- * the request based on the driver constraints.
+ * Append a bio to a passthrough request.  Only works if the bio can be merged
+ * into the request based on the driver constraints.
  */
-int blk_rq_append_bio(struct request *rq, struct bio *bio)
+int blk_rq_append_bio(struct request *rq, struct bio **bio)
 {
-	blk_queue_bounce(rq->q, &bio);
+	struct bio *orig_bio = *bio;
+
+	blk_queue_bounce(rq->q, bio);
 
 	if (!rq->bio) {
-		blk_rq_bio_prep(rq->q, rq, bio);
+		blk_rq_bio_prep(rq->q, rq, *bio);
 	} else {
-		if (!ll_back_merge_fn(rq->q, rq, bio))
+		if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+			if (orig_bio != *bio) {
+				bio_put(*bio);
+				*bio = orig_bio;
+			}
 			return -EINVAL;
+		}
 
-		rq->biotail->bi_next = bio;
-		rq->biotail = bio;
-		rq->__data_len += bio->bi_iter.bi_size;
+		rq->biotail->bi_next = *bio;
+		rq->biotail = *bio;
+		rq->__data_len += (*bio)->bi_iter.bi_size;
 	}
 
 	return 0;
@@ -73,14 +80,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
 	 * We link the bounce buffer in and could have to traverse it
 	 * later so we have to get a ref to prevent it from being freed
 	 */
-	ret = blk_rq_append_bio(rq, bio);
-	bio_get(bio);
+	ret = blk_rq_append_bio(rq, &bio);
 	if (ret) {
-		bio_endio(bio);
 		__blk_rq_unmap_user(orig_bio);
-		bio_put(bio);
 		return ret;
 	}
+	bio_get(bio);
 
 	return 0;
 }
@@ -114,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
 	struct bio *bio = NULL;
 	struct iov_iter i;
-	int ret;
+	int ret = -EINVAL;
 
 	if (!iter_is_iovec(iter))
 		goto fail;
@@ -143,7 +148,7 @@ unmap_rq:
 	__blk_rq_unmap_user(bio);
 fail:
 	rq->bio = NULL;
-	return -EINVAL;
+	return ret;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
 
@@ -213,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	int reading = rq_data_dir(rq) == READ;
 	unsigned long addr = (unsigned long) kbuf;
 	int do_copy = 0;
-	struct bio *bio;
+	struct bio *bio, *orig_bio;
 	int ret;
 
 	if (len > (queue_max_hw_sectors(q) << 9))
@@ -236,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (do_copy)
 		rq->rq_flags |= RQF_COPY_USER;
 
-	ret = blk_rq_append_bio(rq, bio);
+	orig_bio = bio;
+	ret = blk_rq_append_bio(rq, &bio);
 	if (unlikely(ret)) {
 		/* request is too big */
-		bio_put(bio);
+		bio_put(orig_bio);
 		return ret;
 	}
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f5dedd57dff6..782940c65d8a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 				nsegs++;
 				sectors = max_sectors;
 			}
-			if (sectors)
-				goto split;
-			/* Make this single bvec as the 1st segment */
+			goto split;
 		}
 
 		if (bvprvp && blk_queue_cluster(q)) {
@@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 			bvprvp = &bvprv;
 			sectors += bv.bv_len >> 9;
 
-			if (nsegs == 1 && seg_size > front_seg_size)
-				front_seg_size = seg_size;
 			continue;
 		}
 new_segment:
 		if (nsegs == queue_max_segments(q))
 			goto split;
 
+		if (nsegs == 1 && seg_size > front_seg_size)
+			front_seg_size = seg_size;
+
 		nsegs++;
 		bvprv = bv;
 		bvprvp = &bvprv;
 		seg_size = bv.bv_len;
 		sectors += bv.bv_len >> 9;
 
-		if (nsegs == 1 && seg_size > front_seg_size)
-			front_seg_size = seg_size;
 	}
 
 	do_split = false;
@@ -174,6 +171,8 @@ split:
 			bio = new;
 	}
 
+	if (nsegs == 1 && seg_size > front_seg_size)
+		front_seg_size = seg_size;
 	bio->bi_seg_front_size = front_seg_size;
 	if (seg_size > bio->bi_seg_back_size)
 		bio->bi_seg_back_size = seg_size;
@@ -551,6 +550,24 @@ static bool req_no_special_merge(struct request *req)
 	return !q->mq_ops && req->special;
 }
 
+static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
+		struct request *next)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req);
+
+	if (segments >= queue_max_discard_segments(q))
+		goto no_merge;
+	if (blk_rq_sectors(req) + bio_sectors(next->bio) >
+	    blk_rq_get_max_sectors(req, blk_rq_pos(req)))
+		goto no_merge;
+
+	req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next);
+	return true;
+no_merge:
+	req_set_nomerge(q, req);
+	return false;
+}
+
 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 				struct request *next)
 {
@@ -684,9 +701,13 @@ static struct request *attempt_merge(struct request_queue *q,
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
 	 * will have updated segment counts, update sector
-	 * counts here.
+	 * counts here. Handle DISCARDs separately, as they
+	 * have separate settings.
 	 */
-	if (!ll_merge_requests_fn(q, req, next))
+	if (req_op(req) == REQ_OP_DISCARD) {
+		if (!req_attempt_discard_merge(q, req, next))
+			return NULL;
+	} else if (!ll_merge_requests_fn(q, req, next))
 		return NULL;
 
 	/*
@@ -716,7 +737,8 @@ static struct request *attempt_merge(struct request_queue *q,
 
 	req->__data_len += blk_rq_bytes(next);
 
-	elv_merge_requests(q, req, next);
+	if (req_op(req) != REQ_OP_DISCARD)
+		elv_merge_requests(q, req, next);
 
 	/*
 	 * 'next' is going away, so update stats accordingly
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b56a4f35720d..21cbc1f071c6 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -289,17 +289,12 @@ static const char *const rqf_name[] = {
 	RQF_NAME(HASHED),
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
+	RQF_NAME(ZONE_WRITE_LOCKED),
+	RQF_NAME(MQ_TIMEOUT_EXPIRED),
+	RQF_NAME(MQ_POLL_SLEPT),
 };
 #undef RQF_NAME
 
-#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
-static const char *const rqaf_name[] = {
-	RQAF_NAME(COMPLETE),
-	RQAF_NAME(STARTED),
-	RQAF_NAME(POLL_SLEPT),
-};
-#undef RQAF_NAME
-
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 {
 	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	seq_puts(m, ", .rq_flags=");
 	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
 		       ARRAY_SIZE(rqf_name));
-	seq_puts(m, ", .atomic_flags=");
-	blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
+	seq_printf(m, ", complete=%d", blk_rq_is_complete(rq));
 	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
 		   rq->internal_tag);
 	if (mq_ops->show_rq)
@@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
 	const struct show_busy_params *params = data;
 
 	if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
-	    test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+	    blk_mq_rq_state(rq) != MQ_RQ_IDLE)
 		__blk_mq_debugfs_rq_show(params->m,
 					 list_entry_rq(&rq->queuelist));
 }
@@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf,
 	const struct blk_mq_debugfs_attr *attr = m->private;
 	void *data = d_inode(file->f_path.dentry->d_parent)->i_private;
 
-	if (!attr->write)
+	/*
+	 * Attributes that only implement .seq_ops are read-only and 'attr' is
+	 * the same as 'data' in this case.
+	 */
+	if (attr == data || !attr->write)
 		return -EPERM;
 
 	return attr->write(data, buf, count, ppos);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c117bd8fd1f6..25c14c58385c 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	WRITE_ONCE(hctx->dispatch_from, ctx);
 }
 
-/* return true if hw queue need to be run again */
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
@@ -260,6 +259,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 		if (!*merged_request)
 			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
 		return true;
+	case ELEVATOR_DISCARD_MERGE:
+		return bio_attempt_discard_merge(q, rq, bio);
 	default:
 		return false;
 	}
@@ -428,7 +429,7 @@ done:
 }
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
-				 bool run_queue, bool async, bool can_block)
+				 bool run_queue, bool async)
 {
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index ba1d1418a96d..1e9c9018ace1 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
 void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
-				 bool run_queue, bool async, bool can_block);
+				 bool run_queue, bool async);
 void blk_mq_sched_insert_requests(struct request_queue *q,
 				  struct blk_mq_ctx *ctx,
 				  struct list_head *list, bool run_queue_async);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 79969c3c234f..a54b4b070f1c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 	return ret;
 }
 
-static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
+void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
@@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 	q->mq_sysfs_init_done = false;
 }
 
-void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
-{
-	mutex_lock(&q->sysfs_lock);
-	__blk_mq_unregister_dev(dev, q);
-	mutex_unlock(&q->sysfs_lock);
-}
-
 void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
 {
 	kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index c81b40ecd3f1..336dde07b230 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	ws = bt_wait_ptr(bt, data->hctx);
 	drop_ctx = data->ctx == NULL;
 	do {
-		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
-
-		tag = __blk_mq_get_tag(data, bt);
-		if (tag != -1)
-			break;
-
 		/*
 		 * We're out of tags on this hardware queue, kick any
 		 * pending IO submits before going to sleep waiting for
@@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
+		prepare_to_wait_exclusive(&ws->wait, &wait,
+						TASK_UNINTERRUPTIBLE);
+
+		tag = __blk_mq_get_tag(data, bt);
+		if (tag != -1)
+			break;
+
 		if (data->ctx)
 			blk_mq_put_ctx(data->ctx);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 11097477eeab..df93102e2149 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 {
 	struct mq_inflight *mi = priv;
 
-	if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
-	    !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+	if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
 		/*
 		 * index[0] counts the specific partition that was asked
 		 * for. index[1] counts the ones that are active on the
@@ -161,6 +160,8 @@ void blk_freeze_queue(struct request_queue *q)
 	 * exported to drivers as the only user for unfreeze is blk_mq.
 	 */
 	blk_freeze_queue_start(q);
+	if (!q->mq_ops)
+		blk_drain_queue(q);
 	blk_mq_freeze_queue_wait(q);
 }
 
@@ -220,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(hctx->queue_rq_srcu);
+			synchronize_srcu(hctx->srcu);
 		else
 			rcu = true;
 	}
@@ -270,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 {
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct request *rq = tags->static_rqs[tag];
-
-	rq->rq_flags = 0;
+	req_flags_t rq_flags = 0;
 
 	if (data->flags & BLK_MQ_REQ_INTERNAL) {
 		rq->tag = -1;
 		rq->internal_tag = tag;
 	} else {
 		if (blk_mq_tag_busy(data->hctx)) {
-			rq->rq_flags = RQF_MQ_INFLIGHT;
+			rq_flags = RQF_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
 		}
 		rq->tag = tag;
@@ -286,27 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		data->hctx->tags->rqs[rq->tag] = rq;
 	}
 
-	INIT_LIST_HEAD(&rq->queuelist);
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
+	rq->rq_flags = rq_flags;
+	rq->cpu = -1;
 	rq->cmd_flags = op;
 	if (data->flags & BLK_MQ_REQ_PREEMPT)
 		rq->rq_flags |= RQF_PREEMPT;
 	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
-	/* do not touch atomic flags, it needs atomic ops against the timer */
-	rq->cpu = -1;
+	INIT_LIST_HEAD(&rq->queuelist);
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
 	rq->start_time = jiffies;
-#ifdef CONFIG_BLK_CGROUP
-	rq->rl = NULL;
-	set_start_time_ns(rq);
-	rq->io_start_time_ns = 0;
-#endif
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	rq->nr_integrity_segments = 0;
@@ -314,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->special = NULL;
 	/* tag was already set */
 	rq->extra_len = 0;
+	rq->__deadline = 0;
 
 	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->timeout = 0;
@@ -322,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->end_io_data = NULL;
 	rq->next_rq = NULL;
 
+#ifdef CONFIG_BLK_CGROUP
+	rq->rl = NULL;
+	set_start_time_ns(rq);
+	rq->io_start_time_ns = 0;
+#endif
+
 	data->ctx->rq_dispatched[op_is_sync(op)]++;
 	return rq;
 }
@@ -441,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		blk_queue_exit(q);
 		return ERR_PTR(-EXDEV);
 	}
-	cpu = cpumask_first(alloc_data.hctx->cpumask);
+	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
 	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 
 	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
@@ -483,8 +485,7 @@ void blk_mq_free_request(struct request *rq)
 	if (blk_rq_rl(rq))
 		blk_put_rl(blk_rq_rl(rq));
 
-	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
 	if (rq->tag != -1)
 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 	if (sched_tag != -1)
@@ -530,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
+	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+	blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
+
 	if (rq->internal_tag != -1)
 		blk_mq_sched_completed_request(rq);
 	if (rq->rq_flags & RQF_STATS) {
@@ -557,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq)
 	put_cpu();
 }
 
+static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+	__releases(hctx->srcu)
+{
+	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+		rcu_read_unlock();
+	else
+		srcu_read_unlock(hctx->srcu, srcu_idx);
+}
+
+static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+	__acquires(hctx->srcu)
+{
+	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+		/* shut up gcc false positive */
+		*srcu_idx = 0;
+		rcu_read_lock();
+	} else
+		*srcu_idx = srcu_read_lock(hctx->srcu);
+}
+
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+	unsigned long flags;
+
+	/*
+	 * blk_mq_rq_aborted_gstate() is used from the completion path and
+	 * can thus be called from irq context.  u64_stats_fetch in the
+	 * middle of update on the same CPU leads to lockup.  Disable irq
+	 * while updating.
+	 */
+	local_irq_save(flags);
+	u64_stats_update_begin(&rq->aborted_gstate_sync);
+	rq->aborted_gstate = gstate;
+	u64_stats_update_end(&rq->aborted_gstate_sync);
+	local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+	unsigned int start;
+	u64 aborted_gstate;
+
+	do {
+		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+		aborted_gstate = rq->aborted_gstate;
+	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+	return aborted_gstate;
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:		the request being processed
@@ -568,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq)
 void blk_mq_complete_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+	int srcu_idx;
 
 	if (unlikely(blk_should_fake_timeout(q)))
 		return;
-	if (!blk_mark_rq_complete(rq))
+
+	/*
+	 * If @rq->aborted_gstate equals the current instance, timeout is
+	 * claiming @rq and we lost.  This is synchronized through
+	 * hctx_lock().  See blk_mq_timeout_work() for details.
+	 *
+	 * Completion path never blocks and we can directly use RCU here
+	 * instead of hctx_lock() which can be either RCU or SRCU.
+	 * However, that would complicate paths which want to synchronize
+	 * against us.  Let's stay in sync with the issue path so that
+	 * hctx_lock() covers both issue and completion paths.
+	 */
+	hctx_lock(hctx, &srcu_idx);
+	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
 		__blk_mq_complete_request(rq);
+	hctx_unlock(hctx, srcu_idx);
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
 int blk_mq_request_started(struct request *rq)
 {
-	return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
 }
 EXPORT_SYMBOL_GPL(blk_mq_request_started);
 
@@ -596,34 +666,27 @@ void blk_mq_start_request(struct request *rq)
 		wbt_issue(q->rq_wb, &rq->issue_stat);
 	}
 
-	blk_add_timer(rq);
-
-	WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
+	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 
 	/*
-	 * Mark us as started and clear complete. Complete might have been
-	 * set if requeue raced with timeout, which then marked it as
-	 * complete. So be sure to clear complete again when we start
-	 * the request, otherwise we'll ignore the completion event.
+	 * Mark @rq in-flight which also advances the generation number,
+	 * and register for timeout.  Protect with a seqcount to allow the
+	 * timeout path to read both @rq->gstate and @rq->deadline
+	 * coherently.
 	 *
-	 * Ensure that ->deadline is visible before we set STARTED, such that
-	 * blk_mq_check_expired() is guaranteed to observe our ->deadline when
-	 * it observes STARTED.
+	 * This is the only place where a request is marked in-flight.  If
+	 * the timeout path reads an in-flight @rq->gstate, the
+	 * @rq->deadline it reads together under @rq->gstate_seq is
+	 * guaranteed to be the matching one.
 	 */
-	smp_wmb();
-	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
-		/*
-		 * Coherence order guarantees these consecutive stores to a
-		 * single variable propagate in the specified order. Thus the
-		 * clear_bit() is ordered _after_ the set bit. See
-		 * blk_mq_check_expired().
-		 *
-		 * (the bits must be part of the same byte for this to be
-		 * true).
-		 */
-		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-	}
+	preempt_disable();
+	write_seqcount_begin(&rq->gstate_seq);
+
+	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+	blk_add_timer(rq);
+
+	write_seqcount_end(&rq->gstate_seq);
+	preempt_enable();
 
 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
 		/*
@@ -637,13 +700,9 @@ void blk_mq_start_request(struct request *rq)
 EXPORT_SYMBOL(blk_mq_start_request);
 
 /*
- * When we reach here because queue is busy, REQ_ATOM_COMPLETE
- * flag isn't set yet, so there may be race with timeout handler,
- * but given rq->deadline is just set in .queue_rq() under
- * this situation, the race won't be possible in reality because
- * rq->timeout should be set as big enough to cover the window
- * between blk_mq_start_request() called from .queue_rq() and
- * clearing REQ_ATOM_STARTED here.
+ * When we reach here because queue is busy, it's safe to change the state
+ * to IDLE without checking @rq->aborted_gstate because we should still be
+ * holding the RCU read lock and thus protected against timeout.
  */
 static void __blk_mq_requeue_request(struct request *rq)
 {
@@ -655,7 +714,8 @@ static void __blk_mq_requeue_request(struct request *rq)
 	wbt_requeue(q->rq_wb, &rq->issue_stat);
 	blk_mq_sched_requeue_request(rq);
 
-	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+	if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
+		blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
 		if (q->dma_drain_size && blk_rq_bytes(rq))
 			rq->nr_phys_segments--;
 	}
@@ -687,13 +747,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
 
 		rq->rq_flags &= ~RQF_SOFTBARRIER;
 		list_del_init(&rq->queuelist);
-		blk_mq_sched_insert_request(rq, true, false, false, true);
+		blk_mq_sched_insert_request(rq, true, false, false);
 	}
 
 	while (!list_empty(&rq_list)) {
 		rq = list_entry(rq_list.next, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		blk_mq_sched_insert_request(rq, false, false, false, true);
+		blk_mq_sched_insert_request(rq, false, false, false);
 	}
 
 	blk_mq_run_hw_queues(q, false);
@@ -727,7 +787,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 
 void blk_mq_kick_requeue_list(struct request_queue *q)
 {
-	kblockd_schedule_delayed_work(&q->requeue_work, 0);
+	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
 }
 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 
@@ -753,24 +813,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
 struct blk_mq_timeout_data {
 	unsigned long next;
 	unsigned int next_set;
+	unsigned int nr_expired;
 };
 
-void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
 	const struct blk_mq_ops *ops = req->q->mq_ops;
 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 
-	/*
-	 * We know that complete is set at this point. If STARTED isn't set
-	 * anymore, then the request isn't active and the "timeout" should
-	 * just be ignored. This can happen due to the bitflag ordering.
-	 * Timeout first checks if STARTED is set, and if it is, assumes
-	 * the request is active. But if we race with completion, then
-	 * both flags will get cleared. So check here again, and ignore
-	 * a timeout event with a request that isn't active.
-	 */
-	if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
-		return;
+	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
 
 	if (ops->timeout)
 		ret = ops->timeout(req, reserved);
@@ -780,8 +831,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
 		__blk_mq_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
+		/*
+		 * As nothing prevents a completion from happening while
+		 * ->aborted_gstate is set, this may lead to ignored
+		 * completions and further spurious timeouts.
+		 */
+		blk_mq_rq_update_aborted_gstate(req, 0);
 		blk_add_timer(req);
-		blk_clear_rq_complete(req);
 		break;
 	case BLK_EH_NOT_HANDLED:
 		break;
@@ -795,50 +851,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
 	struct blk_mq_timeout_data *data = priv;
-	unsigned long deadline;
+	unsigned long gstate, deadline;
+	int start;
 
-	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-		return;
+	might_sleep();
 
-	/*
-	 * Ensures that if we see STARTED we must also see our
-	 * up-to-date deadline, see blk_mq_start_request().
-	 */
-	smp_rmb();
+	if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
+		return;
 
-	deadline = READ_ONCE(rq->deadline);
+	/* read coherent snapshots of @rq->gstate and @rq->deadline */
+	while (true) {
+		start = read_seqcount_begin(&rq->gstate_seq);
+		gstate = READ_ONCE(rq->gstate);
+		deadline = blk_rq_deadline(rq);
+		if (!read_seqcount_retry(&rq->gstate_seq, start))
+			break;
+		cond_resched();
+	}
 
-	/*
-	 * The rq being checked may have been freed and reallocated
-	 * out already here, we avoid this race by checking rq->deadline
-	 * and REQ_ATOM_COMPLETE flag together:
-	 *
-	 * - if rq->deadline is observed as new value because of
-	 *   reusing, the rq won't be timed out because of timing.
-	 * - if rq->deadline is observed as previous value,
-	 *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-	 *   because we put a barrier between setting rq->deadline
-	 *   and clearing the flag in blk_mq_start_request(), so
-	 *   this rq won't be timed out too.
-	 */
-	if (time_after_eq(jiffies, deadline)) {
-		if (!blk_mark_rq_complete(rq)) {
-			/*
-			 * Again coherence order ensures that consecutive reads
-			 * from the same variable must be in that order. This
-			 * ensures that if we see COMPLETE clear, we must then
-			 * see STARTED set and we'll ignore this timeout.
-			 *
-			 * (There's also the MB implied by the test_and_clear())
-			 */
-			blk_mq_rq_timed_out(rq, reserved);
-		}
+	/* if in-flight && overdue, mark for abortion */
+	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+	    time_after_eq(jiffies, deadline)) {
+		blk_mq_rq_update_aborted_gstate(rq, gstate);
+		data->nr_expired++;
+		hctx->nr_expired++;
 	} else if (!data->next_set || time_after(data->next, deadline)) {
 		data->next = deadline;
 		data->next_set = 1;
 	}
 }
 
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	/*
+	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
+	 * completions that we lost to, they would have finished and
+	 * updated @rq->gstate by now; otherwise, the completion path is
+	 * now guaranteed to see @rq->aborted_gstate and yield.  If
+	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+	 */
+	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
+	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
+		blk_mq_rq_timed_out(rq, reserved);
+}
+
 static void blk_mq_timeout_work(struct work_struct *work)
 {
 	struct request_queue *q =
@@ -846,7 +903,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	struct blk_mq_timeout_data data = {
 		.next		= 0,
 		.next_set	= 0,
+		.nr_expired	= 0,
 	};
+	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	/* A deadlock might occur if a request is stuck requiring a
@@ -865,14 +924,46 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
+	/* scan for the expired ones and set their ->aborted_gstate */
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
+	if (data.nr_expired) {
+		bool has_rcu = false;
+
+		/*
+		 * Wait till everyone sees ->aborted_gstate.  The
+		 * sequential waits for SRCUs aren't ideal.  If this ever
+		 * becomes a problem, we can add per-hw_ctx rcu_head and
+		 * wait in parallel.
+		 */
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (!hctx->nr_expired)
+				continue;
+
+			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+				has_rcu = true;
+			else
+				synchronize_srcu(hctx->srcu);
+
+			hctx->nr_expired = 0;
+		}
+		if (has_rcu)
+			synchronize_rcu();
+
+		/* terminate the ones we won */
+		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+	}
+
 	if (data.next_set) {
 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
 		mod_timer(&q->timeout, data.next);
 	} else {
-		struct blk_mq_hw_ctx *hctx;
-
+		/*
+		 * Request timeouts are handled as a forward rolling timer. If
+		 * we end up here it means that no requests are pending and
+		 * also that no request has been pending for a while. Mark
+		 * each hctx as idle.
+		 */
 		queue_for_each_hw_ctx(q, hctx, i) {
 			/* the hctx may be unmapped, so check it here */
 			if (blk_mq_hw_queue_mapped(hctx))
@@ -1008,68 +1099,71 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 
 /*
  * Mark us waiting for a tag. For shared tags, this involves hooking us into
- * the tag wakeups. For non-shared tags, we can simply mark us nedeing a
- * restart. For both caes, take care to check the condition again after
+ * the tag wakeups. For non-shared tags, we can simply mark us needing a
+ * restart. For both cases, take care to check the condition again after
  * marking us as waiting.
  */
 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
 				 struct request *rq)
 {
 	struct blk_mq_hw_ctx *this_hctx = *hctx;
-	bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
 	struct sbq_wait_state *ws;
 	wait_queue_entry_t *wait;
 	bool ret;
 
-	if (!shared_tags) {
+	if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
 		if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
 			set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
-	} else {
-		wait = &this_hctx->dispatch_wait;
-		if (!list_empty_careful(&wait->entry))
-			return false;
 
-		spin_lock(&this_hctx->lock);
-		if (!list_empty(&wait->entry)) {
-			spin_unlock(&this_hctx->lock);
-			return false;
-		}
+		/*
+		 * It's possible that a tag was freed in the window between the
+		 * allocation failure and adding the hardware queue to the wait
+		 * queue.
+		 *
+		 * Don't clear RESTART here, someone else could have set it.
+		 * At most this will cost an extra queue run.
+		 */
+		return blk_mq_get_driver_tag(rq, hctx, false);
+	}
+
+	wait = &this_hctx->dispatch_wait;
+	if (!list_empty_careful(&wait->entry))
+		return false;
 
-		ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
-		add_wait_queue(&ws->wait, wait);
+	spin_lock(&this_hctx->lock);
+	if (!list_empty(&wait->entry)) {
+		spin_unlock(&this_hctx->lock);
+		return false;
 	}
 
+	ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+	add_wait_queue(&ws->wait, wait);
+
 	/*
 	 * It's possible that a tag was freed in the window between the
 	 * allocation failure and adding the hardware queue to the wait
 	 * queue.
 	 */
 	ret = blk_mq_get_driver_tag(rq, hctx, false);
-
-	if (!shared_tags) {
-		/*
-		 * Don't clear RESTART here, someone else could have set it.
-		 * At most this will cost an extra queue run.
-		 */
-		return ret;
-	} else {
-		if (!ret) {
-			spin_unlock(&this_hctx->lock);
-			return false;
-		}
-
-		/*
-		 * We got a tag, remove ourselves from the wait queue to ensure
-		 * someone else gets the wakeup.
-		 */
-		spin_lock_irq(&ws->wait.lock);
-		list_del_init(&wait->entry);
-		spin_unlock_irq(&ws->wait.lock);
+	if (!ret) {
 		spin_unlock(&this_hctx->lock);
-		return true;
+		return false;
 	}
+
+	/*
+	 * We got a tag, remove ourselves from the wait queue to ensure
+	 * someone else gets the wakeup.
+	 */
+	spin_lock_irq(&ws->wait.lock);
+	list_del_init(&wait->entry);
+	spin_unlock_irq(&ws->wait.lock);
+	spin_unlock(&this_hctx->lock);
+
+	return true;
 }
 
+#define BLK_MQ_RESOURCE_DELAY	3		/* ms units */
+
 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			     bool got_budget)
 {
@@ -1077,6 +1171,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 	struct request *rq, *nxt;
 	bool no_tag = false;
 	int errors, queued;
+	blk_status_t ret = BLK_STS_OK;
 
 	if (list_empty(list))
 		return false;
@@ -1089,7 +1184,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 	errors = queued = 0;
 	do {
 		struct blk_mq_queue_data bd;
-		blk_status_t ret;
 
 		rq = list_first_entry(list, struct request, queuelist);
 		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
@@ -1134,7 +1228,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		}
 
 		ret = q->mq_ops->queue_rq(hctx, &bd);
-		if (ret == BLK_STS_RESOURCE) {
+		if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
 			/*
 			 * If an I/O scheduler has been configured and we got a
 			 * driver tag for the next request already, free it
@@ -1165,6 +1259,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 	 * that is where we will continue on next queue run.
 	 */
 	if (!list_empty(list)) {
+		bool needs_restart;
+
 		spin_lock(&hctx->lock);
 		list_splice_init(list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
@@ -1188,10 +1284,17 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		 * - Some but not all block drivers stop a queue before
 		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
 		 *   and dm-rq.
+		 *
+		 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
+		 * bit is set, run queue after a delay to avoid IO stalls
+		 * that could otherwise occur if the queue is idle.
 		 */
-		if (!blk_mq_sched_needs_restart(hctx) ||
+		needs_restart = blk_mq_sched_needs_restart(hctx);
+		if (!needs_restart ||
 		    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
 			blk_mq_run_hw_queue(hctx, true);
+		else if (needs_restart && (ret == BLK_STS_RESOURCE))
+			blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
 	}
 
 	return (queued + errors) != 0;
@@ -1204,9 +1307,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	/*
 	 * We should be running this queue from one of the CPUs that
 	 * are mapped to it.
+	 *
+	 * There are at least two related races now between setting
+	 * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
+	 * __blk_mq_run_hw_queue():
+	 *
+	 * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
+	 *   but later it becomes online; in that case this warning is
+	 *   harmless
+	 *
+	 * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
+	 *   but later it becomes offline, then the warning can't be
+	 *   triggered, and we depend on blk-mq timeout handler to
+	 *   handle dispatched requests to this hctx
 	 */
-	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-		cpu_online(hctx->next_cpu));
+	if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+		cpu_online(hctx->next_cpu)) {
+		printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
+			raw_smp_processor_id(),
+			cpumask_empty(hctx->cpumask) ? "inactive": "active");
+		dump_stack();
+	}
 
 	/*
 	 * We can't run the queue inline with ints disabled. Ensure that
@@ -1214,17 +1335,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	 */
 	WARN_ON_ONCE(in_interrupt());
 
-	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-		rcu_read_lock();
-		blk_mq_sched_dispatch_requests(hctx);
-		rcu_read_unlock();
-	} else {
-		might_sleep();
+	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-		blk_mq_sched_dispatch_requests(hctx);
-		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
-	}
+	hctx_lock(hctx, &srcu_idx);
+	blk_mq_sched_dispatch_requests(hctx);
+	hctx_unlock(hctx, srcu_idx);
 }
 
 /*
@@ -1235,20 +1350,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
+	bool tried = false;
+
 	if (hctx->queue->nr_hw_queues == 1)
 		return WORK_CPU_UNBOUND;
 
 	if (--hctx->next_cpu_batch <= 0) {
 		int next_cpu;
-
-		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+select_cpu:
+		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+				cpu_online_mask);
 		if (next_cpu >= nr_cpu_ids)
-			next_cpu = cpumask_first(hctx->cpumask);
+			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
 
-		hctx->next_cpu = next_cpu;
+		/*
+		 * No online CPU was found, so make sure hctx->next_cpu is
+		 * still set correctly so as not to break the workqueue.
+		 */
+		if (next_cpu >= nr_cpu_ids)
+			hctx->next_cpu = cpumask_first(hctx->cpumask);
+		else
+			hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 
+	/*
+	 * Do unbound schedule if we can't find an online CPU for this hctx,
+	 * and it should only happen in the path of handling CPU DEAD.
+	 */
+	if (!cpu_online(hctx->next_cpu)) {
+		if (!tried) {
+			tried = true;
+			goto select_cpu;
+		}
+
+		/*
+		 * Make sure to re-select the CPU next time, once CPUs
+		 * in hctx->cpumask become online again.
+		 */
+		hctx->next_cpu_batch = 1;
+		return WORK_CPU_UNBOUND;
+	}
 	return hctx->next_cpu;
 }
 
@@ -1272,9 +1414,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
 		put_cpu();
 	}
 
-	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-					 &hctx->run_work,
-					 msecs_to_jiffies(msecs));
+	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
+				    msecs_to_jiffies(msecs));
 }
 
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1285,7 +1426,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
 
 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-	if (blk_mq_hctx_has_pending(hctx)) {
+	int srcu_idx;
+	bool need_run;
+
+	/*
+	 * When queue is quiesced, we may be switching io scheduler, or
+	 * updating nr_hw_queues, or other things, and we can't run queue
+	 * any more, even __blk_mq_hctx_has_pending() can't be called safely.
+	 *
+	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+	 * quiesced.
+	 */
+	hctx_lock(hctx, &srcu_idx);
+	need_run = !blk_queue_quiesced(hctx->queue) &&
+		blk_mq_hctx_has_pending(hctx);
+	hctx_unlock(hctx, srcu_idx);
+
+	if (need_run) {
 		__blk_mq_delay_run_hw_queue(hctx, async, 0);
 		return true;
 	}
@@ -1593,9 +1750,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
 
-static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-					struct request *rq,
-					blk_qc_t *cookie, bool may_sleep)
+static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
+					    struct request *rq,
+					    blk_qc_t *cookie)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_queue_data bd = {
@@ -1604,15 +1761,53 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	};
 	blk_qc_t new_cookie;
 	blk_status_t ret;
+
+	new_cookie = request_to_qc_t(hctx, rq);
+
+	/*
+	 * For OK queue, we are done. For error, caller may kill it.
+	 * Any other error (busy), just add it to our list as we
+	 * previously would have done.
+	 */
+	ret = q->mq_ops->queue_rq(hctx, &bd);
+	switch (ret) {
+	case BLK_STS_OK:
+		*cookie = new_cookie;
+		break;
+	case BLK_STS_RESOURCE:
+	case BLK_STS_DEV_RESOURCE:
+		__blk_mq_requeue_request(rq);
+		break;
+	default:
+		*cookie = BLK_QC_T_NONE;
+		break;
+	}
+
+	return ret;
+}
+
+static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+						struct request *rq,
+						blk_qc_t *cookie,
+						bool bypass_insert)
+{
+	struct request_queue *q = rq->q;
 	bool run_queue = true;
 
-	/* RCU or SRCU read lock is needed before checking quiesced flag */
+	/*
+	 * RCU or SRCU read lock is needed before checking quiesced flag.
+	 *
+	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
+	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
+	 * so that the driver does not try to dispatch again.
+	 */
 	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
 		run_queue = false;
+		bypass_insert = false;
 		goto insert;
 	}
 
-	if (q->elevator)
+	if (q->elevator && !bypass_insert)
 		goto insert;
 
 	if (!blk_mq_get_driver_tag(rq, NULL, false))
@@ -1623,47 +1818,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		goto insert;
 	}
 
-	new_cookie = request_to_qc_t(hctx, rq);
-
-	/*
-	 * For OK queue, we are done. For error, kill it. Any other
-	 * error (busy), just add it to our list as we previously
-	 * would have done
-	 */
-	ret = q->mq_ops->queue_rq(hctx, &bd);
-	switch (ret) {
-	case BLK_STS_OK:
-		*cookie = new_cookie;
-		return;
-	case BLK_STS_RESOURCE:
-		__blk_mq_requeue_request(rq);
-		goto insert;
-	default:
-		*cookie = BLK_QC_T_NONE;
-		blk_mq_end_request(rq, ret);
-		return;
-	}
-
+	return __blk_mq_issue_directly(hctx, rq, cookie);
 insert:
-	blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
+	if (bypass_insert)
+		return BLK_STS_RESOURCE;
+
+	blk_mq_sched_insert_request(rq, false, run_queue, false);
+	return BLK_STS_OK;
 }
 
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, blk_qc_t *cookie)
 {
-	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-		rcu_read_lock();
-		__blk_mq_try_issue_directly(hctx, rq, cookie, false);
-		rcu_read_unlock();
-	} else {
-		unsigned int srcu_idx;
+	blk_status_t ret;
+	int srcu_idx;
 
-		might_sleep();
+	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
 
-		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-		__blk_mq_try_issue_directly(hctx, rq, cookie, true);
-		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
-	}
+	hctx_lock(hctx, &srcu_idx);
+
+	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
+		blk_mq_sched_insert_request(rq, false, true, false);
+	else if (ret != BLK_STS_OK)
+		blk_mq_end_request(rq, ret);
+
+	hctx_unlock(hctx, srcu_idx);
+}
+
+blk_status_t blk_mq_request_issue_directly(struct request *rq)
+{
+	blk_status_t ret;
+	int srcu_idx;
+	blk_qc_t unused_cookie;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+	hctx_lock(hctx, &srcu_idx);
+	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+	hctx_unlock(hctx, srcu_idx);
+
+	return ret;
 }
 
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
@@ -1774,7 +1969,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	} else if (q->elevator) {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_sched_insert_request(rq, false, true, true, true);
+		blk_mq_sched_insert_request(rq, false, true, true);
 	} else {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
@@ -1867,6 +2062,22 @@ static size_t order_to_size(unsigned int order)
 	return (size_t)PAGE_SIZE << order;
 }
 
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+			       unsigned int hctx_idx, int node)
+{
+	int ret;
+
+	if (set->ops->init_request) {
+		ret = set->ops->init_request(set, rq, hctx_idx, node);
+		if (ret)
+			return ret;
+	}
+
+	seqcount_init(&rq->gstate_seq);
+	u64_stats_init(&rq->aborted_gstate_sync);
+	return 0;
+}
+
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx, unsigned int depth)
 {
@@ -1928,12 +2139,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 			struct request *rq = p;
 
 			tags->static_rqs[i] = rq;
-			if (set->ops->init_request) {
-				if (set->ops->init_request(set, rq, hctx_idx,
-						node)) {
-					tags->static_rqs[i] = NULL;
-					goto fail;
-				}
+			if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+				tags->static_rqs[i] = NULL;
+				goto fail;
 			}
 
 			p += rq_size;
@@ -1992,7 +2200,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 {
 	blk_mq_debugfs_unregister_hctx(hctx);
 
-	blk_mq_tag_idle(hctx);
+	if (blk_mq_hw_queue_mapped(hctx))
+		blk_mq_tag_idle(hctx);
 
 	if (set->ops->exit_request)
 		set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
@@ -2003,7 +2212,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 		set->ops->exit_hctx(hctx, hctx_idx);
 
 	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		cleanup_srcu_struct(hctx->queue_rq_srcu);
+		cleanup_srcu_struct(hctx->srcu);
 
 	blk_mq_remove_cpuhp(hctx);
 	blk_free_flush_queue(hctx->fq);
@@ -2072,13 +2281,11 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (!hctx->fq)
 		goto sched_exit_hctx;
 
-	if (set->ops->init_request &&
-	    set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-				   node))
+	if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
 		goto free_fq;
 
 	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(hctx->queue_rq_srcu);
+		init_srcu_struct(hctx->srcu);
 
 	blk_mq_debugfs_register_hctx(q, hctx);
 
@@ -2114,16 +2321,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		INIT_LIST_HEAD(&__ctx->rq_list);
 		__ctx->queue = q;
 
-		/* If the cpu isn't present, the cpu is mapped to first hctx */
-		if (!cpu_present(i))
-			continue;
-
-		hctx = blk_mq_map_queue(q, i);
-
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
 		 * not, we remain on the home node of the device
 		 */
+		hctx = blk_mq_map_queue(q, i);
 		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
 			hctx->numa_node = local_memory_node(cpu_to_node(i));
 	}
@@ -2180,7 +2382,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	 *
 	 * If the cpu isn't present, the cpu is mapped to first hctx.
 	 */
-	for_each_present_cpu(i) {
+	for_each_possible_cpu(i) {
 		hctx_idx = q->mq_map[i];
 		/* unmapped hw queue can be remapped after CPU topo changed */
 		if (!set->tags[hctx_idx] &&
@@ -2234,7 +2436,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		/*
 		 * Initialize batch roundrobin counts
 		 */
-		hctx->next_cpu = cpumask_first(hctx->cpumask);
+		hctx->next_cpu = cpumask_first_and(hctx->cpumask,
+				cpu_online_mask);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 }
@@ -2367,7 +2570,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
 {
 	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
 
-	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
+	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
 			   __alignof__(struct blk_mq_hw_ctx)) !=
 		     sizeof(struct blk_mq_hw_ctx));
 
@@ -2384,6 +2587,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
 
 	blk_mq_sysfs_unregister(q);
+
+	/* protect against switching io scheduler */
+	mutex_lock(&q->sysfs_lock);
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		int node;
 
@@ -2428,6 +2634,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		}
 	}
 	q->nr_hw_queues = i;
+	mutex_unlock(&q->sysfs_lock);
 	blk_mq_sysfs_register(q);
 }
 
@@ -2599,9 +2806,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 
 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
 {
-	if (set->ops->map_queues)
+	if (set->ops->map_queues) {
+		int cpu;
+		/*
+		 * transport .map_queues is usually done in the following
+		 * way:
+		 *
+		 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
+		 * 	mask = get_cpu_mask(queue)
+		 * 	for_each_cpu(cpu, mask)
+		 * 		set->mq_map[cpu] = queue;
+		 * }
+		 *
+		 * When we need to remap, the table has to be cleared first to
+		 * kill stale mappings, since a CPU may end up not being mapped
+		 * to any hw queue.
+		 */
+		for_each_possible_cpu(cpu)
+			set->mq_map[cpu] = 0;
+
 		return set->ops->map_queues(set);
-	else
+	} else
 		return blk_mq_map_queues(set);
 }
 
@@ -2710,6 +2935,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 		return -EINVAL;
 
 	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
 
 	ret = 0;
 	queue_for_each_hw_ctx(q, hctx, i) {
@@ -2733,6 +2959,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	if (!ret)
 		q->nr_requests = nr;
 
+	blk_mq_unquiesce_queue(q);
 	blk_mq_unfreeze_queue(q);
 
 	return ret;
@@ -2848,7 +3075,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 	unsigned int nsecs;
 	ktime_t kt;
 
-	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+	if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
 		return false;
 
 	/*
@@ -2868,7 +3095,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 	if (!nsecs)
 		return false;
 
-	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+	rq->rq_flags |= RQF_MQ_POLL_SLEPT;
 
 	/*
 	 * This will be replaced with the stats tracking code, using
@@ -2882,7 +3109,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 
 	hrtimer_init_sleeper(&hs, current);
 	do {
-		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+		if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
 			break;
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		hrtimer_start_expires(&hs.timer, mode);
@@ -2968,12 +3195,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 
 static int __init blk_mq_init(void)
 {
-	/*
-	 * See comment in block/blk.h rq_atomic_flags enum
-	 */
-	BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-			(REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 				blk_mq_hctx_notify_dead);
 	return 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6c7c3ff5bf62..88c558f71819 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,6 +27,20 @@ struct blk_mq_ctx {
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 
+/*
+ * Bits for request->gstate.  The lower two bits carry MQ_RQ_* state value
+ * and the upper bits the generation number.
+ */
+enum mq_rq_state {
+	MQ_RQ_IDLE		= 0,
+	MQ_RQ_IN_FLIGHT		= 1,
+	MQ_RQ_COMPLETE		= 2,
+
+	MQ_RQ_STATE_BITS	= 2,
+	MQ_RQ_STATE_MASK	= (1 << MQ_RQ_STATE_BITS) - 1,
+	MQ_RQ_GEN_INC		= 1 << MQ_RQ_STATE_BITS,
+};
+
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 				struct list_head *list);
 
+/* Used by blk_insert_cloned_request() to issue request directly */
+blk_status_t blk_mq_request_issue_directly(struct request *rq);
+
 /*
  * CPU -> queue mappings
  */
@@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
 
-extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
-
 void blk_mq_release(struct request_queue *q);
 
+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline int blk_mq_rq_state(struct request *rq)
+{
+	return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
+}
+
+/**
+ * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
+ * @rq: target request.
+ * @state: new state to set.
+ *
+ * Set @rq's state to @state.  The caller is responsible for ensuring that
+ * there are no other updaters.  A request can transition into IN_FLIGHT
+ * only from IDLE and doing so increments the generation number.
+ */
+static inline void blk_mq_rq_update_state(struct request *rq,
+					  enum mq_rq_state state)
+{
+	u64 old_val = READ_ONCE(rq->gstate);
+	u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
+
+	if (state == MQ_RQ_IN_FLIGHT) {
+		WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
+		new_val += MQ_RQ_GEN_INC;
+	}
+
+	/* avoid exposing interim values */
+	WRITE_ONCE(rq->gstate, new_val);
+}
+
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
 					   unsigned int cpu)
 {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 870484eaed1f..cbea895a5547 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = {
 	.release	= blk_release_queue,
 };
 
+/**
+ * blk_register_queue - register a block layer queue with sysfs
+ * @disk: Disk of which the request queue should be registered with sysfs.
+ */
 int blk_register_queue(struct gendisk *disk)
 {
 	int ret;
@@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk)
 	if (q->request_fn || (q->mq_ops && q->elevator)) {
 		ret = elv_register_queue(q);
 		if (ret) {
+			mutex_unlock(&q->sysfs_lock);
 			kobject_uevent(&q->kobj, KOBJ_REMOVE);
 			kobject_del(&q->kobj);
 			blk_trace_remove_sysfs(dev);
 			kobject_put(&dev->kobj);
-			goto unlock;
+			return ret;
 		}
 	}
 	ret = 0;
@@ -921,7 +926,15 @@ unlock:
 	mutex_unlock(&q->sysfs_lock);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(blk_register_queue);
 
+/**
+ * blk_unregister_queue - counterpart of blk_register_queue()
+ * @disk: Disk of which the request queue should be unregistered from sysfs.
+ *
+ * Note: the caller is responsible for guaranteeing that this function is called
+ * after blk_register_queue() has finished.
+ */
 void blk_unregister_queue(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
@@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return;
 
-	mutex_lock(&q->sysfs_lock);
-	queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
-	mutex_unlock(&q->sysfs_lock);
+	/* Return early if disk->queue was never registered. */
+	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+		return;
 
-	wbt_exit(q);
+	/*
+	 * Since sysfs_remove_dir() prevents adding new directory entries
+	 * before removal of existing entries starts, protect against
+	 * concurrent elv_iosched_store() calls.
+	 */
+	mutex_lock(&q->sysfs_lock);
 
+	spin_lock_irq(q->queue_lock);
+	queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
+	spin_unlock_irq(q->queue_lock);
 
+	/*
+	 * Remove the sysfs attributes before unregistering the queue data
+	 * structures that can be modified through sysfs.
+	 */
 	if (q->mq_ops)
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
-
-	if (q->request_fn || (q->mq_ops && q->elevator))
-		elv_unregister_queue(q);
+	mutex_unlock(&q->sysfs_lock);
 
 	kobject_uevent(&q->kobj, KOBJ_REMOVE);
 	kobject_del(&q->kobj);
 	blk_trace_remove_sysfs(disk_to_dev(disk));
+
+	wbt_exit(q);
+
+	mutex_lock(&q->sysfs_lock);
+	if (q->request_fn || (q->mq_ops && q->elevator))
+		elv_unregister_queue(q);
+	mutex_unlock(&q->sysfs_lock);
+
 	kobject_put(&disk_to_dev(disk)->kobj);
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 825bc29767e6..c5a131673733 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -216,9 +216,9 @@ struct throtl_data
 
 	unsigned int scale;
 
-	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
-	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
-	struct latency_bucket __percpu *latency_buckets;
+	struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+	struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+	struct latency_bucket __percpu *latency_buckets[2];
 	unsigned long last_calculate_time;
 	unsigned long filtered_latency;
 
@@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = {
 		.seq_show = blkg_print_stat_bytes,
 	},
 	{
+		.name = "throttle.io_service_bytes_recursive",
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_bytes_recursive,
+	},
+	{
 		.name = "throttle.io_serviced",
 		.private = (unsigned long)&blkcg_policy_throtl,
 		.seq_show = blkg_print_stat_ios,
 	},
+	{
+		.name = "throttle.io_serviced_recursive",
+		.private = (unsigned long)&blkcg_policy_throtl,
+		.seq_show = blkg_print_stat_ios_recursive,
+	},
 	{ }	/* terminate */
 };
 
@@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static void throtl_update_latency_buckets(struct throtl_data *td)
 {
-	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
-	int i, cpu;
-	unsigned long last_latency = 0;
-	unsigned long latency;
+	struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+	int i, cpu, rw;
+	unsigned long last_latency[2] = { 0 };
+	unsigned long latency[2];
 
 	if (!blk_queue_nonrot(td->queue))
 		return;
@@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
 	td->last_calculate_time = jiffies;
 
 	memset(avg_latency, 0, sizeof(avg_latency));
-	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-		struct latency_bucket *tmp = &td->tmp_buckets[i];
-
-		for_each_possible_cpu(cpu) {
-			struct latency_bucket *bucket;
-
-			/* this isn't race free, but ok in practice */
-			bucket = per_cpu_ptr(td->latency_buckets, cpu);
-			tmp->total_latency += bucket[i].total_latency;
-			tmp->samples += bucket[i].samples;
-			bucket[i].total_latency = 0;
-			bucket[i].samples = 0;
-		}
+	for (rw = READ; rw <= WRITE; rw++) {
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+			struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+
+			for_each_possible_cpu(cpu) {
+				struct latency_bucket *bucket;
+
+				/* this isn't race free, but ok in practice */
+				bucket = per_cpu_ptr(td->latency_buckets[rw],
+					cpu);
+				tmp->total_latency += bucket[i].total_latency;
+				tmp->samples += bucket[i].samples;
+				bucket[i].total_latency = 0;
+				bucket[i].samples = 0;
+			}
 
-		if (tmp->samples >= 32) {
-			int samples = tmp->samples;
+			if (tmp->samples >= 32) {
+				int samples = tmp->samples;
 
-			latency = tmp->total_latency;
+				latency[rw] = tmp->total_latency;
 
-			tmp->total_latency = 0;
-			tmp->samples = 0;
-			latency /= samples;
-			if (latency == 0)
-				continue;
-			avg_latency[i].latency = latency;
+				tmp->total_latency = 0;
+				tmp->samples = 0;
+				latency[rw] /= samples;
+				if (latency[rw] == 0)
+					continue;
+				avg_latency[rw][i].latency = latency[rw];
+			}
 		}
 	}
 
-	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-		if (!avg_latency[i].latency) {
-			if (td->avg_buckets[i].latency < last_latency)
-				td->avg_buckets[i].latency = last_latency;
-			continue;
-		}
+	for (rw = READ; rw <= WRITE; rw++) {
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+			if (!avg_latency[rw][i].latency) {
+				if (td->avg_buckets[rw][i].latency < last_latency[rw])
+					td->avg_buckets[rw][i].latency =
+						last_latency[rw];
+				continue;
+			}
 
-		if (!td->avg_buckets[i].valid)
-			latency = avg_latency[i].latency;
-		else
-			latency = (td->avg_buckets[i].latency * 7 +
-				avg_latency[i].latency) >> 3;
+			if (!td->avg_buckets[rw][i].valid)
+				latency[rw] = avg_latency[rw][i].latency;
+			else
+				latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+					avg_latency[rw][i].latency) >> 3;
 
-		td->avg_buckets[i].latency = max(latency, last_latency);
-		td->avg_buckets[i].valid = true;
-		last_latency = td->avg_buckets[i].latency;
+			td->avg_buckets[rw][i].latency = max(latency[rw],
+				last_latency[rw]);
+			td->avg_buckets[rw][i].valid = true;
+			last_latency[rw] = td->avg_buckets[rw][i].latency;
+		}
 	}
 
 	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
 		throtl_log(&td->service_queue,
-			"Latency bucket %d: latency=%ld, valid=%d", i,
-			td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+			"Latency bucket %d: read latency=%ld, read valid=%d, "
+			"write latency=%ld, write valid=%d", i,
+			td->avg_buckets[READ][i].latency,
+			td->avg_buckets[READ][i].valid,
+			td->avg_buckets[WRITE][i].latency,
+			td->avg_buckets[WRITE][i].valid);
 }
 #else
 static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2226,13 +2247,7 @@ again:
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
 out:
-	/*
-	 * As multiple blk-throtls may stack in the same issue path, we
-	 * don't want bios to leave with the flag set.  Clear the flag if
-	 * being issued.
-	 */
-	if (!throttled)
-		bio_clear_flag(bio, BIO_THROTTLED);
+	bio_set_flag(bio, BIO_THROTTLED);
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (throttled || !td->track_bio_latency)
@@ -2248,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
 	struct latency_bucket *latency;
 	int index;
 
-	if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+	if (!td || td->limit_index != LIMIT_LOW ||
+	    !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
 	    !blk_queue_nonrot(td->queue))
 		return;
 
 	index = request_bucket_index(size);
 
-	latency = get_cpu_ptr(td->latency_buckets);
+	latency = get_cpu_ptr(td->latency_buckets[op]);
 	latency[index].total_latency += time;
 	latency[index].samples++;
-	put_cpu_ptr(td->latency_buckets);
+	put_cpu_ptr(td->latency_buckets[op]);
 }
 
 void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2276,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio)
 	unsigned long finish_time;
 	unsigned long start_time;
 	unsigned long lat;
+	int rw = bio_data_dir(bio);
 
 	tg = bio->bi_cg_private;
 	if (!tg)
@@ -2304,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio)
 
 		bucket = request_bucket_index(
 			blk_stat_size(&bio->bi_issue_stat));
-		threshold = tg->td->avg_buckets[bucket].latency +
+		threshold = tg->td->avg_buckets[rw][bucket].latency +
 			tg->latency_target;
 		if (lat > threshold)
 			tg->bad_bio_cnt++;
@@ -2397,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q)
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
 		return -ENOMEM;
-	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+	td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
+		LATENCY_BUCKET_SIZE, __alignof__(u64));
+	if (!td->latency_buckets[READ]) {
+		kfree(td);
+		return -ENOMEM;
+	}
+	td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
 		LATENCY_BUCKET_SIZE, __alignof__(u64));
-	if (!td->latency_buckets) {
+	if (!td->latency_buckets[WRITE]) {
+		free_percpu(td->latency_buckets[READ]);
 		kfree(td);
 		return -ENOMEM;
 	}
@@ -2418,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q)
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret) {
-		free_percpu(td->latency_buckets);
+		free_percpu(td->latency_buckets[READ]);
+		free_percpu(td->latency_buckets[WRITE]);
 		kfree(td);
 	}
 	return ret;
@@ -2429,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q)
 	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
 	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
-	free_percpu(q->td->latency_buckets);
+	free_percpu(q->td->latency_buckets[READ]);
+	free_percpu(q->td->latency_buckets[WRITE]);
 	kfree(q->td);
 }
 
@@ -2447,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q)
 	} else {
 		td->throtl_slice = DFL_THROTL_SLICE_HD;
 		td->filtered_latency = LATENCY_FILTERED_HD;
-		for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
-			td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+			td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
+			td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
+		}
 	}
 #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
 	/* if no low limit, use previous default */
 	td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
 
-	td->track_bio_latency = !q->mq_ops && !q->request_fn;
+	td->track_bio_latency = !queue_is_rq_based(q);
 	if (!td->track_bio_latency)
 		blk_stat_enable_accounting(q);
 }
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 764ecf9aeb30..a05e3676d24a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req)
 static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
 			  unsigned int *next_set)
 {
-	if (time_after_eq(jiffies, rq->deadline)) {
+	const unsigned long deadline = blk_rq_deadline(rq);
+
+	if (time_after_eq(jiffies, deadline)) {
 		list_del_init(&rq->timeout_list);
 
 		/*
@@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout
 		 */
 		if (!blk_mark_rq_complete(rq))
 			blk_rq_timed_out(rq);
-	} else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
-		*next_timeout = rq->deadline;
+	} else if (!*next_set || time_after(*next_timeout, deadline)) {
+		*next_timeout = deadline;
 		*next_set = 1;
 	}
 }
@@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work)
  */
 void blk_abort_request(struct request *req)
 {
-	if (blk_mark_rq_complete(req))
-		return;
-
 	if (req->q->mq_ops) {
-		blk_mq_rq_timed_out(req, false);
+		/*
+		 * All we need to ensure is that timeout scan takes place
+		 * immediately and that scan sees the new timeout value.
+		 * No need for fancy synchronizations.
+		 */
+		blk_rq_set_deadline(req, jiffies);
+		mod_timer(&req->q->timeout, 0);
 	} else {
+		if (blk_mark_rq_complete(req))
+			return;
 		blk_delete_timer(req);
 		blk_rq_timed_out(req);
 	}
@@ -208,7 +215,8 @@ void blk_add_timer(struct request *req)
 	if (!req->timeout)
 		req->timeout = q->rq_timeout;
 
-	WRITE_ONCE(req->deadline, jiffies + req->timeout);
+	blk_rq_set_deadline(req, jiffies + req->timeout);
+	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.
@@ -222,7 +230,7 @@ void blk_add_timer(struct request *req)
 	 * than an existing one, modify the timer. Round up to next nearest
 	 * second.
 	 */
-	expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
+	expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
 
 	if (!timer_pending(&q->timeout) ||
 	    time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index ae8de9780085..f92fc84b5e2c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-	return rq_data_dir(rq);
+	const int op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+		return WRITE;
+
+	/* don't account */
+	return -1;
 }
 
 int wbt_init(struct request_queue *q)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ff57fb51b338..acb7252c7e81 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue *q,
 }
 
 /*
+ * Return true if a request is a write request that needs zone write locking.
+ */
+bool blk_req_needs_zone_write_lock(struct request *rq)
+{
+	if (!rq->q->seq_zones_wlock)
+		return false;
+
+	if (blk_rq_is_passthrough(rq))
+		return false;
+
+	switch (req_op(rq)) {
+	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_WRITE_SAME:
+	case REQ_OP_WRITE:
+		return blk_rq_zone_is_seq(rq);
+	default:
+		return false;
+	}
+}
+EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
+
+void __blk_req_zone_write_lock(struct request *rq)
+{
+	if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
+					  rq->q->seq_zones_wlock)))
+		return;
+
+	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
+	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
+
+void __blk_req_zone_write_unlock(struct request *rq)
+{
+	rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
+	if (rq->q->seq_zones_wlock)
+		WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
+						 rq->q->seq_zones_wlock));
+}
+EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
+
+/*
  * Check that a zone report belongs to the partition.
  * If yes, fix its start sector and write pointer, copy it in the
  * zone information array and return true. Return false otherwise.
diff --git a/block/blk.h b/block/blk.h
index 3f1446937aec..46db5dc83dcb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -120,33 +120,23 @@ void blk_account_io_completion(struct request *req, unsigned int bytes);
 void blk_account_io_done(struct request *req);
 
 /*
- * Internal atomic flags for request handling
- */
-enum rq_atomic_flags {
-	/*
-	 * Keep these two bits first - not because we depend on the
-	 * value of them, but we do depend on them being in the same
-	 * byte of storage to ensure ordering on writes. Keeping them
-	 * first will achieve that nicely.
-	 */
-	REQ_ATOM_COMPLETE = 0,
-	REQ_ATOM_STARTED,
-
-	REQ_ATOM_POLL_SLEPT,
-};
-
-/*
  * EH timer and IO completion will both attempt to 'grab' the request, make
- * sure that only one of them succeeds
+ * sure that only one of them succeeds. Steal the bottom bit of the
+ * __deadline field for this.
  */
 static inline int blk_mark_rq_complete(struct request *rq)
 {
-	return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+	return test_and_set_bit(0, &rq->__deadline);
 }
 
 static inline void blk_clear_rq_complete(struct request *rq)
 {
-	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+	clear_bit(0, &rq->__deadline);
+}
+
+static inline bool blk_rq_is_complete(struct request *rq)
+{
+	return test_bit(0, &rq->__deadline);
 }
 
 /*
@@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
 		e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
 }
 
+int elv_register_queue(struct request_queue *q);
+void elv_unregister_queue(struct request_queue *q);
+
 struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -246,6 +239,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 }
 
 /*
+ * Steal a bit from this field for legacy IO path atomic IO marking. Note that
+ * setting the deadline clears the bottom bit, potentially clearing the
+ * completed bit. The user has to be OK with this (current ones are fine).
+ */
+static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
+{
+	rq->__deadline = time & ~0x1UL;
+}
+
+static inline unsigned long blk_rq_deadline(struct request *rq)
+{
+	return rq->__deadline & ~0x1UL;
+}
+
+/*
  * Internal io_context interface
  */
 void get_io_context(struct io_context *ioc);
@@ -330,4 +338,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 }
 #endif /* CONFIG_BOUNCE */
 
+extern void blk_drain_queue(struct request_queue *q);
+
 #endif /* BLK_INTERNAL_H */
diff --git a/block/bounce.c b/block/bounce.c
index fceb1a96480b..6a3e68292273 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -113,45 +113,50 @@ int init_emergency_isa_pool(void)
 static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
 {
 	unsigned char *vfrom;
-	struct bio_vec tovec, *fromvec = from->bi_io_vec;
+	struct bio_vec tovec, fromvec;
 	struct bvec_iter iter;
+	/*
+	 * The bio of @from is created by bounce, so we can iterate
+	 * its bvec from start to end, but the @from->bi_iter can't be
+	 * trusted because it might be changed by splitting.
+	 */
+	struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
 
 	bio_for_each_segment(tovec, to, iter) {
-		if (tovec.bv_page != fromvec->bv_page) {
+		fromvec = bio_iter_iovec(from, from_iter);
+		if (tovec.bv_page != fromvec.bv_page) {
 			/*
 			 * fromvec->bv_offset and fromvec->bv_len might have
 			 * been modified by the block layer, so use the original
 			 * copy, bounce_copy_vec already uses tovec->bv_len
 			 */
-			vfrom = page_address(fromvec->bv_page) +
+			vfrom = page_address(fromvec.bv_page) +
 				tovec.bv_offset;
 
 			bounce_copy_vec(&tovec, vfrom);
 			flush_dcache_page(tovec.bv_page);
 		}
-
-		fromvec++;
+		bio_advance_iter(from, &from_iter, tovec.bv_len);
 	}
 }
 
 static void bounce_end_io(struct bio *bio, mempool_t *pool)
 {
 	struct bio *bio_orig = bio->bi_private;
-	struct bio_vec *bvec, *org_vec;
+	struct bio_vec *bvec, orig_vec;
 	int i;
-	int start = bio_orig->bi_iter.bi_idx;
+	struct bvec_iter orig_iter = bio_orig->bi_iter;
 
 	/*
 	 * free up bounce indirect pages used
 	 */
 	bio_for_each_segment_all(bvec, bio, i) {
-		org_vec = bio_orig->bi_io_vec + i + start;
-
-		if (bvec->bv_page == org_vec->bv_page)
-			continue;
-
-		dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
-		mempool_free(bvec->bv_page, pool);
+		orig_vec = bio_iter_iovec(bio_orig, orig_iter);
+		if (bvec->bv_page != orig_vec.bv_page) {
+			dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+			mempool_free(bvec->bv_page, pool);
+		}
+		bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
 	}
 
 	bio_orig->bi_status = bio->bi_status;
@@ -200,6 +205,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	unsigned i = 0;
 	bool bounce = false;
 	int sectors = 0;
+	bool passthrough = bio_is_passthrough(*bio_orig);
 
 	bio_for_each_segment(from, *bio_orig, iter) {
 		if (i++ < BIO_MAX_PAGES)
@@ -210,13 +216,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	if (!bounce)
 		return;
 
-	if (sectors < bio_sectors(*bio_orig)) {
+	if (!passthrough && sectors < bio_sectors(*bio_orig)) {
 		bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
 		bio_chain(bio, *bio_orig);
 		generic_make_request(*bio_orig);
 		*bio_orig = bio;
 	}
-	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
+	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+			bounce_bio_set);
 
 	bio_for_each_segment_all(to, bio, i) {
 		struct page *page = to->bv_page;
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 15d25ccd51a5..1474153f73e3 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -30,7 +30,7 @@
 
 /**
  * bsg_teardown_job - routine to teardown a bsg job
- * @job: bsg_job that is to be torn down
+ * @kref: kref inside bsg_job that is to be torn down
  */
 static void bsg_teardown_job(struct kref *kref)
 {
@@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
  * @name: device to give bsg device
  * @job_fn: bsg job handler
  * @dd_job_size: size of LLD data needed for each job
+ * @release: @dev release function
  */
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 		bsg_job_fn *job_fn, int dd_job_size,
diff --git a/block/bsg.c b/block/bsg.c
index 452f94f1c5d4..06dc96e1f670 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -32,6 +32,9 @@
 #define BSG_DESCRIPTION	"Block layer SCSI generic (bsg) driver"
 #define BSG_VERSION	"0.4"
 
+#define bsg_dbg(bd, fmt, ...) \
+	pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__)
+
 struct bsg_device {
 	struct request_queue *queue;
 	spinlock_t lock;
@@ -55,14 +58,6 @@ enum {
 #define BSG_DEFAULT_CMDS	64
 #define BSG_MAX_DEVS		32768
 
-#undef BSG_DEBUG
-
-#ifdef BSG_DEBUG
-#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args)
-#else
-#define dprintk(fmt, args...)
-#endif
-
 static DEFINE_MUTEX(bsg_mutex);
 static DEFINE_IDR(bsg_minor_idr);
 
@@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd)
 
 	bc->bd = bd;
 	INIT_LIST_HEAD(&bc->list);
-	dprintk("%s: returning free cmd %p\n", bd->name, bc);
+	bsg_dbg(bd, "returning free cmd %p\n", bc);
 	return bc;
 out:
 	spin_unlock_irq(&bd->lock);
@@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode)
 	if (!bcd->class_dev)
 		return ERR_PTR(-ENXIO);
 
-	dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp,
+	bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n",
+		(unsigned long long) hdr->dout_xferp,
 		hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp,
 		hdr->din_xfer_len);
 
@@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status)
 	struct bsg_device *bd = bc->bd;
 	unsigned long flags;
 
-	dprintk("%s: finished rq %p bc %p, bio %p\n",
-		bd->name, rq, bc, bc->bio);
+	bsg_dbg(bd, "finished rq %p bc %p, bio %p\n",
+		rq, bc, bc->bio);
 
 	bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
 
@@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q,
 	list_add_tail(&bc->list, &bd->busy_list);
 	spin_unlock_irq(&bd->lock);
 
-	dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc);
+	bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc);
 
 	rq->end_io_data = bc;
 	blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io);
@@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd)
 		}
 	} while (1);
 
-	dprintk("%s: returning done %p\n", bd->name, bc);
+	bsg_dbg(bd, "returning done %p\n", bc);
 
 	return bc;
 }
@@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	struct scsi_request *req = scsi_req(rq);
 	int ret = 0;
 
-	dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
+	pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result);
 	/*
 	 * fill in all the output members
 	 */
@@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
 	struct bsg_command *bc;
 	int ret, tret;
 
-	dprintk("%s: entered\n", bd->name);
+	bsg_dbg(bd, "entered\n");
 
 	/*
 	 * wait for all commands to complete
@@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	int ret;
 	ssize_t bytes_read;
 
-	dprintk("%s: read %zd bytes\n", bd->name, count);
+	bsg_dbg(bd, "read %zd bytes\n", count);
 
 	bsg_set_block(bd, file);
 
@@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 	ssize_t bytes_written;
 	int ret;
 
-	dprintk("%s: write %zd bytes\n", bd->name, count);
+	bsg_dbg(bd, "write %zd bytes\n", count);
 
 	if (unlikely(uaccess_kernel()))
 		return -EINVAL;
@@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 	if (!bytes_written || err_block_err(ret))
 		bytes_written = ret;
 
-	dprintk("%s: returning %zd\n", bd->name, bytes_written);
+	bsg_dbg(bd, "returning %zd\n", bytes_written);
 	return bytes_written;
 }
 
@@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd)
 	hlist_del(&bd->dev_list);
 	mutex_unlock(&bsg_mutex);
 
-	dprintk("%s: tearing down\n", bd->name);
+	bsg_dbg(bd, "tearing down\n");
 
 	/*
 	 * close can always block
@@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 					 struct file *file)
 {
 	struct bsg_device *bd;
-#ifdef BSG_DEBUG
 	unsigned char buf[32];
-#endif
 
 	if (!blk_queue_scsi_passthrough(rq)) {
 		WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
@@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 	hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
 
 	strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
-	dprintk("bound to <%s>, max queue %d\n",
+	bsg_dbg(bd, "bound to <%s>, max queue %d\n",
 		format_dev_t(buf, inode->i_rdev), bd->max_queue);
 
 	mutex_unlock(&bsg_mutex);
@@ -845,19 +839,19 @@ static int bsg_release(struct inode *inode, struct file *file)
 	return bsg_put_device(bd);
 }
 
-static unsigned int bsg_poll(struct file *file, poll_table *wait)
+static __poll_t bsg_poll(struct file *file, poll_table *wait)
 {
 	struct bsg_device *bd = file->private_data;
-	unsigned int mask = 0;
+	__poll_t mask = 0;
 
 	poll_wait(file, &bd->wq_done, wait);
 	poll_wait(file, &bd->wq_free, wait);
 
 	spin_lock_irq(&bd->lock);
 	if (!list_empty(&bd->done_list))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (bd->queued_cmds < bd->max_queue)
-		mask |= POLLOUT;
+		mask |= EPOLLOUT;
 	spin_unlock_irq(&bd->lock);
 
 	return mask;
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index b83f77460d28..9de9f156e203 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -50,8 +50,6 @@ struct deadline_data {
 	int front_merges;
 };
 
-static void deadline_move_request(struct deadline_data *, struct request *);
-
 static inline struct rb_root *
 deadline_rb_root(struct deadline_data *dd, struct request *rq)
 {
@@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq)
 	struct deadline_data *dd = q->elevator->elevator_data;
 	const int data_dir = rq_data_dir(rq);
 
+	/*
+	 * This may be a requeue of a write request that has locked its
+	 * target zone. If that is the case, this releases the zone lock.
+	 */
+	blk_req_zone_write_unlock(rq);
+
 	deadline_add_rq_rb(dd, rq);
 
 	/*
@@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
+	/*
+	 * For a zoned block device, write requests must write lock their
+	 * target zone.
+	 */
+	blk_req_zone_write_lock(rq);
+
 	deadline_remove_request(q, rq);
 	elv_dispatch_add_tail(q, rq);
 }
@@ -231,6 +241,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
 }
 
 /*
+ * For the specified data direction, return the next request to dispatch using
+ * arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+	struct request *rq;
+
+	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+		return NULL;
+
+	if (list_empty(&dd->fifo_list[data_dir]))
+		return NULL;
+
+	rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+	if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+		return rq;
+
+	/*
+	 * Look for a write request that can be dispatched, that is one with
+	 * an unlocked target zone.
+	 */
+	list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+		if (blk_req_can_dispatch_to_zone(rq))
+			return rq;
+	}
+
+	return NULL;
+}
+
+/*
+ * For the specified data direction, return the next request to dispatch using
+ * sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+	struct request *rq;
+
+	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+		return NULL;
+
+	rq = dd->next_rq[data_dir];
+	if (!rq)
+		return NULL;
+
+	if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+		return rq;
+
+	/*
+	 * Look for a write request that can be dispatched, that is one with
+	 * an unlocked target zone.
+	 */
+	while (rq) {
+		if (blk_req_can_dispatch_to_zone(rq))
+			return rq;
+		rq = deadline_latter_request(rq);
+	}
+
+	return NULL;
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
@@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
 	struct deadline_data *dd = q->elevator->elevator_data;
 	const int reads = !list_empty(&dd->fifo_list[READ]);
 	const int writes = !list_empty(&dd->fifo_list[WRITE]);
-	struct request *rq;
+	struct request *rq, *next_rq;
 	int data_dir;
 
 	/*
 	 * batches are currently reads XOR writes
 	 */
-	if (dd->next_rq[WRITE])
-		rq = dd->next_rq[WRITE];
-	else
-		rq = dd->next_rq[READ];
+	rq = deadline_next_request(dd, WRITE);
+	if (!rq)
+		rq = deadline_next_request(dd, READ);
 
 	if (rq && dd->batching < dd->fifo_batch)
 		/* we have a next request are still entitled to batch */
@@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
 	if (reads) {
 		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
-		if (writes && (dd->starved++ >= dd->writes_starved))
+		if (deadline_fifo_request(dd, WRITE) &&
+		    (dd->starved++ >= dd->writes_starved))
 			goto dispatch_writes;
 
 		data_dir = READ;
@@ -291,21 +364,29 @@ dispatch_find_request:
 	/*
 	 * we are not running a batch, find best request for selected data_dir
 	 */
-	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+	next_rq = deadline_next_request(dd, data_dir);
+	if (deadline_check_fifo(dd, data_dir) || !next_rq) {
 		/*
 		 * A deadline has expired, the last request was in the other
 		 * direction, or we have run out of higher-sectored requests.
 		 * Start again from the request with the earliest expiry time.
 		 */
-		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+		rq = deadline_fifo_request(dd, data_dir);
 	} else {
 		/*
 		 * The last req was the same dir and we have a next request in
 		 * sort order. No expired requests so continue on from here.
 		 */
-		rq = dd->next_rq[data_dir];
+		rq = next_rq;
 	}
 
+	/*
+	 * For a zoned block device, if we only have writes queued and none of
+	 * them can be dispatched, rq will be NULL.
+	 */
+	if (!rq)
+		return 0;
+
 	dd->batching = 0;
 
 dispatch_request:
@@ -318,6 +399,16 @@ dispatch_request:
 	return 1;
 }
 
+/*
+ * For zoned block devices, write unlock the target zone of completed
+ * write requests.
+ */
+static void
+deadline_completed_request(struct request_queue *q, struct request *rq)
+{
+	blk_req_zone_write_unlock(rq);
+}
+
 static void deadline_exit_queue(struct elevator_queue *e)
 {
 	struct deadline_data *dd = e->elevator_data;
@@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = {
 		.elevator_merged_fn =		deadline_merged_request,
 		.elevator_merge_req_fn =	deadline_merged_requests,
 		.elevator_dispatch_fn =		deadline_dispatch_requests,
+		.elevator_completed_req_fn =	deadline_completed_request,
 		.elevator_add_req_fn =		deadline_add_request,
 		.elevator_former_req_fn =	elv_rb_former_request,
 		.elevator_latter_req_fn =	elv_rb_latter_request,
diff --git a/block/elevator.c b/block/elevator.c
index 7bda083d5968..e87e9b43aba0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q)
 	struct elevator_queue *e = q->elevator;
 	int error;
 
+	lockdep_assert_held(&q->sysfs_lock);
+
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
 	if (!error) {
 		struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q)
 	}
 	return error;
 }
-EXPORT_SYMBOL(elv_register_queue);
 
 void elv_unregister_queue(struct request_queue *q)
 {
+	lockdep_assert_held(&q->sysfs_lock);
+
 	if (q) {
 		struct elevator_queue *e = q->elevator;
 
@@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q)
 		wbt_enable_default(q);
 	}
 }
-EXPORT_SYMBOL(elv_unregister_queue);
 
 int elv_register(struct elevator_type *e)
 {
@@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q,
 {
 	int ret;
 
+	lockdep_assert_held(&q->sysfs_lock);
+
 	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
 
 	if (q->elevator) {
 		if (q->elevator->registered)
@@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q,
 		blk_add_trace_msg(q, "elv switch: none");
 
 out:
+	blk_mq_unquiesce_queue(q);
 	blk_mq_unfreeze_queue(q);
 	return ret;
 }
@@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	bool old_registered = false;
 	int err;
 
+	lockdep_assert_held(&q->sysfs_lock);
+
 	if (q->mq_ops)
 		return elevator_switch_mq(q, new_e);
 
diff --git a/block/genhd.c b/block/genhd.c
index 96a66f671720..88a53c188cb7 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -629,16 +629,18 @@ exit:
 }
 
 /**
- * device_add_disk - add partitioning information to kernel list
+ * __device_add_disk - add disk information to kernel list
  * @parent: parent device for the disk
  * @disk: per-device partitioning information
+ * @register_queue: register the queue if set to true
  *
  * This function registers the partitioning information in @disk
  * with the kernel.
  *
  * FIXME: error handling
  */
-void device_add_disk(struct device *parent, struct gendisk *disk)
+static void __device_add_disk(struct device *parent, struct gendisk *disk,
+			      bool register_queue)
 {
 	dev_t devt;
 	int retval;
@@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 				    exact_match, exact_lock, disk);
 	}
 	register_disk(parent, disk);
-	blk_register_queue(disk);
+	if (register_queue)
+		blk_register_queue(disk);
 
 	/*
 	 * Take an extra ref on queue which will be put on disk_release()
@@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 	disk_add_events(disk);
 	blk_integrity_add(disk);
 }
+
+void device_add_disk(struct device *parent, struct gendisk *disk)
+{
+	__device_add_disk(parent, disk, true);
+}
 EXPORT_SYMBOL(device_add_disk);
 
+void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
+{
+	__device_add_disk(parent, disk, false);
+}
+EXPORT_SYMBOL(device_add_disk_no_queue_reg);
+
 void del_gendisk(struct gendisk *disk)
 {
 	struct disk_part_iter piter;
@@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk)
 		 * Unregister bdi before releasing device numbers (as they can
 		 * get reused and we'd get clashes in sysfs).
 		 */
-		bdi_unregister(disk->queue->backing_dev_info);
+		if (!(disk->flags & GENHD_FL_HIDDEN))
+			bdi_unregister(disk->queue->backing_dev_info);
 		blk_unregister_queue(disk);
 	} else {
 		WARN_ON(1);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b4df317c2916..f95c60774ce8 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -100,9 +100,13 @@ struct kyber_hctx_data {
 	unsigned int cur_domain;
 	unsigned int batching;
 	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
+	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
 	atomic_t wait_index[KYBER_NUM_DOMAINS];
 };
 
+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
+			     void *key);
+
 static int rq_sched_domain(const struct request *rq)
 {
 	unsigned int op = rq->cmd_flags;
@@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		INIT_LIST_HEAD(&khd->rqs[i]);
+		init_waitqueue_func_entry(&khd->domain_wait[i],
+					  kyber_domain_wake);
+		khd->domain_wait[i].private = hctx;
 		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 		atomic_set(&khd->wait_index[i], 0);
 	}
@@ -524,35 +531,39 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 	int nr;
 
 	nr = __sbitmap_queue_get(domain_tokens);
-	if (nr >= 0)
-		return nr;
 
 	/*
 	 * If we failed to get a domain token, make sure the hardware queue is
 	 * run when one becomes available. Note that this is serialized on
 	 * khd->lock, but we still need to be careful about the waker.
 	 */
-	if (list_empty_careful(&wait->entry)) {
-		init_waitqueue_func_entry(wait, kyber_domain_wake);
-		wait->private = hctx;
+	if (nr < 0 && list_empty_careful(&wait->entry)) {
 		ws = sbq_wait_ptr(domain_tokens,
 				  &khd->wait_index[sched_domain]);
+		khd->domain_ws[sched_domain] = ws;
 		add_wait_queue(&ws->wait, wait);
 
 		/*
 		 * Try again in case a token was freed before we got on the wait
-		 * queue. The waker may have already removed the entry from the
-		 * wait queue, but list_del_init() is okay with that.
+		 * queue.
 		 */
 		nr = __sbitmap_queue_get(domain_tokens);
-		if (nr >= 0) {
-			unsigned long flags;
+	}
 
-			spin_lock_irqsave(&ws->wait.lock, flags);
-			list_del_init(&wait->entry);
-			spin_unlock_irqrestore(&ws->wait.lock, flags);
-		}
+	/*
+	 * If we got a token while we were on the wait queue, remove ourselves
+	 * from the wait queue to ensure that all wake ups make forward
+	 * progress. It's possible that the waker already deleted the entry
+	 * between the !list_empty_careful() check and us grabbing the lock, but
+	 * list_del_init() is okay with that.
+	 */
+	if (nr >= 0 && !list_empty_careful(&wait->entry)) {
+		ws = khd->domain_ws[sched_domain];
+		spin_lock_irq(&ws->wait.lock);
+		list_del_init(&wait->entry);
+		spin_unlock_irq(&ws->wait.lock);
 	}
+
 	return nr;
 }
 
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 0179e484ec98..c56f211c8440 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -59,6 +59,7 @@ struct deadline_data {
 	int front_merges;
 
 	spinlock_t lock;
+	spinlock_t zone_lock;
 	struct list_head dispatch;
 };
 
@@ -192,13 +193,83 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
 }
 
 /*
+ * For the specified data direction, return the next request to
+ * dispatch using arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, int data_dir)
+{
+	struct request *rq;
+	unsigned long flags;
+
+	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+		return NULL;
+
+	if (list_empty(&dd->fifo_list[data_dir]))
+		return NULL;
+
+	rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+	if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+		return rq;
+
+	/*
+	 * Look for a write request that can be dispatched, that is one with
+	 * an unlocked target zone.
+	 */
+	spin_lock_irqsave(&dd->zone_lock, flags);
+	list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
+		if (blk_req_can_dispatch_to_zone(rq))
+			goto out;
+	}
+	rq = NULL;
+out:
+	spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+	return rq;
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, int data_dir)
+{
+	struct request *rq;
+	unsigned long flags;
+
+	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
+		return NULL;
+
+	rq = dd->next_rq[data_dir];
+	if (!rq)
+		return NULL;
+
+	if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+		return rq;
+
+	/*
+	 * Look for a write request that can be dispatched, that is one with
+	 * an unlocked target zone.
+	 */
+	spin_lock_irqsave(&dd->zone_lock, flags);
+	while (rq) {
+		if (blk_req_can_dispatch_to_zone(rq))
+			break;
+		rq = deadline_latter_request(rq);
+	}
+	spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+	return rq;
+}
+
+/*
  * deadline_dispatch_requests selects the best request according to
  * read/write expire, fifo_batch, etc
  */
-static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+static struct request *__dd_dispatch_request(struct deadline_data *dd)
 {
-	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-	struct request *rq;
+	struct request *rq, *next_rq;
 	bool reads, writes;
 	int data_dir;
 
@@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	/*
 	 * batches are currently reads XOR writes
 	 */
-	if (dd->next_rq[WRITE])
-		rq = dd->next_rq[WRITE];
-	else
-		rq = dd->next_rq[READ];
+	rq = deadline_next_request(dd, WRITE);
+	if (!rq)
+		rq = deadline_next_request(dd, READ);
 
 	if (rq && dd->batching < dd->fifo_batch)
 		/* we have a next request are still entitled to batch */
@@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	if (reads) {
 		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
 
-		if (writes && (dd->starved++ >= dd->writes_starved))
+		if (deadline_fifo_request(dd, WRITE) &&
+		    (dd->starved++ >= dd->writes_starved))
 			goto dispatch_writes;
 
 		data_dir = READ;
@@ -260,21 +331,29 @@ dispatch_find_request:
 	/*
 	 * we are not running a batch, find best request for selected data_dir
 	 */
-	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+	next_rq = deadline_next_request(dd, data_dir);
+	if (deadline_check_fifo(dd, data_dir) || !next_rq) {
 		/*
 		 * A deadline has expired, the last request was in the other
 		 * direction, or we have run out of higher-sectored requests.
 		 * Start again from the request with the earliest expiry time.
 		 */
-		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+		rq = deadline_fifo_request(dd, data_dir);
 	} else {
 		/*
 		 * The last req was the same dir and we have a next request in
 		 * sort order. No expired requests so continue on from here.
 		 */
-		rq = dd->next_rq[data_dir];
+		rq = next_rq;
 	}
 
+	/*
+	 * For a zoned block device, if we only have writes queued and none of
+	 * them can be dispatched, rq will be NULL.
+	 */
+	if (!rq)
+		return NULL;
+
 	dd->batching = 0;
 
 dispatch_request:
@@ -284,17 +363,27 @@ dispatch_request:
 	dd->batching++;
 	deadline_move_request(dd, rq);
 done:
+	/*
+	 * If the request needs its target zone locked, do it.
+	 */
+	blk_req_zone_write_lock(rq);
 	rq->rq_flags |= RQF_STARTED;
 	return rq;
 }
 
+/*
+ * One confusing aspect here is that we get called for a specific
+ * hardware queue, but we may return a request that is for a
+ * different hardware queue. This is because mq-deadline has shared
+ * state for all hardware queues, in terms of sorting, FIFOs, etc.
+ */
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
 	struct request *rq;
 
 	spin_lock(&dd->lock);
-	rq = __dd_dispatch_request(hctx);
+	rq = __dd_dispatch_request(dd);
 	spin_unlock(&dd->lock);
 
 	return rq;
@@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
 	dd->front_merges = 1;
 	dd->fifo_batch = fifo_batch;
 	spin_lock_init(&dd->lock);
+	spin_lock_init(&dd->zone_lock);
 	INIT_LIST_HEAD(&dd->dispatch);
 
 	q->elevator = eq;
@@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	struct deadline_data *dd = q->elevator->elevator_data;
 	const int data_dir = rq_data_dir(rq);
 
+	/*
+	 * This may be a requeue of a write request that has locked its
+	 * target zone. If that is the case, this releases the zone lock.
+	 */
+	blk_req_zone_write_unlock(rq);
+
 	if (blk_mq_sched_try_insert_merge(q, rq))
 		return;
 
@@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 	spin_unlock(&dd->lock);
 }
 
+/*
+ * For zoned block devices, write unlock the target zone of
+ * completed write requests. Do this while holding the zone lock
+ * spinlock so that the zone is never unlocked while deadline_fifo_request()
+ * or deadline_next_request() are executing.
+ */
+static void dd_completed_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+
+	if (blk_queue_is_zoned(q)) {
+		struct deadline_data *dd = q->elevator->elevator_data;
+		unsigned long flags;
+
+		spin_lock_irqsave(&dd->zone_lock, flags);
+		blk_req_zone_write_unlock(rq);
+		spin_unlock_irqrestore(&dd->zone_lock, flags);
+	}
+}
+
 static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
@@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = {
 	.ops.mq = {
 		.insert_requests	= dd_insert_requests,
 		.dispatch_request	= dd_dispatch_request,
+		.completed_request	= dd_completed_request,
 		.next_request		= elv_rb_latter_request,
 		.former_request		= elv_rb_former_request,
 		.bio_merge		= dd_bio_merge,
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 0af3a3db6fb0..82c44f7df911 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state,
 			continue;
 		bsd_start = le32_to_cpu(p->p_offset);
 		bsd_size = le32_to_cpu(p->p_size);
-		if (memcmp(flavour, "bsd\0", 4) == 0)
+		/* FreeBSD has relative offset if C partition offset is zero */
+		if (memcmp(flavour, "bsd\0", 4) == 0 &&
+		    le32_to_cpu(l->d_partitions[2].p_offset) == 0)
 			bsd_start += offset;
 		if (offset == bsd_start && size == bsd_size)
 			/* full parent partition, we have it already */
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index edcfff974527..60b471f8621b 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -384,9 +384,10 @@ out_put_request:
 
 /**
  * sg_scsi_ioctl  --  handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
- * @file:	file this ioctl operates on (optional)
  * @q:		request queue to send scsi commands down
  * @disk:	gendisk to operate on (option)
+ * @mode:	mode used to open the file through which the ioctl has been
+ *		submitted
  * @sic:	userspace structure describing the command to perform
  *
  * Send down the scsi command described by @sic to the device below
@@ -415,10 +416,10 @@ out_put_request:
  *      Positive numbers returned are the compacted SCSI error codes (4
  *      bytes in one int) where the lowest byte is the SCSI status.
  */
-#define OMAX_SB_LEN 16          /* For backward compatibility */
 int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		struct scsi_ioctl_command __user *sic)
 {
+	enum { OMAX_SB_LEN = 16 };	/* For backward compatibility */
 	struct request *rq;
 	struct scsi_request *req;
 	int err;
@@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
 	if (bd && bd == bd->bd_contains)
 		return 0;
 
-	/* Actually none of these is particularly useful on a partition,
-	 * but they are safe.
-	 */
-	switch (cmd) {
-	case SCSI_IOCTL_GET_IDLUN:
-	case SCSI_IOCTL_GET_BUS_NUMBER:
-	case SCSI_IOCTL_GET_PCI:
-	case SCSI_IOCTL_PROBE_HOST:
-	case SG_GET_VERSION_NUM:
-	case SG_SET_TIMEOUT:
-	case SG_GET_TIMEOUT:
-	case SG_GET_RESERVED_SIZE:
-	case SG_SET_RESERVED_SIZE:
-	case SG_EMULATED_HOST:
-		return 0;
-	case CDROM_GET_CAPABILITY:
-		/* Keep this until we remove the printk below.  udev sends it
-		 * and we do not want to spam dmesg about it.   CD-ROMs do
-		 * not have partitions, so we get here only for disks.
-		 */
-		return -ENOIOCTLCMD;
-	default:
-		break;
-	}
-
 	if (capable(CAP_SYS_RAWIO))
 		return 0;
 
-	/* In particular, rule out all resets and host-specific ioctls.  */
-	printk_ratelimited(KERN_WARNING
-			   "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
-
 	return -ENOIOCTLCMD;
 }
 EXPORT_SYMBOL(scsi_verify_blk_ioctl);