40 files changed, 1250 insertions, 970 deletions
diff --git a/block/badblocks.c b/block/badblocks.c
index 6ebcef282314..43c71166e1e2 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -533,6 +533,7 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
 	case 3:
 		if (newline != '\n')
 			return -EINVAL;
+		/* fall through */
 	case 2:
 		if (length <= 0)
 			return -EINVAL;
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index c8a32fb345cf..78b2e0db4fb2 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -52,7 +52,7 @@ BFQG_FLAG_FNS(idling)
 BFQG_FLAG_FNS(empty)
 #undef BFQG_FLAG_FNS
 
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
 static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 {
 	unsigned long long now;
@@ -67,7 +67,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
 	bfqg_stats_clear_waiting(stats);
 }
 
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
 static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
 						 struct bfq_group *curr_bfqg)
 {
@@ -81,7 +81,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
 	bfqg_stats_mark_waiting(stats);
 }
 
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
 static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
 {
 	unsigned long long now;
@@ -203,12 +203,30 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
 
 static void bfqg_get(struct bfq_group *bfqg)
 {
-	return blkg_get(bfqg_to_blkg(bfqg));
+	bfqg->ref++;
 }
 
 void bfqg_put(struct bfq_group *bfqg)
 {
-	return blkg_put(bfqg_to_blkg(bfqg));
+	bfqg->ref--;
+
+	if (bfqg->ref == 0)
+		kfree(bfqg);
+}
+
+static void bfqg_and_blkg_get(struct bfq_group *bfqg)
+{
+	/* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
+	bfqg_get(bfqg);
+
+	blkg_get(bfqg_to_blkg(bfqg));
+}
+
+void bfqg_and_blkg_put(struct bfq_group *bfqg)
+{
+	bfqg_put(bfqg);
+
+	blkg_put(bfqg_to_blkg(bfqg));
 }
 
 void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
@@ -312,7 +330,11 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
 	if (bfqq) {
 		bfqq->ioprio = bfqq->new_ioprio;
 		bfqq->ioprio_class = bfqq->new_ioprio_class;
-		bfqg_get(bfqg);
+		/*
+		 * Make sure that bfqg and its associated blkg do not
+		 * disappear before entity.
+		 */
+		bfqg_and_blkg_get(bfqg);
 	}
 	entity->parent = bfqg->my_entity; /* NULL for root group */
 	entity->sched_data = &bfqg->sched_data;
@@ -399,6 +421,8 @@ struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
 		return NULL;
 	}
 
+	/* see comments in bfq_bic_update_cgroup for why refcounting */
+	bfqg_get(bfqg);
 	return &bfqg->pd;
 }
 
@@ -426,7 +450,7 @@ void bfq_pd_free(struct blkg_policy_data *pd)
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
 
 	bfqg_stats_exit(&bfqg->stats);
-	return kfree(bfqg);
+	bfqg_put(bfqg);
 }
 
 void bfq_pd_reset_stats(struct blkg_policy_data *pd)
@@ -496,9 +520,10 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
  * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
  * it on the new one.  Avoid putting the entity on the old group idle tree.
  *
- * Must be called under the queue lock; the cgroup owning @bfqg must
- * not disappear (by now this just means that we are called under
- * rcu_read_lock()).
+ * Must be called under the scheduler lock, to make sure that the blkg
+ * owning @bfqg does not disappear (see comments in
+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg
+ * objects).
  */
 void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		   struct bfq_group *bfqg)
@@ -519,16 +544,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	else if (entity->on_st)
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
-	bfqg_put(bfqq_group(bfqq));
+	bfqg_and_blkg_put(bfqq_group(bfqq));
 
-	/*
-	 * Here we use a reference to bfqg.  We don't need a refcounter
-	 * as the cgroup reference will not be dropped, so that its
-	 * destroy() callback will not be invoked.
-	 */
 	entity->parent = bfqg->my_entity;
 	entity->sched_data = &bfqg->sched_data;
-	bfqg_get(bfqg);
+	/* pin down bfqg and its associated blkg  */
+	bfqg_and_blkg_get(bfqg);
 
 	if (bfq_bfqq_busy(bfqq)) {
 		bfq_pos_tree_add_move(bfqd, bfqq);
@@ -545,8 +566,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  * @bic: the bic to move.
  * @blkcg: the blk-cgroup to move to.
  *
- * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
- * has to make sure that the reference to cgroup is valid across the call.
+ * Move bic to blkcg, assuming that bfqd->lock is held; which makes
+ * sure that the reference to cgroup is valid across the call (see
+ * comments in bfq_bic_update_cgroup on this issue)
  *
  * NOTE: an alternative approach might have been to store the current
  * cgroup in bfqq and getting a reference to it, reducing the lookup
@@ -604,6 +626,57 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 		goto out;
 
 	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	/*
+	 * Update blkg_path for bfq_log_* functions. We cache this
+	 * path, and update it here, for the following
+	 * reasons. Operations on blkg objects in blk-cgroup are
+	 * protected with the request_queue lock, and not with the
+	 * lock that protects the instances of this scheduler
+	 * (bfqd->lock). This exposes BFQ to the following sort of
+	 * race.
+	 *
+	 * The blkg_lookup performed in bfq_get_queue, protected
+	 * through rcu, may happen to return the address of a copy of
+	 * the original blkg. If this is the case, then the
+	 * bfqg_and_blkg_get performed in bfq_get_queue, to pin down
+	 * the blkg, is useless: it does not prevent blk-cgroup code
+	 * from destroying both the original blkg and all objects
+	 * directly or indirectly referred by the copy of the
+	 * blkg.
+	 *
+	 * On the bright side, destroy operations on a blkg invoke, as
+	 * a first step, hooks of the scheduler associated with the
+	 * blkg. And these hooks are executed with bfqd->lock held for
+	 * BFQ. As a consequence, for any blkg associated with the
+	 * request queue this instance of the scheduler is attached
+	 * to, we are guaranteed that such a blkg is not destroyed, and
+	 * that all the pointers it contains are consistent, while we
+	 * are holding bfqd->lock. A blkg_lookup performed with
+	 * bfqd->lock held then returns a fully consistent blkg, which
+	 * remains consistent until this lock is held.
+	 *
+	 * Thanks to the last fact, and to the fact that: (1) bfqg has
+	 * been obtained through a blkg_lookup in the above
+	 * assignment, and (2) bfqd->lock is being held, here we can
+	 * safely use the policy data for the involved blkg (i.e., the
+	 * field bfqg->pd) to get to the blkg associated with bfqg,
+	 * and then we can safely use any field of blkg. After we
+	 * release bfqd->lock, even just getting blkg through this
+	 * bfqg may cause dangling references to be traversed, as
+	 * bfqg->pd may not exist any more.
+	 *
+	 * In view of the above facts, here we cache, in the bfqg, any
+	 * blkg data we may need for this bic, and for its associated
+	 * bfq_queue. As of now, we need to cache only the path of the
+	 * blkg, which is used in the bfq_log_* functions.
+	 *
+	 * Finally, note that bfqg itself needs to be protected from
+	 * destruction on the blkg_free of the original blkg (which
+	 * invokes bfq_pd_free). We use an additional private
+	 * refcounter for bfqg, to let it disappear only after no
+	 * bfq_queue refers to it any longer.
+	 */
+	blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path));
 	bic->blkcg_serial_nr = serial_nr;
 out:
 	rcu_read_unlock();
@@ -640,8 +713,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
  * @bfqd: the device data structure with the root group.
  * @bfqg: the group to move from.
  * @st: the service tree with the entities.
- *
- * Needs queue_lock to be taken and reference to be valid over the call.
  */
 static void bfq_reparent_active_entities(struct bfq_data *bfqd,
 					 struct bfq_group *bfqg,
@@ -692,8 +763,7 @@ void bfq_pd_offline(struct blkg_policy_data *pd)
 		/*
 		 * The idle tree may still contain bfq_queues belonging
 		 * to exited task because they never migrated to a different
-		 * cgroup from the one being destroyed now.  No one else
-		 * can access them so it's safe to act without any lock.
+		 * cgroup from the one being destroyed now.
 		 */
 		bfq_flush_idle_tree(st);
 
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 08ce45096350..12bbc6b8657d 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -725,8 +725,12 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 }
 
 static void
-bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
+		      struct bfq_io_cq *bic, bool bfq_already_existing)
 {
+	unsigned int old_wr_coeff = bfqq->wr_coeff;
+	bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
+
 	if (bic->saved_idle_window)
 		bfq_mark_bfqq_idle_window(bfqq);
 	else
@@ -754,6 +758,14 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 
 	/* make sure weight will be updated, however we got here */
 	bfqq->entity.prio_changed = 1;
+
+	if (likely(!busy))
+		return;
+
+	if (old_wr_coeff == 1 && bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues++;
+	else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1)
+		bfqd->wr_busy_queues--;
 }
 
 static int bfqq_process_refs(struct bfq_queue *bfqq)
@@ -3665,7 +3677,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
 
 	kmem_cache_free(bfq_pool, bfqq);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
-	bfqg_put(bfqg);
+	bfqg_and_blkg_put(bfqg);
 #endif
 }
 
@@ -4290,10 +4302,16 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
 	bfq_put_queue(bfqq);
 }
 
-static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
+static void bfq_finish_request(struct request *rq)
 {
-	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-	struct bfq_data *bfqd = bfqq->bfqd;
+	struct bfq_queue *bfqq;
+	struct bfq_data *bfqd;
+
+	if (!rq->elv.icq)
+		return;
+
+	bfqq = RQ_BFQQ(rq);
+	bfqd = bfqq->bfqd;
 
 	if (rq->rq_flags & RQF_STARTED)
 		bfqg_stats_update_completion(bfqq_group(bfqq),
@@ -4324,7 +4342,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
 		 */
 
 		if (!RB_EMPTY_NODE(&rq->rb_node))
-			bfq_remove_request(q, rq);
+			bfq_remove_request(rq->q, rq);
 		bfq_put_rq_priv_body(bfqq);
 	}
 
@@ -4394,20 +4412,21 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
 /*
  * Allocate bfq data structures associated with this request.
  */
-static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
-			      struct bio *bio)
+static void bfq_prepare_request(struct request *rq, struct bio *bio)
 {
+	struct request_queue *q = rq->q;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
-	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
+	struct bfq_io_cq *bic;
 	const int is_sync = rq_is_sync(rq);
 	struct bfq_queue *bfqq;
 	bool new_queue = false;
-	bool split = false;
+	bool bfqq_already_existing = false, split = false;
 
-	spin_lock_irq(&bfqd->lock);
+	if (!rq->elv.icq)
+		return;
+	bic = icq_to_bic(rq->elv.icq);
 
-	if (!bic)
-		goto queue_fail;
+	spin_lock_irq(&bfqd->lock);
 
 	bfq_check_ioprio_change(bic, bio);
 
@@ -4432,6 +4451,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 				bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
 								 true, is_sync,
 								 NULL);
+			else
+				bfqq_already_existing = true;
 		}
 	}
 
@@ -4457,7 +4478,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 			 * queue: restore the idle window and the
 			 * possible weight raising period.
 			 */
-			bfq_bfqq_resume_state(bfqq, bic);
+			bfq_bfqq_resume_state(bfqq, bfqd, bic,
+					      bfqq_already_existing);
 		}
 	}
 
@@ -4465,13 +4487,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 		bfq_handle_burst(bfqd, bfqq);
 
 	spin_unlock_irq(&bfqd->lock);
-
-	return 0;
-
-queue_fail:
-	spin_unlock_irq(&bfqd->lock);
-
-	return 1;
 }
 
 static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
@@ -4950,8 +4965,8 @@ static struct elv_fs_entry bfq_attrs[] = {
 
 static struct elevator_type iosched_bfq_mq = {
 	.ops.mq = {
-		.get_rq_priv		= bfq_get_rq_private,
-		.put_rq_priv		= bfq_put_rq_private,
+		.prepare_request	= bfq_prepare_request,
+		.finish_request		= bfq_finish_request,
 		.exit_icq		= bfq_exit_icq,
 		.insert_requests	= bfq_insert_requests,
 		.dispatch_request	= bfq_dispatch_request,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index ae783c06dfd9..5c3bf9861492 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -759,6 +759,12 @@ struct bfq_group {
 	/* must be the first member */
 	struct blkg_policy_data pd;
 
+	/* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
+	char blkg_path[128];
+
+	/* reference counter (see comments in bfq_bic_update_cgroup) */
+	int ref;
+
 	struct bfq_entity entity;
 	struct bfq_sched_data sched_data;
 
@@ -838,7 +844,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
 struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
 struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
 struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
-void bfqg_put(struct bfq_group *bfqg);
+void bfqg_and_blkg_put(struct bfq_group *bfqg);
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 extern struct cftype bfq_blkcg_legacy_files[];
@@ -910,20 +916,13 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
 
 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
-	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\
 			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
-			  __pbuf, ##args);				\
+			bfqq_group(bfqq)->blkg_path, ##args);		\
 } while (0)
 
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
-	char __pbuf[128];						\
-									\
-	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
-	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
-} while (0)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	\
+	blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args)
 
 #else /* CONFIG_BFQ_GROUP_IOSCHED */
 
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5384713d48bc..b8a3a65f7364 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -175,6 +175,9 @@ bool bio_integrity_enabled(struct bio *bio)
 	if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
 		return false;
 
+	if (!bio_sectors(bio))
+		return false;
+
 	/* Already protected? */
 	if (bio_integrity(bio))
 		return false;
@@ -221,7 +224,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
  * @bio:	bio to generate/verify integrity metadata for
  * @proc_fn:	Pointer to the relevant processing function
  */
-static int bio_integrity_process(struct bio *bio,
+static blk_status_t bio_integrity_process(struct bio *bio,
 				 integrity_processing_fn *proc_fn)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
@@ -229,7 +232,7 @@ static int bio_integrity_process(struct bio *bio,
 	struct bvec_iter bviter;
 	struct bio_vec bv;
 	struct bio_integrity_payload *bip = bio_integrity(bio);
-	unsigned int ret = 0;
+	blk_status_t ret = BLK_STS_OK;
 	void *prot_buf = page_address(bip->bip_vec->bv_page) +
 		bip->bip_vec->bv_offset;
 
@@ -366,7 +369,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 	struct bio *bio = bip->bip_bio;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 
-	bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn);
+	bio->bi_status = bio_integrity_process(bio, bi->profile->verify_fn);
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
@@ -395,7 +398,7 @@ void bio_integrity_endio(struct bio *bio)
 	 * integrity metadata.  Restore original bio end_io handler
 	 * and run it.
 	 */
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		bio->bi_end_io = bip->bip_end_io;
 		bio_endio(bio);
 
diff --git a/block/bio.c b/block/bio.c
index 888e7801c638..1cfcd0df3f30 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -240,20 +240,21 @@ fallback:
 	return bvl;
 }
 
-static void __bio_free(struct bio *bio)
+void bio_uninit(struct bio *bio)
 {
 	bio_disassociate_task(bio);
 
 	if (bio_integrity(bio))
 		bio_integrity_free(bio);
 }
+EXPORT_SYMBOL(bio_uninit);
 
 static void bio_free(struct bio *bio)
 {
 	struct bio_set *bs = bio->bi_pool;
 	void *p;
 
-	__bio_free(bio);
+	bio_uninit(bio);
 
 	if (bs) {
 		bvec_free(bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
@@ -271,6 +272,11 @@ static void bio_free(struct bio *bio)
 	}
 }
 
+/*
+ * Users of this function have their own bio allocation. Subsequently,
+ * they must remember to pair any call to bio_init() with bio_uninit()
+ * when IO has completed, or when the bio is released.
+ */
 void bio_init(struct bio *bio, struct bio_vec *table,
 	      unsigned short max_vecs)
 {
@@ -297,7 +303,7 @@ void bio_reset(struct bio *bio)
 {
 	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
 
-	__bio_free(bio);
+	bio_uninit(bio);
 
 	memset(bio, 0, BIO_RESET_BYTES);
 	bio->bi_flags = flags;
@@ -309,8 +315,8 @@ static struct bio *__bio_chain_endio(struct bio *bio)
 {
 	struct bio *parent = bio->bi_private;
 
-	if (!parent->bi_error)
-		parent->bi_error = bio->bi_error;
+	if (!parent->bi_status)
+		parent->bi_status = bio->bi_status;
 	bio_put(bio);
 	return parent;
 }
@@ -363,6 +369,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
 	struct bio_list punt, nopunt;
 	struct bio *bio;
 
+	if (WARN_ON_ONCE(!bs->rescue_workqueue))
+		return;
 	/*
 	 * In order to guarantee forward progress we must punt only bios that
 	 * were allocated from this bio_set; otherwise, if there was a bio on
@@ -474,7 +482,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
 
 		if (current->bio_list &&
 		    (!bio_list_empty(&current->bio_list[0]) ||
-		     !bio_list_empty(&current->bio_list[1])))
+		     !bio_list_empty(&current->bio_list[1])) &&
+		    bs->rescue_workqueue)
 			gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
@@ -544,7 +553,7 @@ EXPORT_SYMBOL(zero_fill_bio);
  *
  * Description:
  *   Put a reference to a &struct bio, either one you have gotten with
- *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
+ *   bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
  **/
 void bio_put(struct bio *bio)
 {
@@ -593,6 +602,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_bdev = bio_src->bi_bdev;
 	bio_set_flag(bio, BIO_CLONED);
 	bio->bi_opf = bio_src->bi_opf;
+	bio->bi_write_hint = bio_src->bi_write_hint;
 	bio->bi_iter = bio_src->bi_iter;
 	bio->bi_io_vec = bio_src->bi_io_vec;
 
@@ -676,6 +686,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 		return NULL;
 	bio->bi_bdev		= bio_src->bi_bdev;
 	bio->bi_opf		= bio_src->bi_opf;
+	bio->bi_write_hint	= bio_src->bi_write_hint;
 	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
 	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
 
@@ -918,7 +929,7 @@ static void submit_bio_wait_endio(struct bio *bio)
 {
 	struct submit_bio_ret *ret = bio->bi_private;
 
-	ret->error = bio->bi_error;
+	ret->error = blk_status_to_errno(bio->bi_status);
 	complete(&ret->event);
 }
 
@@ -1817,8 +1828,8 @@ again:
 	}
 
 	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
-		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
-					 bio, bio->bi_error);
+		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio,
+					 blk_status_to_errno(bio->bi_status));
 		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 	}
 
@@ -1921,9 +1932,29 @@ void bioset_free(struct bio_set *bs)
 }
 EXPORT_SYMBOL(bioset_free);
 
-static struct bio_set *__bioset_create(unsigned int pool_size,
-				       unsigned int front_pad,
-				       bool create_bvec_pool)
+/**
+ * bioset_create  - Create a bio_set
+ * @pool_size:	Number of bio and bio_vecs to cache in the mempool
+ * @front_pad:	Number of bytes to allocate in front of the returned bio
+ * @flags:	Flags to modify behavior, currently %BIOSET_NEED_BVECS
+ *              and %BIOSET_NEED_RESCUER
+ *
+ * Description:
+ *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
+ *    to ask for a number of bytes to be allocated in front of the bio.
+ *    Front pad allocation is useful for embedding the bio inside
+ *    another structure, to avoid allocating extra data to go with the bio.
+ *    Note that the bio must be embedded at the END of that structure always,
+ *    or things will break badly.
+ *    If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
+ *    for allocating iovecs.  This pool is not needed e.g. for bio_clone_fast().
+ *    If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
+ *    dispatch queued requests when the mempool runs out of space.
+ *
+ */
+struct bio_set *bioset_create(unsigned int pool_size,
+			      unsigned int front_pad,
+			      int flags)
 {
 	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
 	struct bio_set *bs;
@@ -1948,12 +1979,15 @@ static struct bio_set *__bioset_create(unsigned int pool_size,
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (create_bvec_pool) {
+	if (flags & BIOSET_NEED_BVECS) {
 		bs->bvec_pool = biovec_create_pool(pool_size);
 		if (!bs->bvec_pool)
 			goto bad;
 	}
 
+	if (!(flags & BIOSET_NEED_RESCUER))
+		return bs;
+
 	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
 	if (!bs->rescue_workqueue)
 		goto bad;
@@ -1963,41 +1997,8 @@ bad:
 	bioset_free(bs);
 	return NULL;
 }
-
-/**
- * bioset_create  - Create a bio_set
- * @pool_size:	Number of bio and bio_vecs to cache in the mempool
- * @front_pad:	Number of bytes to allocate in front of the returned bio
- *
- * Description:
- *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
- *    to ask for a number of bytes to be allocated in front of the bio.
- *    Front pad allocation is useful for embedding the bio inside
- *    another structure, to avoid allocating extra data to go with the bio.
- *    Note that the bio must be embedded at the END of that structure always,
- *    or things will break badly.
- */
-struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
-{
-	return __bioset_create(pool_size, front_pad, true);
-}
 EXPORT_SYMBOL(bioset_create);
 
-/**
- * bioset_create_nobvec  - Create a bio_set without bio_vec mempool
- * @pool_size:	Number of bio to cache in the mempool
- * @front_pad:	Number of bytes to allocate in front of the returned bio
- *
- * Description:
- *    Same functionality as bioset_create() except that mempool is not
- *    created for bio_vecs. Saving some memory for bio_clone_fast() users.
- */
-struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_pad)
-{
-	return __bioset_create(pool_size, front_pad, false);
-}
-EXPORT_SYMBOL(bioset_create_nobvec);
-
 #ifdef CONFIG_BLK_CGROUP
 
 /**
@@ -2112,7 +2113,7 @@ static int __init init_bio(void)
 	bio_integrity_init();
 	biovec_init_slabs();
 
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7c2947128f58..0480892e97e5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -74,7 +74,7 @@ static void blkg_free(struct blkcg_gq *blkg)
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
 	if (blkg->blkcg != &blkcg_root)
-		blk_exit_rl(&blkg->rl);
+		blk_exit_rl(blkg->q, &blkg->rl);
 
 	blkg_rwstat_exit(&blkg->stat_ios);
 	blkg_rwstat_exit(&blkg->stat_bytes);
diff --git a/block/blk-core.c b/block/blk-core.c
index c7068520794b..af393d5a9680 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -129,11 +129,70 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL(blk_rq_init);
 
+static const struct {
+	int		errno;
+	const char	*name;
+} blk_errors[] = {
+	[BLK_STS_OK]		= { 0,		"" },
+	[BLK_STS_NOTSUPP]	= { -EOPNOTSUPP, "operation not supported" },
+	[BLK_STS_TIMEOUT]	= { -ETIMEDOUT,	"timeout" },
+	[BLK_STS_NOSPC]		= { -ENOSPC,	"critical space allocation" },
+	[BLK_STS_TRANSPORT]	= { -ENOLINK,	"recoverable transport" },
+	[BLK_STS_TARGET]	= { -EREMOTEIO,	"critical target" },
+	[BLK_STS_NEXUS]		= { -EBADE,	"critical nexus" },
+	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
+	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
+	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
+	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },
+
+	/* device mapper special case, should not leak out: */
+	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },
+
+	/* everything else not covered above: */
+	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
+};
+
+blk_status_t errno_to_blk_status(int errno)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
+		if (blk_errors[i].errno == errno)
+			return (__force blk_status_t)i;
+	}
+
+	return BLK_STS_IOERR;
+}
+EXPORT_SYMBOL_GPL(errno_to_blk_status);
+
+int blk_status_to_errno(blk_status_t status)
+{
+	int idx = (__force int)status;
+
+	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
+		return -EIO;
+	return blk_errors[idx].errno;
+}
+EXPORT_SYMBOL_GPL(blk_status_to_errno);
+
+static void print_req_error(struct request *req, blk_status_t status)
+{
+	int idx = (__force int)status;
+
+	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
+		return;
+
+	printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
+			   __func__, blk_errors[idx].name, req->rq_disk ?
+			   req->rq_disk->disk_name : "?",
+			   (unsigned long long)blk_rq_pos(req));
+}
+
 static void req_bio_endio(struct request *rq, struct bio *bio,
-			  unsigned int nbytes, int error)
+			  unsigned int nbytes, blk_status_t error)
 {
 	if (error)
-		bio->bi_error = error;
+		bio->bi_status = error;
 
 	if (unlikely(rq->rq_flags & RQF_QUIET))
 		bio_set_flag(bio, BIO_QUIET);
@@ -177,10 +236,13 @@ static void blk_delay_work(struct work_struct *work)
  * Description:
  *   Sometimes queueing needs to be postponed for a little while, to allow
  *   resources to come back. This function will make sure that queueing is
- *   restarted around the specified time. Queue lock must be held.
+ *   restarted around the specified time.
  */
 void blk_delay_queue(struct request_queue *q, unsigned long msecs)
 {
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	if (likely(!blk_queue_dead(q)))
 		queue_delayed_work(kblockd_workqueue, &q->delay_work,
 				   msecs_to_jiffies(msecs));
@@ -198,6 +260,9 @@ EXPORT_SYMBOL(blk_delay_queue);
  **/
 void blk_start_queue_async(struct request_queue *q)
 {
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
 	blk_run_queue_async(q);
 }
@@ -210,11 +275,13 @@ EXPORT_SYMBOL(blk_start_queue_async);
  * Description:
  *   blk_start_queue() will clear the stop flag on the queue, and call
  *   the request_fn for the queue if it was in a stopped state when
- *   entered. Also see blk_stop_queue(). Queue lock must be held.
+ *   entered. Also see blk_stop_queue().
  **/
 void blk_start_queue(struct request_queue *q)
 {
+	lockdep_assert_held(q->queue_lock);
 	WARN_ON(!irqs_disabled());
+	WARN_ON_ONCE(q->mq_ops);
 
 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
 	__blk_run_queue(q);
@@ -233,10 +300,13 @@ EXPORT_SYMBOL(blk_start_queue);
  *   or if it simply chooses not to queue more I/O at one point, it can
  *   call this function to prevent the request_fn from being called until
  *   the driver has signalled it's ready to go again. This happens by calling
- *   blk_start_queue() to restart queue operations. Queue lock must be held.
+ *   blk_start_queue() to restart queue operations.
  **/
 void blk_stop_queue(struct request_queue *q)
 {
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	cancel_delayed_work(&q->delay_work);
 	queue_flag_set(QUEUE_FLAG_STOPPED, q);
 }
@@ -289,6 +359,9 @@ EXPORT_SYMBOL(blk_sync_queue);
  */
 inline void __blk_run_queue_uncond(struct request_queue *q)
 {
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	if (unlikely(blk_queue_dead(q)))
 		return;
 
@@ -310,11 +383,13 @@ EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
  * @q:	The queue to run
  *
  * Description:
- *    See @blk_run_queue. This variant must be called with the queue lock
- *    held and interrupts disabled.
+ *    See @blk_run_queue.
  */
 void __blk_run_queue(struct request_queue *q)
 {
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	if (unlikely(blk_queue_stopped(q)))
 		return;
 
@@ -328,10 +403,18 @@ EXPORT_SYMBOL(__blk_run_queue);
  *
  * Description:
  *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
- *    of us. The caller must hold the queue lock.
+ *    of us.
+ *
+ * Note:
+ *    Since it is not allowed to run q->delay_work after blk_cleanup_queue()
+ *    has canceled q->delay_work, callers must hold the queue lock to avoid
+ *    race conditions between blk_cleanup_queue() and blk_run_queue_async().
  */
 void blk_run_queue_async(struct request_queue *q)
 {
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
 		mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
 }
@@ -349,6 +432,8 @@ void blk_run_queue(struct request_queue *q)
 {
 	unsigned long flags;
 
+	WARN_ON_ONCE(q->mq_ops);
+
 	spin_lock_irqsave(q->queue_lock, flags);
 	__blk_run_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
@@ -377,6 +462,7 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 	int i;
 
 	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
 
 	while (true) {
 		bool drain = false;
@@ -455,6 +541,8 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
  */
 void blk_queue_bypass_start(struct request_queue *q)
 {
+	WARN_ON_ONCE(q->mq_ops);
+
 	spin_lock_irq(q->queue_lock);
 	q->bypass_depth++;
 	queue_flag_set(QUEUE_FLAG_BYPASS, q);
@@ -481,6 +569,9 @@ EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
  * @q: queue of interest
  *
  * Leave bypass mode and restore the normal queueing behavior.
+ *
+ * Note: although blk_queue_bypass_start() is only called for blk-sq queues,
+ * this function is called for both blk-sq and blk-mq queues.
  */
 void blk_queue_bypass_end(struct request_queue *q)
 {
@@ -648,13 +739,19 @@ int blk_init_rl(struct request_list *rl, struct request_queue *q,
 	if (!rl->rq_pool)
 		return -ENOMEM;
 
+	if (rl != &q->root_rl)
+		WARN_ON_ONCE(!blk_get_queue(q));
+
 	return 0;
 }
 
-void blk_exit_rl(struct request_list *rl)
+void blk_exit_rl(struct request_queue *q, struct request_list *rl)
 {
-	if (rl->rq_pool)
+	if (rl->rq_pool) {
 		mempool_destroy(rl->rq_pool);
+		if (rl != &q->root_rl)
+			blk_put_queue(q);
+	}
 }
 
 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
@@ -726,7 +823,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	if (q->id < 0)
 		goto fail_q;
 
-	q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	q->bio_split = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
 	if (!q->bio_split)
 		goto fail_id;
 
@@ -872,6 +969,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
 
 int blk_init_allocated_queue(struct request_queue *q)
 {
+	WARN_ON_ONCE(q->mq_ops);
+
 	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
 	if (!q->fq)
 		return -ENOMEM;
@@ -1009,6 +1108,8 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
 	struct request_list *rl;
 	int on_thresh, off_thresh;
 
+	WARN_ON_ONCE(q->mq_ops);
+
 	spin_lock_irq(q->queue_lock);
 	q->nr_requests = nr;
 	blk_queue_congestion_threshold(q);
@@ -1071,6 +1172,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 	int may_queue;
 	req_flags_t rq_flags = RQF_ALLOCED;
 
+	lockdep_assert_held(q->queue_lock);
+
 	if (unlikely(blk_queue_dying(q)))
 		return ERR_PTR(-ENODEV);
 
@@ -1244,12 +1347,20 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
 	struct request_list *rl;
 	struct request *rq;
 
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
 	rq = __get_request(rl, op, bio, gfp_mask);
 	if (!IS_ERR(rq))
 		return rq;
 
+	if (op & REQ_NOWAIT) {
+		blk_put_rl(rl);
+		return ERR_PTR(-EAGAIN);
+	}
+
 	if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
 		blk_put_rl(rl);
 		return rq;
@@ -1277,16 +1388,18 @@ retry:
 	goto retry;
 }
 
-static struct request *blk_old_get_request(struct request_queue *q, int rw,
-		gfp_t gfp_mask)
+static struct request *blk_old_get_request(struct request_queue *q,
+					   unsigned int op, gfp_t gfp_mask)
 {
 	struct request *rq;
 
+	WARN_ON_ONCE(q->mq_ops);
+
 	/* create ioc upfront */
 	create_io_context(gfp_mask, q->node);
 
 	spin_lock_irq(q->queue_lock);
-	rq = get_request(q, rw, NULL, gfp_mask);
+	rq = get_request(q, op, NULL, gfp_mask);
 	if (IS_ERR(rq)) {
 		spin_unlock_irq(q->queue_lock);
 		return rq;
@@ -1299,14 +1412,24 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
 	return rq;
 }
 
-struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+				gfp_t gfp_mask)
 {
-	if (q->mq_ops)
-		return blk_mq_alloc_request(q, rw,
+	struct request *req;
+
+	if (q->mq_ops) {
+		req = blk_mq_alloc_request(q, op,
 			(gfp_mask & __GFP_DIRECT_RECLAIM) ?
 				0 : BLK_MQ_REQ_NOWAIT);
-	else
-		return blk_old_get_request(q, rw, gfp_mask);
+		if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
+			q->mq_ops->initialize_rq_fn(req);
+	} else {
+		req = blk_old_get_request(q, op, gfp_mask);
+		if (!IS_ERR(req) && q->initialize_rq_fn)
+			q->initialize_rq_fn(req);
+	}
+
+	return req;
 }
 EXPORT_SYMBOL(blk_get_request);
 
@@ -1322,6 +1445,9 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
@@ -1396,9 +1522,6 @@ static void blk_pm_put_request(struct request *rq)
 static inline void blk_pm_put_request(struct request *rq) {}
 #endif
 
-/*
- * queue lock must be held
- */
 void __blk_put_request(struct request_queue *q, struct request *req)
 {
 	req_flags_t rq_flags = req->rq_flags;
@@ -1411,6 +1534,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 		return;
 	}
 
+	lockdep_assert_held(q->queue_lock);
+
 	blk_pm_put_request(req);
 
 	elv_completed_request(q, req);
@@ -1640,6 +1765,7 @@ void blk_init_request_from_bio(struct request *req, struct bio *bio)
 		req->ioprio = ioc->ioprio;
 	else
 		req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+	req->write_hint = bio->bi_write_hint;
 	blk_rq_bio_prep(req->q, req, bio);
 }
 EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
@@ -1659,10 +1785,10 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 */
 	blk_queue_bounce(q, &bio);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
 		return BLK_QC_T_NONE;
 	}
@@ -1720,7 +1846,10 @@ get_rq:
 	req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
 	if (IS_ERR(req)) {
 		__wbt_done(q->rq_wb, wb_acct);
-		bio->bi_error = PTR_ERR(req);
+		if (PTR_ERR(req) == -ENOMEM)
+			bio->bi_status = BLK_STS_RESOURCE;
+		else
+			bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
 		goto out_unlock;
 	}
@@ -1875,7 +2004,7 @@ generic_make_request_checks(struct bio *bio)
 {
 	struct request_queue *q;
 	int nr_sectors = bio_sectors(bio);
-	int err = -EIO;
+	blk_status_t status = BLK_STS_IOERR;
 	char b[BDEVNAME_SIZE];
 	struct hd_struct *part;
 
@@ -1894,6 +2023,14 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
+	/*
+	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
+	 * if queue is not a request based queue.
+	 */
+
+	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
+		goto not_supported;
+
 	part = bio->bi_bdev->bd_part;
 	if (should_fail_request(part, bio->bi_iter.bi_size) ||
 	    should_fail_request(&part_to_disk(part)->part0,
@@ -1918,7 +2055,7 @@ generic_make_request_checks(struct bio *bio)
 	    !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
 		bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
 		if (!nr_sectors) {
-			err = 0;
+			status = BLK_STS_OK;
 			goto end_io;
 		}
 	}
@@ -1970,9 +2107,9 @@ generic_make_request_checks(struct bio *bio)
 	return true;
 
 not_supported:
-	err = -EOPNOTSUPP;
+	status = BLK_STS_NOTSUPP;
 end_io:
-	bio->bi_error = err;
+	bio->bi_status = status;
 	bio_endio(bio);
 	return false;
 }
@@ -2051,7 +2188,7 @@ blk_qc_t generic_make_request(struct bio *bio)
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-		if (likely(blk_queue_enter(q, false) == 0)) {
+		if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
 			struct bio_list lower, same;
 
 			/* Create a fresh bio_list for all subordinate requests */
@@ -2076,7 +2213,11 @@ blk_qc_t generic_make_request(struct bio *bio)
 			bio_list_merge(&bio_list_on_stack[0], &same);
 			bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
 		} else {
-			bio_io_error(bio);
+			if (unlikely(!blk_queue_dying(q) &&
+					(bio->bi_opf & REQ_NOWAIT)))
+				bio_wouldblock_error(bio);
+			else
+				bio_io_error(bio);
 		}
 		bio = bio_list_pop(&bio_list_on_stack[0]);
 	} while (bio);
@@ -2177,29 +2318,29 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
  * @q:  the queue to submit the request
  * @rq: the request being queued
  */
-int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
+blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
 	unsigned long flags;
 	int where = ELEVATOR_INSERT_BACK;
 
 	if (blk_cloned_rq_check_limits(q, rq))
-		return -EIO;
+		return BLK_STS_IOERR;
 
 	if (rq->rq_disk &&
 	    should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
-		return -EIO;
+		return BLK_STS_IOERR;
 
 	if (q->mq_ops) {
 		if (blk_queue_io_stat(q))
 			blk_account_io_start(rq, true);
 		blk_mq_sched_insert_request(rq, false, true, false, false);
-		return 0;
+		return BLK_STS_OK;
 	}
 
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (unlikely(blk_queue_dying(q))) {
 		spin_unlock_irqrestore(q->queue_lock, flags);
-		return -ENODEV;
+		return BLK_STS_IOERR;
 	}
 
 	/*
@@ -2216,7 +2357,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 		__blk_run_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
-	return 0;
+	return BLK_STS_OK;
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
@@ -2232,9 +2373,6 @@ EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
  *
  * Return:
  *     The number of bytes to fail.
- *
- * Context:
- *     queue_lock must be held.
  */
 unsigned int blk_rq_err_bytes(const struct request *rq)
 {
@@ -2374,15 +2512,15 @@ void blk_account_io_start(struct request *rq, bool new_io)
  * Return:
  *     Pointer to the request at the top of @q if available.  Null
  *     otherwise.
- *
- * Context:
- *     queue_lock must be held.
  */
 struct request *blk_peek_request(struct request_queue *q)
 {
 	struct request *rq;
 	int ret;
 
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	while ((rq = __elv_next_request(q)) != NULL) {
 
 		rq = blk_pm_peek_request(q, rq);
@@ -2450,15 +2588,14 @@ struct request *blk_peek_request(struct request_queue *q)
 			rq = NULL;
 			break;
 		} else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
-			int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO;
-
 			rq->rq_flags |= RQF_QUIET;
 			/*
 			 * Mark this request as started so we don't trigger
 			 * any debug logic in the end I/O path.
 			 */
 			blk_start_request(rq);
-			__blk_end_request_all(rq, err);
+			__blk_end_request_all(rq, ret == BLKPREP_INVALID ?
+					BLK_STS_TARGET : BLK_STS_IOERR);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
 			break;
@@ -2499,12 +2636,12 @@ void blk_dequeue_request(struct request *rq)
  *
  *     Block internal functions which don't want to start timer should
  *     call blk_dequeue_request().
- *
- * Context:
- *     queue_lock must be held.
  */
 void blk_start_request(struct request *req)
 {
+	lockdep_assert_held(req->q->queue_lock);
+	WARN_ON_ONCE(req->q->mq_ops);
+
 	blk_dequeue_request(req);
 
 	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
@@ -2529,14 +2666,14 @@ EXPORT_SYMBOL(blk_start_request);
  * Return:
  *     Pointer to the request at the top of @q if available.  Null
  *     otherwise.
- *
- * Context:
- *     queue_lock must be held.
  */
 struct request *blk_fetch_request(struct request_queue *q)
 {
 	struct request *rq;
 
+	lockdep_assert_held(q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	rq = blk_peek_request(q);
 	if (rq)
 		blk_start_request(rq);
@@ -2547,7 +2684,7 @@ EXPORT_SYMBOL(blk_fetch_request);
 /**
  * blk_update_request - Special helper function for request stacking drivers
  * @req:      the request being processed
- * @error:    %0 for success, < %0 for error
+ * @error:    block status code
  * @nr_bytes: number of bytes to complete @req
  *
  * Description:
@@ -2566,49 +2703,19 @@ EXPORT_SYMBOL(blk_fetch_request);
  *     %false - this request doesn't have any more data
  *     %true  - this request has more data
  **/
-bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
+bool blk_update_request(struct request *req, blk_status_t error,
+		unsigned int nr_bytes)
 {
 	int total_bytes;
 
-	trace_block_rq_complete(req, error, nr_bytes);
+	trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
 
 	if (!req->bio)
 		return false;
 
-	if (error && !blk_rq_is_passthrough(req) &&
-	    !(req->rq_flags & RQF_QUIET)) {
-		char *error_type;
-
-		switch (error) {
-		case -ENOLINK:
-			error_type = "recoverable transport";
-			break;
-		case -EREMOTEIO:
-			error_type = "critical target";
-			break;
-		case -EBADE:
-			error_type = "critical nexus";
-			break;
-		case -ETIMEDOUT:
-			error_type = "timeout";
-			break;
-		case -ENOSPC:
-			error_type = "critical space allocation";
-			break;
-		case -ENODATA:
-			error_type = "critical medium";
-			break;
-		case -EIO:
-		default:
-			error_type = "I/O";
-			break;
-		}
-		printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
-				   __func__, error_type, req->rq_disk ?
-				   req->rq_disk->disk_name : "?",
-				   (unsigned long long)blk_rq_pos(req));
-
-	}
+	if (unlikely(error && !blk_rq_is_passthrough(req) &&
+		     !(req->rq_flags & RQF_QUIET)))
+		print_req_error(req, error);
 
 	blk_account_io_completion(req, nr_bytes);
 
@@ -2674,7 +2781,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
 
-static bool blk_update_bidi_request(struct request *rq, int error,
+static bool blk_update_bidi_request(struct request *rq, blk_status_t error,
 				    unsigned int nr_bytes,
 				    unsigned int bidi_bytes)
 {
@@ -2712,13 +2819,13 @@ void blk_unprep_request(struct request *req)
 }
 EXPORT_SYMBOL_GPL(blk_unprep_request);
 
-/*
- * queue lock must be held
- */
-void blk_finish_request(struct request *req, int error)
+void blk_finish_request(struct request *req, blk_status_t error)
 {
 	struct request_queue *q = req->q;
 
+	lockdep_assert_held(req->q->queue_lock);
+	WARN_ON_ONCE(q->mq_ops);
+
 	if (req->rq_flags & RQF_STATS)
 		blk_stat_add(req);
 
@@ -2752,7 +2859,7 @@ EXPORT_SYMBOL(blk_finish_request);
 /**
  * blk_end_bidi_request - Complete a bidi request
  * @rq:         the request to complete
- * @error:      %0 for success, < %0 for error
+ * @error:      block status code
  * @nr_bytes:   number of bytes to complete @rq
  * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
@@ -2766,12 +2873,14 @@ EXPORT_SYMBOL(blk_finish_request);
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-static bool blk_end_bidi_request(struct request *rq, int error,
+static bool blk_end_bidi_request(struct request *rq, blk_status_t error,
 				 unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
+	WARN_ON_ONCE(q->mq_ops);
+
 	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
 		return true;
 
@@ -2785,7 +2894,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
 /**
  * __blk_end_bidi_request - Complete a bidi request with queue lock held
  * @rq:         the request to complete
- * @error:      %0 for success, < %0 for error
+ * @error:      block status code
  * @nr_bytes:   number of bytes to complete @rq
  * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
@@ -2797,9 +2906,12 @@ static bool blk_end_bidi_request(struct request *rq, int error,
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-static bool __blk_end_bidi_request(struct request *rq, int error,
+static bool __blk_end_bidi_request(struct request *rq, blk_status_t error,
 				   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
+	lockdep_assert_held(rq->q->queue_lock);
+	WARN_ON_ONCE(rq->q->mq_ops);
+
 	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
 		return true;
 
@@ -2811,7 +2923,7 @@ static bool __blk_end_bidi_request(struct request *rq, int error,
 /**
  * blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
+ * @error:    block status code
  * @nr_bytes: number of bytes to complete
  *
  * Description:
@@ -2822,8 +2934,10 @@ static bool __blk_end_bidi_request(struct request *rq, int error,
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
+bool blk_end_request(struct request *rq, blk_status_t error,
+		unsigned int nr_bytes)
 {
+	WARN_ON_ONCE(rq->q->mq_ops);
 	return blk_end_bidi_request(rq, error, nr_bytes, 0);
 }
 EXPORT_SYMBOL(blk_end_request);
@@ -2831,12 +2945,12 @@ EXPORT_SYMBOL(blk_end_request);
 /**
  * blk_end_request_all - Helper function for drives to finish the request.
  * @rq: the request to finish
- * @error: %0 for success, < %0 for error
+ * @error: block status code
  *
  * Description:
  *     Completely finish @rq.
  */
-void blk_end_request_all(struct request *rq, int error)
+void blk_end_request_all(struct request *rq, blk_status_t error)
 {
 	bool pending;
 	unsigned int bidi_bytes = 0;
@@ -2852,7 +2966,7 @@ EXPORT_SYMBOL(blk_end_request_all);
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
+ * @error:    block status code
  * @nr_bytes: number of bytes to complete
  *
  * Description:
@@ -2862,8 +2976,12 @@ EXPORT_SYMBOL(blk_end_request_all);
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
+bool __blk_end_request(struct request *rq, blk_status_t error,
+		unsigned int nr_bytes)
 {
+	lockdep_assert_held(rq->q->queue_lock);
+	WARN_ON_ONCE(rq->q->mq_ops);
+
 	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
 }
 EXPORT_SYMBOL(__blk_end_request);
@@ -2871,16 +2989,19 @@ EXPORT_SYMBOL(__blk_end_request);
 /**
  * __blk_end_request_all - Helper function for drives to finish the request.
  * @rq: the request to finish
- * @error: %0 for success, < %0 for error
+ * @error:    block status code
  *
  * Description:
  *     Completely finish @rq.  Must be called with queue lock held.
  */
-void __blk_end_request_all(struct request *rq, int error)
+void __blk_end_request_all(struct request *rq, blk_status_t error)
 {
 	bool pending;
 	unsigned int bidi_bytes = 0;
 
+	lockdep_assert_held(rq->q->queue_lock);
+	WARN_ON_ONCE(rq->q->mq_ops);
+
 	if (unlikely(blk_bidi_rq(rq)))
 		bidi_bytes = blk_rq_bytes(rq->next_rq);
 
@@ -2892,7 +3013,7 @@ EXPORT_SYMBOL(__blk_end_request_all);
 /**
  * __blk_end_request_cur - Helper function to finish the current request chunk.
  * @rq: the request to finish the current chunk for
- * @error: %0 for success, < %0 for error
+ * @error:    block status code
  *
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.  Must
@@ -2902,7 +3023,7 @@ EXPORT_SYMBOL(__blk_end_request_all);
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  */
-bool __blk_end_request_cur(struct request *rq, int error)
+bool __blk_end_request_cur(struct request *rq, blk_status_t error)
 {
 	return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
 }
@@ -3145,6 +3266,8 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
 			    bool from_schedule)
 	__releases(q->queue_lock)
 {
+	lockdep_assert_held(q->queue_lock);
+
 	trace_block_unplug(q, depth, !from_schedule);
 
 	if (from_schedule)
@@ -3243,7 +3366,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		 * Short-circuit if @q is dead
 		 */
 		if (unlikely(blk_queue_dying(q))) {
-			__blk_end_request_all(rq, -ENODEV);
+			__blk_end_request_all(rq, BLK_STS_IOERR);
 			continue;
 		}
 
diff --git a/block/blk-exec.c b/block/blk-exec.c
index a9451e3b8587..5c0f3dc446dc 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -16,7 +16,7 @@
  * @rq: request to complete
  * @error: end I/O status of the request
  */
-static void blk_end_sync_rq(struct request *rq, int error)
+static void blk_end_sync_rq(struct request *rq, blk_status_t error)
 {
 	struct completion *waiting = rq->end_io_data;
 
@@ -69,7 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	if (unlikely(blk_queue_dying(q))) {
 		rq->rq_flags |= RQF_QUIET;
-		__blk_end_request_all(rq, -ENXIO);
+		__blk_end_request_all(rq, BLK_STS_IOERR);
 		spin_unlock_irq(q->queue_lock);
 		return;
 	}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index c4e0880b54bb..ed5fe322abba 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -164,7 +164,7 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
  */
 static bool blk_flush_complete_seq(struct request *rq,
 				   struct blk_flush_queue *fq,
-				   unsigned int seq, int error)
+				   unsigned int seq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
@@ -216,7 +216,7 @@ static bool blk_flush_complete_seq(struct request *rq,
 	return kicked | queued;
 }
 
-static void flush_end_io(struct request *flush_rq, int error)
+static void flush_end_io(struct request *flush_rq, blk_status_t error)
 {
 	struct request_queue *q = flush_rq->q;
 	struct list_head *running;
@@ -341,11 +341,13 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	return blk_flush_queue_rq(flush_rq, false);
 }
 
-static void flush_data_end_io(struct request *rq, int error)
+static void flush_data_end_io(struct request *rq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 
+	lockdep_assert_held(q->queue_lock);
+
 	/*
 	 * Updating q->in_flight[] here for making this tag usable
 	 * early. Because in blk_queue_start_tag(),
@@ -382,7 +384,7 @@ static void flush_data_end_io(struct request *rq, int error)
 		blk_run_queue_async(q);
 }
 
-static void mq_flush_data_end_io(struct request *rq, int error)
+static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_hw_ctx *hctx;
@@ -411,9 +413,6 @@ static void mq_flush_data_end_io(struct request *rq, int error)
  * or __blk_mq_run_hw_queue() to dispatch request.
  * @rq is being submitted.  Analyze what needs to be done and put it on the
  * right queue.
- *
- * CONTEXT:
- * spin_lock_irq(q->queue_lock) in !mq case
  */
 void blk_insert_flush(struct request *rq)
 {
@@ -422,6 +421,9 @@ void blk_insert_flush(struct request *rq)
 	unsigned int policy = blk_flush_policy(fflags, rq);
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
 
+	if (!q->mq_ops)
+		lockdep_assert_held(q->queue_lock);
+
 	/*
 	 * @policy now records what operations need to be done.  Adjust
 	 * REQ_PREFLUSH and FUA for the driver.
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 0f891a9aff4d..feb30570eaf5 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -384,9 +384,9 @@ static struct kobj_type integrity_ktype = {
 	.sysfs_ops	= &integrity_ops,
 };
 
-static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
+static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter)
 {
-	return 0;
+	return BLK_STS_OK;
 }
 
 static const struct blk_integrity_profile nop_profile = {
diff --git a/block/blk-map.c b/block/blk-map.c
index 3b5cb863318f..2547016aa7aa 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -16,6 +16,8 @@
  */
 int blk_rq_append_bio(struct request *rq, struct bio *bio)
 {
+	blk_queue_bounce(rq->q, &bio);
+
 	if (!rq->bio) {
 		blk_rq_bio_prep(rq->q, rq, bio);
 	} else {
@@ -72,15 +74,13 @@ static int __blk_rq_map_user_iov(struct request *rq,
 		map_data->offset += bio->bi_iter.bi_size;
 
 	orig_bio = bio;
-	blk_queue_bounce(q, &bio);
 
 	/*
 	 * We link the bounce buffer in and could have to traverse it
 	 * later so we have to get a ref to prevent it from being freed
 	 */
-	bio_get(bio);
-
 	ret = blk_rq_append_bio(rq, bio);
+	bio_get(bio);
 	if (ret) {
 		bio_endio(bio);
 		__blk_rq_unmap_user(orig_bio);
@@ -249,7 +249,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 		return ret;
 	}
 
-	blk_queue_bounce(q, &rq->bio);
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3990ae406341..99038830fb42 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -108,31 +108,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 	bool do_split = true;
 	struct bio *new = NULL;
 	const unsigned max_sectors = get_max_io_size(q, bio);
-	unsigned bvecs = 0;
 
 	bio_for_each_segment(bv, bio, iter) {
 		/*
-		 * With arbitrary bio size, the incoming bio may be very
-		 * big. We have to split the bio into small bios so that
-		 * each holds at most BIO_MAX_PAGES bvecs because
-		 * bio_clone() can fail to allocate big bvecs.
-		 *
-		 * It should have been better to apply the limit per
-		 * request queue in which bio_clone() is involved,
-		 * instead of globally. The biggest blocker is the
-		 * bio_clone() in bio bounce.
-		 *
-		 * If bio is splitted by this reason, we should have
-		 * allowed to continue bios merging, but don't do
-		 * that now for making the change simple.
-		 *
-		 * TODO: deal with bio bounce's bio_clone() gracefully
-		 * and convert the global limit into per-queue limit.
-		 */
-		if (bvecs++ >= BIO_MAX_PAGES)
-			goto split;
-
-		/*
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
@@ -202,8 +180,7 @@ split:
 	return do_split ? new : NULL;
 }
 
-void blk_queue_split(struct request_queue *q, struct bio **bio,
-		     struct bio_set *bs)
+void blk_queue_split(struct request_queue *q, struct bio **bio)
 {
 	struct bio *split, *res;
 	unsigned nsegs;
@@ -211,13 +188,13 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 	switch (bio_op(*bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
+		split = blk_bio_discard_split(q, *bio, q->bio_split, &nsegs);
 		break;
 	case REQ_OP_WRITE_ZEROES:
-		split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs);
+		split = blk_bio_write_zeroes_split(q, *bio, q->bio_split, &nsegs);
 		break;
 	case REQ_OP_WRITE_SAME:
-		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
+		split = blk_bio_write_same_split(q, *bio, q->bio_split, &nsegs);
 		break;
 	default:
 		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
@@ -671,6 +648,9 @@ static void blk_account_io_merge(struct request *req)
 static struct request *attempt_merge(struct request_queue *q,
 				     struct request *req, struct request *next)
 {
+	if (!q->mq_ops)
+		lockdep_assert_held(q->queue_lock);
+
 	if (!rq_mergeable(req) || !rq_mergeable(next))
 		return NULL;
 
@@ -693,6 +673,13 @@ static struct request *attempt_merge(struct request_queue *q,
 		return NULL;
 
 	/*
+	 * Don't allow merge of different write hints, or for a hint with
+	 * non-hint IO.
+	 */
+	if (req->write_hint != next->write_hint)
+		return NULL;
+
+	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
 	 * will have updated segment counts, update sector
@@ -811,6 +798,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	    !blk_write_same_mergeable(rq->bio, bio))
 		return false;
 
+	/*
+	 * Don't allow merge of different write hints, or for a hint with
+	 * non-hint IO.
+	 */
+	if (rq->write_hint != bio->bi_write_hint)
+		return false;
+
 	return true;
 }
 
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 8e61e8640e17..4891f042a22f 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -14,10 +14,14 @@
 #include "blk.h"
 #include "blk-mq.h"
 
-static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
-			      const int cpu)
+static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
 {
-	return cpu * nr_queues / nr_cpus;
+	/*
+	 * Non online CPU will be mapped to queue index 0.
+	 */
+	if (!cpu_online(cpu))
+		return 0;
+	return cpu % nr_queues;
 }
 
 static int get_first_sibling(unsigned int cpu)
@@ -35,56 +39,26 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set)
 {
 	unsigned int *map = set->mq_map;
 	unsigned int nr_queues = set->nr_hw_queues;
-	const struct cpumask *online_mask = cpu_online_mask;
-	unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
-	cpumask_var_t cpus;
-
-	if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
-		return -ENOMEM;
-
-	cpumask_clear(cpus);
-	nr_cpus = nr_uniq_cpus = 0;
-	for_each_cpu(i, online_mask) {
-		nr_cpus++;
-		first_sibling = get_first_sibling(i);
-		if (!cpumask_test_cpu(first_sibling, cpus))
-			nr_uniq_cpus++;
-		cpumask_set_cpu(i, cpus);
-	}
-
-	queue = 0;
-	for_each_possible_cpu(i) {
-		if (!cpumask_test_cpu(i, online_mask)) {
-			map[i] = 0;
-			continue;
-		}
+	unsigned int cpu, first_sibling;
 
+	for_each_possible_cpu(cpu) {
 		/*
-		 * Easy case - we have equal or more hardware queues. Or
-		 * there are no thread siblings to take into account. Do
-		 * 1:1 if enough, or sequential mapping if less.
+		 * First do sequential mapping between CPUs and queues.
+		 * In case we still have CPUs to map, and we have some number of
+		 * threads per cores then map sibling threads to the same queue for
+		 * performace optimizations.
 		 */
-		if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
-			map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
-			queue++;
-			continue;
+		if (cpu < nr_queues) {
+			map[cpu] = cpu_to_queue_index(nr_queues, cpu);
+		} else {
+			first_sibling = get_first_sibling(cpu);
+			if (first_sibling == cpu)
+				map[cpu] = cpu_to_queue_index(nr_queues, cpu);
+			else
+				map[cpu] = map[first_sibling];
 		}
-
-		/*
-		 * Less then nr_cpus queues, and we have some number of
-		 * threads per cores. Map sibling threads to the same
-		 * queue.
-		 */
-		first_sibling = get_first_sibling(i);
-		if (first_sibling == i) {
-			map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
-							queue);
-			queue++;
-		} else
-			map[i] = map[first_sibling];
 	}
 
-	free_cpumask_var(cpus);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(blk_mq_map_queues);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 803aed4d7221..9ebc2945f991 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -114,10 +114,12 @@ static ssize_t queue_state_write(void *data, const char __user *buf,
 		blk_mq_run_hw_queues(q, true);
 	} else if (strcmp(op, "start") == 0) {
 		blk_mq_start_stopped_hw_queues(q, true);
+	} else if (strcmp(op, "kick") == 0) {
+		blk_mq_kick_requeue_list(q);
 	} else {
 		pr_err("%s: unsupported operation '%s'\n", __func__, op);
 inval:
-		pr_err("%s: use either 'run' or 'start'\n", __func__);
+		pr_err("%s: use 'run', 'start' or 'kick'\n", __func__);
 		return -EINVAL;
 	}
 	return count;
@@ -133,6 +135,29 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
 	}
 }
 
+static int queue_write_hint_show(void *data, struct seq_file *m)
+{
+	struct request_queue *q = data;
+	int i;
+
+	for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
+		seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]);
+
+	return 0;
+}
+
+static ssize_t queue_write_hint_store(void *data, const char __user *buf,
+				      size_t count, loff_t *ppos)
+{
+	struct request_queue *q = data;
+	int i;
+
+	for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
+		q->write_hints[i] = 0;
+
+	return count;
+}
+
 static int queue_poll_stat_show(void *data, struct seq_file *m)
 {
 	struct request_queue *q = data;
@@ -267,6 +292,14 @@ static const char *const rqf_name[] = {
 };
 #undef RQF_NAME
 
+#define RQAF_NAME(name) [REQ_ATOM_##name] = #name
+static const char *const rqaf_name[] = {
+	RQAF_NAME(COMPLETE),
+	RQAF_NAME(STARTED),
+	RQAF_NAME(POLL_SLEPT),
+};
+#undef RQAF_NAME
+
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 {
 	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
@@ -283,6 +316,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
 	seq_puts(m, ", .rq_flags=");
 	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
 		       ARRAY_SIZE(rqf_name));
+	seq_puts(m, ", .atomic_flags=");
+	blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name));
 	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
 		   rq->internal_tag);
 	if (mq_ops->show_rq)
@@ -298,6 +333,37 @@ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
 }
 EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show);
 
+static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos)
+	__acquires(&q->requeue_lock)
+{
+	struct request_queue *q = m->private;
+
+	spin_lock_irq(&q->requeue_lock);
+	return seq_list_start(&q->requeue_list, *pos);
+}
+
+static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct request_queue *q = m->private;
+
+	return seq_list_next(v, &q->requeue_list, pos);
+}
+
+static void queue_requeue_list_stop(struct seq_file *m, void *v)
+	__releases(&q->requeue_lock)
+{
+	struct request_queue *q = m->private;
+
+	spin_unlock_irq(&q->requeue_lock);
+}
+
+static const struct seq_operations queue_requeue_list_seq_ops = {
+	.start	= queue_requeue_list_start,
+	.next	= queue_requeue_list_next,
+	.stop	= queue_requeue_list_stop,
+	.show	= blk_mq_debugfs_rq_show,
+};
+
 static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos)
 	__acquires(&hctx->lock)
 {
@@ -329,6 +395,36 @@ static const struct seq_operations hctx_dispatch_seq_ops = {
 	.show	= blk_mq_debugfs_rq_show,
 };
 
+struct show_busy_params {
+	struct seq_file		*m;
+	struct blk_mq_hw_ctx	*hctx;
+};
+
+/*
+ * Note: the state of a request may change while this function is in progress,
+ * e.g. due to a concurrent blk_mq_finish_request() call.
+ */
+static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
+{
+	const struct show_busy_params *params = data;
+
+	if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
+	    test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+		__blk_mq_debugfs_rq_show(params->m,
+					 list_entry_rq(&rq->queuelist));
+}
+
+static int hctx_busy_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+	struct show_busy_params params = { .m = m, .hctx = hctx };
+
+	blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq,
+				&params);
+
+	return 0;
+}
+
 static int hctx_ctx_map_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_hw_ctx *hctx = data;
@@ -655,7 +751,9 @@ const struct file_operations blk_mq_debugfs_fops = {
 
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
 	{"poll_stat", 0400, queue_poll_stat_show},
+	{"requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops},
 	{"state", 0600, queue_state_show, queue_state_write},
+	{"write_hints", 0600, queue_write_hint_show, queue_write_hint_store},
 	{},
 };
 
@@ -663,6 +761,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"state", 0400, hctx_state_show},
 	{"flags", 0400, hctx_flags_show},
 	{"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops},
+	{"busy", 0400, hctx_busy_show},
 	{"ctx_map", 0400, hctx_ctx_map_show},
 	{"tags", 0400, hctx_tags_show},
 	{"tags_bitmap", 0400, hctx_tags_bitmap_show},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 1f5b692526ae..7f0dc48ffb40 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -31,11 +31,10 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
 
-static void __blk_mq_sched_assign_ioc(struct request_queue *q,
-				      struct request *rq,
-				      struct bio *bio,
-				      struct io_context *ioc)
+void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
 {
+	struct request_queue *q = rq->q;
+	struct io_context *ioc = rq_ioc(bio);
 	struct io_cq *icq;
 
 	spin_lock_irq(q->queue_lock);
@@ -47,90 +46,47 @@ static void __blk_mq_sched_assign_ioc(struct request_queue *q,
 		if (!icq)
 			return;
 	}
-
+	get_io_context(icq->ioc);
 	rq->elv.icq = icq;
-	if (!blk_mq_sched_get_rq_priv(q, rq, bio)) {
-		rq->rq_flags |= RQF_ELVPRIV;
-		get_io_context(icq->ioc);
-		return;
-	}
-
-	rq->elv.icq = NULL;
 }
 
-static void blk_mq_sched_assign_ioc(struct request_queue *q,
-				    struct request *rq, struct bio *bio)
+/*
+ * Mark a hardware queue as needing a restart. For shared queues, maintain
+ * a count of how many hardware queues are marked for restart.
+ */
+static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
-	struct io_context *ioc;
+	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+		return;
+
+	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct request_queue *q = hctx->queue;
 
-	ioc = rq_ioc(bio);
-	if (ioc)
-		__blk_mq_sched_assign_ioc(q, rq, bio, ioc);
+		if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+			atomic_inc(&q->shared_hctx_restart);
+	} else
+		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 }
 
-struct request *blk_mq_sched_get_request(struct request_queue *q,
-					 struct bio *bio,
-					 unsigned int op,
-					 struct blk_mq_alloc_data *data)
+static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
-	struct elevator_queue *e = q->elevator;
-	struct request *rq;
-
-	blk_queue_enter_live(q);
-	data->q = q;
-	if (likely(!data->ctx))
-		data->ctx = blk_mq_get_ctx(q);
-	if (likely(!data->hctx))
-		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
-
-	if (e) {
-		data->flags |= BLK_MQ_REQ_INTERNAL;
-
-		/*
-		 * Flush requests are special and go directly to the
-		 * dispatch list.
-		 */
-		if (!op_is_flush(op) && e->type->ops.mq.get_request) {
-			rq = e->type->ops.mq.get_request(q, op, data);
-			if (rq)
-				rq->rq_flags |= RQF_QUEUED;
-		} else
-			rq = __blk_mq_alloc_request(data, op);
-	} else {
-		rq = __blk_mq_alloc_request(data, op);
-	}
-
-	if (rq) {
-		if (!op_is_flush(op)) {
-			rq->elv.icq = NULL;
-			if (e && e->type->icq_cache)
-				blk_mq_sched_assign_ioc(q, rq, bio);
-		}
-		data->hctx->queued++;
-		return rq;
-	}
+	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+		return false;
 
-	blk_queue_exit(q);
-	return NULL;
-}
+	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct request_queue *q = hctx->queue;
 
-void blk_mq_sched_put_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-	struct elevator_queue *e = q->elevator;
+		if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+			atomic_dec(&q->shared_hctx_restart);
+	} else
+		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 
-	if (rq->rq_flags & RQF_ELVPRIV) {
-		blk_mq_sched_put_rq_priv(rq->q, rq);
-		if (rq->elv.icq) {
-			put_io_context(rq->elv.icq->ioc);
-			rq->elv.icq = NULL;
-		}
+	if (blk_mq_hctx_has_pending(hctx)) {
+		blk_mq_run_hw_queue(hctx, true);
+		return true;
 	}
 
-	if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
-		e->type->ops.mq.put_request(rq);
-	else
-		blk_mq_finish_request(rq);
+	return false;
 }
 
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -141,7 +97,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	bool did_work = false;
 	LIST_HEAD(rq_list);
 
-	if (unlikely(blk_mq_hctx_stopped(hctx)))
+	/* RCU or SRCU read lock is needed before checking quiesced flag */
+	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
 		return;
 
 	hctx->run++;
@@ -221,19 +178,73 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 
+/*
+ * Reverse check our software queue for entries that we could potentially
+ * merge with. Currently includes a hand-wavy stop count of 8, to not spend
+ * too much time checking for merges.
+ */
+static bool blk_mq_attempt_merge(struct request_queue *q,
+				 struct blk_mq_ctx *ctx, struct bio *bio)
+{
+	struct request *rq;
+	int checked = 8;
+
+	lockdep_assert_held(&ctx->lock);
+
+	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
+		bool merged = false;
+
+		if (!checked--)
+			break;
+
+		if (!blk_rq_merge_ok(rq, bio))
+			continue;
+
+		switch (blk_try_merge(rq, bio)) {
+		case ELEVATOR_BACK_MERGE:
+			if (blk_mq_sched_allow_merge(q, rq, bio))
+				merged = bio_attempt_back_merge(q, rq, bio);
+			break;
+		case ELEVATOR_FRONT_MERGE:
+			if (blk_mq_sched_allow_merge(q, rq, bio))
+				merged = bio_attempt_front_merge(q, rq, bio);
+			break;
+		case ELEVATOR_DISCARD_MERGE:
+			merged = bio_attempt_discard_merge(q, rq, bio);
+			break;
+		default:
+			continue;
+		}
+
+		if (merged)
+			ctx->rq_merged++;
+		return merged;
+	}
+
+	return false;
+}
+
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
 	struct elevator_queue *e = q->elevator;
+	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	bool ret = false;
 
-	if (e->type->ops.mq.bio_merge) {
-		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-		struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
+	if (e && e->type->ops.mq.bio_merge) {
 		blk_mq_put_ctx(ctx);
 		return e->type->ops.mq.bio_merge(hctx, bio);
 	}
 
-	return false;
+	if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
+		/* default per sw-queue merge */
+		spin_lock(&ctx->lock);
+		ret = blk_mq_attempt_merge(q, ctx, bio);
+		spin_unlock(&ctx->lock);
+	}
+
+	blk_mq_put_ctx(ctx);
+	return ret;
 }
 
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
@@ -266,18 +277,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 	return true;
 }
 
-static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
-{
-	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
-		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-		if (blk_mq_hctx_has_pending(hctx)) {
-			blk_mq_run_hw_queue(hctx, true);
-			return true;
-		}
-	}
-	return false;
-}
-
 /**
  * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
  * @pos:    loop cursor.
@@ -309,6 +308,13 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
 	unsigned int i, j;
 
 	if (set->flags & BLK_MQ_F_TAG_SHARED) {
+		/*
+		 * If this is 0, then we know that no hardware queues
+		 * have RESTART marked. We're done.
+		 */
+		if (!atomic_read(&queue->shared_hctx_restart))
+			return;
+
 		rcu_read_lock();
 		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
 					   tag_set_list) {
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index edafb5383b7b..9267d0b7c197 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -7,8 +7,7 @@
 void blk_mq_sched_free_hctx_data(struct request_queue *q,
 				 void (*exit)(struct blk_mq_hw_ctx *));
 
-struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
-void blk_mq_sched_put_request(struct request *rq);
+void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio);
 
 void blk_mq_sched_request_inserted(struct request *rq);
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
@@ -38,35 +37,12 @@ int blk_mq_sched_init(struct request_queue *q);
 static inline bool
 blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
-	struct elevator_queue *e = q->elevator;
-
-	if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio))
+	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
 		return false;
 
 	return __blk_mq_sched_bio_merge(q, bio);
 }
 
-static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
-					   struct request *rq,
-					   struct bio *bio)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e && e->type->ops.mq.get_rq_priv)
-		return e->type->ops.mq.get_rq_priv(q, rq, bio);
-
-	return 0;
-}
-
-static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
-					    struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e && e->type->ops.mq.put_rq_priv)
-		e->type->ops.mq.put_rq_priv(q, rq);
-}
-
 static inline bool
 blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
 			 struct bio *bio)
@@ -115,15 +91,6 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
 	return false;
 }
 
-/*
- * Mark a hardware queue as needing a restart.
- */
-static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
-{
-	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
-}
-
 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
 {
 	return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a69ad122ed66..6cef42f419a5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -37,12 +37,8 @@
 #include "blk-wbt.h"
 #include "blk-mq-sched.h"
 
-static DEFINE_MUTEX(all_q_mutex);
-static LIST_HEAD(all_q_list);
-
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync);
 
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
@@ -154,13 +150,28 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
+/*
+ * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
+ * mpt3sas driver such that this function can be removed.
+ */
+void blk_mq_quiesce_queue_nowait(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
+
 /**
- * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished
+ * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
  * @q: request queue.
  *
  * Note: this function does not prevent that the struct request end_io()
- * callback function is invoked. Additionally, it is not prevented that
- * new queue_rq() calls occur unless the queue has been stopped first.
+ * callback function is invoked. Once this function is returned, we make
+ * sure no dispatch can happen until the queue is unquiesced via
+ * blk_mq_unquiesce_queue().
  */
 void blk_mq_quiesce_queue(struct request_queue *q)
 {
@@ -168,11 +179,11 @@ void blk_mq_quiesce_queue(struct request_queue *q)
 	unsigned int i;
 	bool rcu = false;
 
-	__blk_mq_stop_hw_queues(q, true);
+	blk_mq_quiesce_queue_nowait(q);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(&hctx->queue_rq_srcu);
+			synchronize_srcu(hctx->queue_rq_srcu);
 		else
 			rcu = true;
 	}
@@ -181,6 +192,26 @@ void blk_mq_quiesce_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 
+/*
+ * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
+ * @q: request queue.
+ *
+ * This function recovers queue into the state before quiescing
+ * which is done by blk_mq_quiesce_queue.
+ */
+void blk_mq_unquiesce_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/* dispatch requests which are inserted during quiescing */
+	blk_mq_run_hw_queues(q, true);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
+
 void blk_mq_wake_waiters(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
@@ -204,15 +235,33 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
-void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-			struct request *rq, unsigned int op)
+static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
+		unsigned int tag, unsigned int op)
 {
+	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+	struct request *rq = tags->static_rqs[tag];
+
+	rq->rq_flags = 0;
+
+	if (data->flags & BLK_MQ_REQ_INTERNAL) {
+		rq->tag = -1;
+		rq->internal_tag = tag;
+	} else {
+		if (blk_mq_tag_busy(data->hctx)) {
+			rq->rq_flags = RQF_MQ_INFLIGHT;
+			atomic_inc(&data->hctx->nr_active);
+		}
+		rq->tag = tag;
+		rq->internal_tag = -1;
+		data->hctx->tags->rqs[rq->tag] = rq;
+	}
+
 	INIT_LIST_HEAD(&rq->queuelist);
 	/* csd/requeue_work/fifo_time is initialized before use */
-	rq->q = q;
-	rq->mq_ctx = ctx;
+	rq->q = data->q;
+	rq->mq_ctx = data->ctx;
 	rq->cmd_flags = op;
-	if (blk_queue_io_stat(q))
+	if (blk_queue_io_stat(data->q))
 		rq->rq_flags |= RQF_IO_STAT;
 	/* do not touch atomic flags, it needs atomic ops against the timer */
 	rq->cpu = -1;
@@ -241,44 +290,60 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 	rq->end_io_data = NULL;
 	rq->next_rq = NULL;
 
-	ctx->rq_dispatched[op_is_sync(op)]++;
+	data->ctx->rq_dispatched[op_is_sync(op)]++;
+	return rq;
 }
-EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
 
-struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
-				       unsigned int op)
+static struct request *blk_mq_get_request(struct request_queue *q,
+		struct bio *bio, unsigned int op,
+		struct blk_mq_alloc_data *data)
 {
+	struct elevator_queue *e = q->elevator;
 	struct request *rq;
 	unsigned int tag;
 
-	tag = blk_mq_get_tag(data);
-	if (tag != BLK_MQ_TAG_FAIL) {
-		struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+	blk_queue_enter_live(q);
+	data->q = q;
+	if (likely(!data->ctx))
+		data->ctx = blk_mq_get_ctx(q);
+	if (likely(!data->hctx))
+		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
+	if (op & REQ_NOWAIT)
+		data->flags |= BLK_MQ_REQ_NOWAIT;
 
-		rq = tags->static_rqs[tag];
+	if (e) {
+		data->flags |= BLK_MQ_REQ_INTERNAL;
 
-		if (data->flags & BLK_MQ_REQ_INTERNAL) {
-			rq->tag = -1;
-			rq->internal_tag = tag;
-		} else {
-			if (blk_mq_tag_busy(data->hctx)) {
-				rq->rq_flags = RQF_MQ_INFLIGHT;
-				atomic_inc(&data->hctx->nr_active);
-			}
-			rq->tag = tag;
-			rq->internal_tag = -1;
-			data->hctx->tags->rqs[rq->tag] = rq;
-		}
+		/*
+		 * Flush requests are special and go directly to the
+		 * dispatch list.
+		 */
+		if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
+			e->type->ops.mq.limit_depth(op, data);
+	}
 
-		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
-		return rq;
+	tag = blk_mq_get_tag(data);
+	if (tag == BLK_MQ_TAG_FAIL) {
+		blk_queue_exit(q);
+		return NULL;
 	}
 
-	return NULL;
+	rq = blk_mq_rq_ctx_init(data, tag, op);
+	if (!op_is_flush(op)) {
+		rq->elv.icq = NULL;
+		if (e && e->type->ops.mq.prepare_request) {
+			if (e->type->icq_cache && rq_ioc(bio))
+				blk_mq_sched_assign_ioc(rq, bio);
+
+			e->type->ops.mq.prepare_request(rq, bio);
+			rq->rq_flags |= RQF_ELVPRIV;
+		}
+	}
+	data->hctx->queued++;
+	return rq;
 }
-EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
 
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		unsigned int flags)
 {
 	struct blk_mq_alloc_data alloc_data = { .flags = flags };
@@ -289,7 +354,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 	if (ret)
 		return ERR_PTR(ret);
 
-	rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
+	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
 
 	blk_mq_put_ctx(alloc_data.ctx);
 	blk_queue_exit(q);
@@ -304,8 +369,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 }
 EXPORT_SYMBOL(blk_mq_alloc_request);
 
-struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
-		unsigned int flags, unsigned int hctx_idx)
+struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
+		unsigned int op, unsigned int flags, unsigned int hctx_idx)
 {
 	struct blk_mq_alloc_data alloc_data = { .flags = flags };
 	struct request *rq;
@@ -340,7 +405,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 	cpu = cpumask_first(alloc_data.hctx->cpumask);
 	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 
-	rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
+	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
 
 	blk_queue_exit(q);
 
@@ -351,17 +416,28 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-			     struct request *rq)
+void blk_mq_free_request(struct request *rq)
 {
-	const int sched_tag = rq->internal_tag;
 	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	const int sched_tag = rq->internal_tag;
+
+	if (rq->rq_flags & RQF_ELVPRIV) {
+		if (e && e->type->ops.mq.finish_request)
+			e->type->ops.mq.finish_request(rq);
+		if (rq->elv.icq) {
+			put_io_context(rq->elv.icq->ioc);
+			rq->elv.icq = NULL;
+		}
+	}
 
+	ctx->rq_completed[rq_is_sync(rq)]++;
 	if (rq->rq_flags & RQF_MQ_INFLIGHT)
 		atomic_dec(&hctx->nr_active);
 
 	wbt_done(q->rq_wb, &rq->issue_stat);
-	rq->rq_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
@@ -372,29 +448,9 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 	blk_mq_sched_restart(hctx);
 	blk_queue_exit(q);
 }
-
-static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
-				     struct request *rq)
-{
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-
-	ctx->rq_completed[rq_is_sync(rq)]++;
-	__blk_mq_finish_request(hctx, ctx, rq);
-}
-
-void blk_mq_finish_request(struct request *rq)
-{
-	blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_finish_request);
-
-void blk_mq_free_request(struct request *rq)
-{
-	blk_mq_sched_put_request(rq);
-}
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
-inline void __blk_mq_end_request(struct request *rq, int error)
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
 	blk_account_io_done(rq);
 
@@ -409,7 +465,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
 }
 EXPORT_SYMBOL(__blk_mq_end_request);
 
-void blk_mq_end_request(struct request *rq, int error)
+void blk_mq_end_request(struct request *rq, blk_status_t error)
 {
 	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 		BUG();
@@ -628,25 +684,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
-void blk_mq_abort_requeue_list(struct request_queue *q)
-{
-	unsigned long flags;
-	LIST_HEAD(rq_list);
-
-	spin_lock_irqsave(&q->requeue_lock, flags);
-	list_splice_init(&q->requeue_list, &rq_list);
-	spin_unlock_irqrestore(&q->requeue_lock, flags);
-
-	while (!list_empty(&rq_list)) {
-		struct request *rq;
-
-		rq = list_first_entry(&rq_list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		blk_mq_end_request(rq, -EIO);
-	}
-}
-EXPORT_SYMBOL(blk_mq_abort_requeue_list);
-
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	if (tag < tags->nr_tags) {
@@ -772,50 +809,6 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	blk_queue_exit(q);
 }
 
-/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
- */
-static bool blk_mq_attempt_merge(struct request_queue *q,
-				 struct blk_mq_ctx *ctx, struct bio *bio)
-{
-	struct request *rq;
-	int checked = 8;
-
-	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
-		bool merged = false;
-
-		if (!checked--)
-			break;
-
-		if (!blk_rq_merge_ok(rq, bio))
-			continue;
-
-		switch (blk_try_merge(rq, bio)) {
-		case ELEVATOR_BACK_MERGE:
-			if (blk_mq_sched_allow_merge(q, rq, bio))
-				merged = bio_attempt_back_merge(q, rq, bio);
-			break;
-		case ELEVATOR_FRONT_MERGE:
-			if (blk_mq_sched_allow_merge(q, rq, bio))
-				merged = bio_attempt_front_merge(q, rq, bio);
-			break;
-		case ELEVATOR_DISCARD_MERGE:
-			merged = bio_attempt_discard_merge(q, rq, bio);
-			break;
-		default:
-			continue;
-		}
-
-		if (merged)
-			ctx->rq_merged++;
-		return merged;
-	}
-
-	return false;
-}
-
 struct flush_busy_ctx_data {
 	struct blk_mq_hw_ctx *hctx;
 	struct list_head *list;
@@ -945,14 +938,14 @@ static bool reorder_tags_to_front(struct list_head *list)
 	return first != NULL;
 }
 
-static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 				void *key)
 {
 	struct blk_mq_hw_ctx *hctx;
 
 	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
 
-	list_del(&wait->task_list);
+	list_del(&wait->entry);
 	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
 	blk_mq_run_hw_queue(hctx, true);
 	return 1;
@@ -987,7 +980,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq;
-	int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
+	int errors, queued;
 
 	if (list_empty(list))
 		return false;
@@ -998,6 +991,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 	errors = queued = 0;
 	do {
 		struct blk_mq_queue_data bd;
+		blk_status_t ret;
 
 		rq = list_first_entry(list, struct request, queuelist);
 		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
@@ -1038,25 +1032,20 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 		}
 
 		ret = q->mq_ops->queue_rq(hctx, &bd);
-		switch (ret) {
-		case BLK_MQ_RQ_QUEUE_OK:
-			queued++;
-			break;
-		case BLK_MQ_RQ_QUEUE_BUSY:
+		if (ret == BLK_STS_RESOURCE) {
 			blk_mq_put_driver_tag_hctx(hctx, rq);
 			list_add(&rq->queuelist, list);
 			__blk_mq_requeue_request(rq);
 			break;
-		default:
-			pr_err("blk-mq: bad return on queue: %d\n", ret);
-		case BLK_MQ_RQ_QUEUE_ERROR:
+		}
+
+		if (unlikely(ret != BLK_STS_OK)) {
 			errors++;
-			blk_mq_end_request(rq, -EIO);
-			break;
+			blk_mq_end_request(rq, BLK_STS_IOERR);
+			continue;
 		}
 
-		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
-			break;
+		queued++;
 	} while (!list_empty(list));
 
 	hctx->dispatched[queued_to_index(queued)]++;
@@ -1094,7 +1083,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 		 * - blk_mq_run_hw_queue() checks whether or not a queue has
 		 *   been stopped before rerunning a queue.
 		 * - Some but not all block drivers stop a queue before
-		 *   returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
+		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
 		 *   and dm-rq.
 		 */
 		if (!blk_mq_sched_needs_restart(hctx) &&
@@ -1119,9 +1108,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	} else {
 		might_sleep();
 
-		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
 		blk_mq_sched_dispatch_requests(hctx);
-		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
 	}
 }
 
@@ -1153,8 +1142,10 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
 					unsigned long msecs)
 {
-	if (unlikely(blk_mq_hctx_stopped(hctx) ||
-		     !blk_mq_hw_queue_mapped(hctx)))
+	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
+		return;
+
+	if (unlikely(blk_mq_hctx_stopped(hctx)))
 		return;
 
 	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
@@ -1220,34 +1211,39 @@ bool blk_mq_queue_stopped(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_queue_stopped);
 
-static void __blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx, bool sync)
+/*
+ * This function is often used for pausing .queue_rq() by driver when
+ * there isn't enough resource or some conditions aren't satisfied, and
+ * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
+ *
+ * We do not guarantee that dispatch can be drained or blocked
+ * after blk_mq_stop_hw_queue() returns. Please use
+ * blk_mq_quiesce_queue() for that requirement.
+ */
+void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-	if (sync)
-		cancel_delayed_work_sync(&hctx->run_work);
-	else
-		cancel_delayed_work(&hctx->run_work);
+	cancel_delayed_work(&hctx->run_work);
 
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
-
-void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
-{
-	__blk_mq_stop_hw_queue(hctx, false);
-}
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 
-static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync)
+/*
+ * This function is often used for pausing .queue_rq() by driver when
+ * there isn't enough resource or some conditions aren't satisfied, and
+ * BLK_MQ_RQ_QUEUE_BUSY is usually returned.
+ *
+ * We do not guarantee that dispatch can be drained or blocked
+ * after blk_mq_stop_hw_queues() returns. Please use
+ * blk_mq_quiesce_queue() for that requirement.
+ */
+void blk_mq_stop_hw_queues(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i)
-		__blk_mq_stop_hw_queue(hctx, sync);
-}
-
-void blk_mq_stop_hw_queues(struct request_queue *q)
-{
-	__blk_mq_stop_hw_queues(q, false);
+		blk_mq_stop_hw_queue(hctx);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 
@@ -1314,7 +1310,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
-	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
 		return;
 
 	/*
@@ -1336,6 +1332,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
+	lockdep_assert_held(&ctx->lock);
+
 	trace_block_rq_insert(hctx->queue, rq);
 
 	if (at_head)
@@ -1349,6 +1347,8 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
+	lockdep_assert_held(&ctx->lock);
+
 	__blk_mq_insert_req_list(hctx, rq, at_head);
 	blk_mq_hctx_mark_pending(hctx, ctx);
 }
@@ -1446,30 +1446,13 @@ static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
 		!blk_queue_nomerges(hctx->queue);
 }
 
-static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
-					 struct blk_mq_ctx *ctx,
-					 struct request *rq, struct bio *bio)
+static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
+				   struct blk_mq_ctx *ctx,
+				   struct request *rq)
 {
-	if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) {
-		blk_mq_bio_to_request(rq, bio);
-		spin_lock(&ctx->lock);
-insert_rq:
-		__blk_mq_insert_request(hctx, rq, false);
-		spin_unlock(&ctx->lock);
-		return false;
-	} else {
-		struct request_queue *q = hctx->queue;
-
-		spin_lock(&ctx->lock);
-		if (!blk_mq_attempt_merge(q, ctx, bio)) {
-			blk_mq_bio_to_request(rq, bio);
-			goto insert_rq;
-		}
-
-		spin_unlock(&ctx->lock);
-		__blk_mq_finish_request(hctx, ctx, rq);
-		return true;
-	}
+	spin_lock(&ctx->lock);
+	__blk_mq_insert_request(hctx, rq, false);
+	spin_unlock(&ctx->lock);
 }
 
 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
@@ -1480,22 +1463,29 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
 }
 
-static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
-				      bool may_sleep)
+static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+					struct request *rq,
+					blk_qc_t *cookie, bool may_sleep)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
 		.last = true,
 	};
-	struct blk_mq_hw_ctx *hctx;
 	blk_qc_t new_cookie;
-	int ret;
+	blk_status_t ret;
+	bool run_queue = true;
+
+	/* RCU or SRCU read lock is needed before checking quiesced flag */
+	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
+		run_queue = false;
+		goto insert;
+	}
 
 	if (q->elevator)
 		goto insert;
 
-	if (!blk_mq_get_driver_tag(rq, &hctx, false))
+	if (!blk_mq_get_driver_tag(rq, NULL, false))
 		goto insert;
 
 	new_cookie = request_to_qc_t(hctx, rq);
@@ -1506,20 +1496,21 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
 	 * would have done
 	 */
 	ret = q->mq_ops->queue_rq(hctx, &bd);
-	if (ret == BLK_MQ_RQ_QUEUE_OK) {
+	switch (ret) {
+	case BLK_STS_OK:
 		*cookie = new_cookie;
 		return;
-	}
-
-	if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+	case BLK_STS_RESOURCE:
+		__blk_mq_requeue_request(rq);
+		goto insert;
+	default:
 		*cookie = BLK_QC_T_NONE;
-		blk_mq_end_request(rq, -EIO);
+		blk_mq_end_request(rq, ret);
 		return;
 	}
 
-	__blk_mq_requeue_request(rq);
 insert:
-	blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
+	blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
 }
 
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
@@ -1527,16 +1518,16 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 		rcu_read_lock();
-		__blk_mq_try_issue_directly(rq, cookie, false);
+		__blk_mq_try_issue_directly(hctx, rq, cookie, false);
 		rcu_read_unlock();
 	} else {
 		unsigned int srcu_idx;
 
 		might_sleep();
 
-		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-		__blk_mq_try_issue_directly(rq, cookie, true);
-		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+		srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
+		__blk_mq_try_issue_directly(hctx, rq, cookie, true);
+		srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
 	}
 }
 
@@ -1554,7 +1545,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_queue_bounce(q, &bio);
 
-	blk_queue_split(q, &bio, q->bio_split);
+	blk_queue_split(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 		bio_io_error(bio);
@@ -1572,9 +1563,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	trace_block_getrq(q, bio, bio->bi_opf);
 
-	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
+	rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
+		if (bio->bi_opf & REQ_NOWAIT)
+			bio_wouldblock_error(bio);
 		return BLK_QC_T_NONE;
 	}
 
@@ -1638,9 +1631,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 		blk_mq_put_ctx(data.ctx);
 
-		if (same_queue_rq)
+		if (same_queue_rq) {
+			data.hctx = blk_mq_map_queue(q,
+					same_queue_rq->mq_ctx->cpu);
 			blk_mq_try_issue_directly(data.hctx, same_queue_rq,
 					&cookie);
+		}
 	} else if (q->nr_hw_queues > 1 && is_sync) {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
@@ -1649,11 +1645,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 		blk_mq_sched_insert_request(rq, false, true, true, true);
-	} else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+	} else {
 		blk_mq_put_ctx(data.ctx);
+		blk_mq_bio_to_request(rq, bio);
+		blk_mq_queue_io(data.hctx, data.ctx, rq);
 		blk_mq_run_hw_queue(data.hctx, true);
-	} else
-		blk_mq_put_ctx(data.ctx);
+	}
 
 	return cookie;
 }
@@ -1876,7 +1873,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 		set->ops->exit_hctx(hctx, hctx_idx);
 
 	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		cleanup_srcu_struct(&hctx->queue_rq_srcu);
+		cleanup_srcu_struct(hctx->queue_rq_srcu);
 
 	blk_mq_remove_cpuhp(hctx);
 	blk_free_flush_queue(hctx->fq);
@@ -1910,7 +1907,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
 	hctx->queue = q;
-	hctx->queue_num = hctx_idx;
 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
 
 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
@@ -1949,7 +1945,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 		goto free_fq;
 
 	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(&hctx->queue_rq_srcu);
+		init_srcu_struct(hctx->queue_rq_srcu);
 
 	blk_mq_debugfs_register_hctx(q, hctx);
 
@@ -1985,8 +1981,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		INIT_LIST_HEAD(&__ctx->rq_list);
 		__ctx->queue = q;
 
-		/* If the cpu isn't online, the cpu is mapped to first hctx */
-		if (!cpu_online(i))
+		/* If the cpu isn't present, the cpu is mapped to first hctx */
+		if (!cpu_present(i))
 			continue;
 
 		hctx = blk_mq_map_queue(q, i);
@@ -2029,8 +2025,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 	}
 }
 
-static void blk_mq_map_swqueue(struct request_queue *q,
-			       const struct cpumask *online_mask)
+static void blk_mq_map_swqueue(struct request_queue *q)
 {
 	unsigned int i, hctx_idx;
 	struct blk_mq_hw_ctx *hctx;
@@ -2048,13 +2043,11 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 	}
 
 	/*
-	 * Map software to hardware queues
+	 * Map software to hardware queues.
+	 *
+	 * If the cpu isn't present, the cpu is mapped to first hctx.
 	 */
-	for_each_possible_cpu(i) {
-		/* If the cpu isn't online, the cpu is mapped to first hctx */
-		if (!cpumask_test_cpu(i, online_mask))
-			continue;
-
+	for_each_present_cpu(i) {
 		hctx_idx = q->mq_map[i];
 		/* unmapped hw queue can be remapped after CPU topo changed */
 		if (!set->tags[hctx_idx] &&
@@ -2113,20 +2106,30 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 	}
 }
 
+/*
+ * Caller needs to ensure that we're either frozen/quiesced, or that
+ * the queue isn't live yet.
+ */
 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		if (shared)
+		if (shared) {
+			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+				atomic_inc(&q->shared_hctx_restart);
 			hctx->flags |= BLK_MQ_F_TAG_SHARED;
-		else
+		} else {
+			if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+				atomic_dec(&q->shared_hctx_restart);
 			hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+		}
 	}
 }
 
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
+static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
+					bool shared)
 {
 	struct request_queue *q;
 
@@ -2224,6 +2227,20 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_init_queue);
 
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
+{
+	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
+
+	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
+			   __alignof__(struct blk_mq_hw_ctx)) !=
+		     sizeof(struct blk_mq_hw_ctx));
+
+	if (tag_set->flags & BLK_MQ_F_BLOCKING)
+		hw_ctx_size += sizeof(struct srcu_struct);
+
+	return hw_ctx_size;
+}
+
 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 						struct request_queue *q)
 {
@@ -2238,7 +2255,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 			continue;
 
 		node = blk_mq_hw_queue_to_node(q->mq_map, i);
-		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
+		hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
 					GFP_KERNEL, node);
 		if (!hctxs[i])
 			break;
@@ -2340,16 +2357,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		blk_queue_softirq_done(q, set->ops->complete);
 
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
-
-	get_online_cpus();
-	mutex_lock(&all_q_mutex);
-
-	list_add_tail(&q->all_q_node, &all_q_list);
 	blk_mq_add_queue_tag_set(set, q);
-	blk_mq_map_swqueue(q, cpu_online_mask);
-
-	mutex_unlock(&all_q_mutex);
-	put_online_cpus();
+	blk_mq_map_swqueue(q);
 
 	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
 		int ret;
@@ -2375,18 +2384,12 @@ void blk_mq_free_queue(struct request_queue *q)
 {
 	struct blk_mq_tag_set	*set = q->tag_set;
 
-	mutex_lock(&all_q_mutex);
-	list_del_init(&q->all_q_node);
-	mutex_unlock(&all_q_mutex);
-
 	blk_mq_del_queue_tag_set(q);
-
 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
 }
 
 /* Basically redo blk_mq_init_queue with queue frozen */
-static void blk_mq_queue_reinit(struct request_queue *q,
-				const struct cpumask *online_mask)
+static void blk_mq_queue_reinit(struct request_queue *q)
 {
 	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
 
@@ -2399,76 +2402,12 @@ static void blk_mq_queue_reinit(struct request_queue *q,
 	 * involves free and re-allocate memory, worthy doing?)
 	 */
 
-	blk_mq_map_swqueue(q, online_mask);
+	blk_mq_map_swqueue(q);
 
 	blk_mq_sysfs_register(q);
 	blk_mq_debugfs_register_hctxs(q);
 }
 
-/*
- * New online cpumask which is going to be set in this hotplug event.
- * Declare this cpumasks as global as cpu-hotplug operation is invoked
- * one-by-one and dynamically allocating this could result in a failure.
- */
-static struct cpumask cpuhp_online_new;
-
-static void blk_mq_queue_reinit_work(void)
-{
-	struct request_queue *q;
-
-	mutex_lock(&all_q_mutex);
-	/*
-	 * We need to freeze and reinit all existing queues.  Freezing
-	 * involves synchronous wait for an RCU grace period and doing it
-	 * one by one may take a long time.  Start freezing all queues in
-	 * one swoop and then wait for the completions so that freezing can
-	 * take place in parallel.
-	 */
-	list_for_each_entry(q, &all_q_list, all_q_node)
-		blk_freeze_queue_start(q);
-	list_for_each_entry(q, &all_q_list, all_q_node)
-		blk_mq_freeze_queue_wait(q);
-
-	list_for_each_entry(q, &all_q_list, all_q_node)
-		blk_mq_queue_reinit(q, &cpuhp_online_new);
-
-	list_for_each_entry(q, &all_q_list, all_q_node)
-		blk_mq_unfreeze_queue(q);
-
-	mutex_unlock(&all_q_mutex);
-}
-
-static int blk_mq_queue_reinit_dead(unsigned int cpu)
-{
-	cpumask_copy(&cpuhp_online_new, cpu_online_mask);
-	blk_mq_queue_reinit_work();
-	return 0;
-}
-
-/*
- * Before hotadded cpu starts handling requests, new mappings must be
- * established.  Otherwise, these requests in hw queue might never be
- * dispatched.
- *
- * For example, there is a single hw queue (hctx) and two CPU queues (ctx0
- * for CPU0, and ctx1 for CPU1).
- *
- * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
- * and set bit0 in pending bitmap as ctx1->index_hw is still zero.
- *
- * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set
- * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
- * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is
- * ignored.
- */
-static int blk_mq_queue_reinit_prepare(unsigned int cpu)
-{
-	cpumask_copy(&cpuhp_online_new, cpu_online_mask);
-	cpumask_set_cpu(cpu, &cpuhp_online_new);
-	blk_mq_queue_reinit_work();
-	return 0;
-}
-
 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
 	int i;
@@ -2660,7 +2599,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	return ret;
 }
 
-void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
+static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
+							int nr_hw_queues)
 {
 	struct request_queue *q;
 
@@ -2678,12 +2618,19 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 	blk_mq_update_queue_map(set);
 	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_realloc_hw_ctxs(set, q);
-		blk_mq_queue_reinit(q, cpu_online_mask);
+		blk_mq_queue_reinit(q);
 	}
 
 	list_for_each_entry(q, &set->tag_list, tag_set_list)
 		blk_mq_unfreeze_queue(q);
 }
+
+void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
+{
+	mutex_lock(&set->tag_list_lock);
+	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
+	mutex_unlock(&set->tag_list_lock);
+}
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
 /* Enable polling stats and return whether they were already enabled. */
@@ -2887,24 +2834,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 }
 EXPORT_SYMBOL_GPL(blk_mq_poll);
 
-void blk_mq_disable_hotplug(void)
-{
-	mutex_lock(&all_q_mutex);
-}
-
-void blk_mq_enable_hotplug(void)
-{
-	mutex_unlock(&all_q_mutex);
-}
-
 static int __init blk_mq_init(void)
 {
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 				blk_mq_hctx_notify_dead);
-
-	cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
-				  blk_mq_queue_reinit_prepare,
-				  blk_mq_queue_reinit_dead);
 	return 0;
 }
 subsys_initcall(blk_mq_init);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index cc67b48e3551..60b01c0309bc 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -56,11 +56,6 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 				bool at_head);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 				struct list_head *list);
-/*
- * CPU hotplug helpers
- */
-void blk_mq_enable_hotplug(void);
-void blk_mq_disable_hotplug(void);
 
 /*
  * CPU -> queue mappings
@@ -128,17 +123,6 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
 	return data->hctx->tags;
 }
 
-/*
- * Internal helpers for request allocation/init/free
- */
-void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-			struct request *rq, unsigned int op);
-void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-				struct request *rq);
-void blk_mq_finish_request(struct request *rq);
-struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
-					unsigned int op);
-
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
 	return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 4fa81ed383ca..be1f115b538b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -172,11 +172,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->nr_batching = BLK_BATCH_REQ;
 
 	blk_set_default_limits(&q->limits);
-
-	/*
-	 * by default assume old behaviour and bounce for any highmem page
-	 */
-	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 }
 EXPORT_SYMBOL(blk_queue_make_request);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 504fee940052..27aceab1cc31 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -777,24 +777,25 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 }
 
 /**
- * blk_release_queue: - release a &struct request_queue when it is no longer needed
- * @kobj:    the kobj belonging to the request queue to be released
+ * __blk_release_queue - release a request queue when it is no longer needed
+ * @work: pointer to the release_work member of the request queue to be released
  *
  * Description:
- *     blk_release_queue is the pair to blk_init_queue() or
- *     blk_queue_make_request().  It should be called when a request queue is
- *     being released; typically when a block device is being de-registered.
- *     Currently, its primary task it to free all the &struct request
- *     structures that were allocated to the queue and the queue itself.
+ *     blk_release_queue is the counterpart of blk_init_queue(). It should be
+ *     called when a request queue is being released; typically when a block
+ *     device is being de-registered. Its primary task it to free the queue
+ *     itself.
  *
- * Note:
+ * Notes:
  *     The low level driver must have finished any outstanding requests first
  *     via blk_cleanup_queue().
- **/
-static void blk_release_queue(struct kobject *kobj)
+ *
+ *     Although blk_release_queue() may be called with preemption disabled,
+ *     __blk_release_queue() may sleep.
+ */
+static void __blk_release_queue(struct work_struct *work)
 {
-	struct request_queue *q =
-		container_of(kobj, struct request_queue, kobj);
+	struct request_queue *q = container_of(work, typeof(*q), release_work);
 
 	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
 		blk_stat_remove_callback(q, q->poll_cb);
@@ -809,7 +810,7 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_free_queue_stats(q->stats);
 
-	blk_exit_rl(&q->root_rl);
+	blk_exit_rl(q, &q->root_rl);
 
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
@@ -834,6 +835,15 @@ static void blk_release_queue(struct kobject *kobj)
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
 
+static void blk_release_queue(struct kobject *kobj)
+{
+	struct request_queue *q =
+		container_of(kobj, struct request_queue, kobj);
+
+	INIT_WORK(&q->release_work, __blk_release_queue);
+	schedule_work(&q->release_work);
+}
+
 static const struct sysfs_ops queue_sysfs_ops = {
 	.show	= queue_attr_show,
 	.store	= queue_attr_store,
@@ -887,10 +897,10 @@ int blk_register_queue(struct gendisk *disk)
 		goto unlock;
 	}
 
-	if (q->mq_ops)
+	if (q->mq_ops) {
 		__blk_mq_register_dev(dev, q);
-
-	blk_mq_debugfs_register(q);
+		blk_mq_debugfs_register(q);
+	}
 
 	kobject_uevent(&q->kobj, KOBJ_ADD);
 
diff --git a/block/blk-tag.c b/block/blk-tag.c
index 07cc329fa4b0..2290f65b9d73 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -258,15 +258,14 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
  *    all transfers have been done for a request. It's important to call
  *    this function before end_that_request_last(), as that will put the
  *    request back on the free list thus corrupting the internal tag list.
- *
- *  Notes:
- *   queue lock must be held.
  **/
 void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
 	unsigned tag = rq->tag; /* negative tags invalid */
 
+	lockdep_assert_held(q->queue_lock);
+
 	BUG_ON(tag >= bqt->real_max_depth);
 
 	list_del_init(&rq->queuelist);
@@ -307,9 +306,6 @@ EXPORT_SYMBOL(blk_queue_end_tag);
  *    calling this function.  The request will also be removed from
  *    the request queue, so it's the drivers responsibility to readd
  *    it if it should need to be restarted for some reason.
- *
- *  Notes:
- *   queue lock must be held.
  **/
 int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 {
@@ -317,6 +313,8 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	unsigned max_depth;
 	int tag;
 
+	lockdep_assert_held(q->queue_lock);
+
 	if (unlikely((rq->rq_flags & RQF_QUEUED))) {
 		printk(KERN_ERR
 		       "%s: request %p for device [%s] already tagged %d",
@@ -389,14 +387,13 @@ EXPORT_SYMBOL(blk_queue_start_tag);
  *   Hardware conditions may dictate a need to stop all pending requests.
  *   In this case, we will safely clear the block side of the tag queue and
  *   readd all requests to the request queue in the right order.
- *
- *  Notes:
- *   queue lock must be held.
  **/
 void blk_queue_invalidate_tags(struct request_queue *q)
 {
 	struct list_head *tmp, *n;
 
+	lockdep_assert_held(q->queue_lock);
+
 	list_for_each_safe(tmp, n, &q->tag_busy_list)
 		blk_requeue_request(q, list_entry_rq(tmp));
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index b78db2e5fdff..a7285bf2831c 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -22,11 +22,18 @@ static int throtl_quantum = 32;
 #define DFL_THROTL_SLICE_HD (HZ / 10)
 #define DFL_THROTL_SLICE_SSD (HZ / 50)
 #define MAX_THROTL_SLICE (HZ)
-#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
-#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
 #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
-/* default latency target is 0, eg, guarantee IO latency by default */
-#define DFL_LATENCY_TARGET (0)
+#define MIN_THROTL_BPS (320 * 1024)
+#define MIN_THROTL_IOPS (10)
+#define DFL_LATENCY_TARGET (-1L)
+#define DFL_IDLE_THRESHOLD (0)
+#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
+#define LATENCY_FILTERED_SSD (0)
+/*
+ * For HD, very small latency comes from sequential IO. Such IO is helpless to
+ * help determine if its IO is impacted by others, hence we ignore the IO
+ */
+#define LATENCY_FILTERED_HD (1000L) /* 1ms */
 
 #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
 
@@ -157,6 +164,7 @@ struct throtl_grp {
 	unsigned long last_check_time;
 
 	unsigned long latency_target; /* us */
+	unsigned long latency_target_conf; /* us */
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -165,6 +173,7 @@ struct throtl_grp {
 	unsigned long checked_last_finish_time; /* ns / 1024 */
 	unsigned long avg_idletime; /* ns / 1024 */
 	unsigned long idletime_threshold; /* us */
+	unsigned long idletime_threshold_conf; /* us */
 
 	unsigned int bio_cnt; /* total bios */
 	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
@@ -201,8 +210,6 @@ struct throtl_data
 	unsigned int limit_index;
 	bool limit_valid[LIMIT_CNT];
 
-	unsigned long dft_idletime_threshold; /* us */
-
 	unsigned long low_upgrade_time;
 	unsigned long low_downgrade_time;
 
@@ -212,6 +219,7 @@ struct throtl_data
 	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
 	struct latency_bucket __percpu *latency_buckets;
 	unsigned long last_calculate_time;
+	unsigned long filtered_latency;
 
 	bool track_bio_latency;
 };
@@ -294,8 +302,14 @@ static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
 
 	td = tg->td;
 	ret = tg->bps[rw][td->limit_index];
-	if (ret == 0 && td->limit_index == LIMIT_LOW)
-		return tg->bps[rw][LIMIT_MAX];
+	if (ret == 0 && td->limit_index == LIMIT_LOW) {
+		/* intermediate node or iops isn't 0 */
+		if (!list_empty(&blkg->blkcg->css.children) ||
+		    tg->iops[rw][td->limit_index])
+			return U64_MAX;
+		else
+			return MIN_THROTL_BPS;
+	}
 
 	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
 	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
@@ -315,10 +329,17 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
 		return UINT_MAX;
+
 	td = tg->td;
 	ret = tg->iops[rw][td->limit_index];
-	if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
-		return tg->iops[rw][LIMIT_MAX];
+	if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
+		/* intermediate node or bps isn't 0 */
+		if (!list_empty(&blkg->blkcg->css.children) ||
+		    tg->bps[rw][td->limit_index])
+			return UINT_MAX;
+		else
+			return MIN_THROTL_IOPS;
+	}
 
 	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
 	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
@@ -482,6 +503,9 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 	/* LIMIT_LOW will have default value 0 */
 
 	tg->latency_target = DFL_LATENCY_TARGET;
+	tg->latency_target_conf = DFL_LATENCY_TARGET;
+	tg->idletime_threshold = DFL_IDLE_THRESHOLD;
+	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
 
 	return &tg->pd;
 }
@@ -510,8 +534,6 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
-
-	tg->idletime_threshold = td->dft_idletime_threshold;
 }
 
 /*
@@ -684,7 +706,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
 					  unsigned long expires)
 {
-	unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
+	unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
 
 	/*
 	 * Since we are adjusting the throttle limit dynamically, the sleep
@@ -1349,7 +1371,7 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
 	return 0;
 }
 
-static void tg_conf_updated(struct throtl_grp *tg)
+static void tg_conf_updated(struct throtl_grp *tg, bool global)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct cgroup_subsys_state *pos_css;
@@ -1367,8 +1389,26 @@ static void tg_conf_updated(struct throtl_grp *tg)
 	 * restrictions in the whole hierarchy and allows them to bypass
 	 * blk-throttle.
 	 */
-	blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
-		tg_update_has_rules(blkg_to_tg(blkg));
+	blkg_for_each_descendant_pre(blkg, pos_css,
+			global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
+		struct throtl_grp *this_tg = blkg_to_tg(blkg);
+		struct throtl_grp *parent_tg;
+
+		tg_update_has_rules(this_tg);
+		/* ignore root/second level */
+		if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
+		    !blkg->parent->parent)
+			continue;
+		parent_tg = blkg_to_tg(blkg->parent);
+		/*
+		 * make sure all children has lower idle time threshold and
+		 * higher latency target
+		 */
+		this_tg->idletime_threshold = min(this_tg->idletime_threshold,
+				parent_tg->idletime_threshold);
+		this_tg->latency_target = max(this_tg->latency_target,
+				parent_tg->latency_target);
+	}
 
 	/*
 	 * We're already holding queue_lock and know @tg is valid.  Let's
@@ -1413,7 +1453,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	else
 		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
 
-	tg_conf_updated(tg);
+	tg_conf_updated(tg, false);
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
@@ -1497,34 +1537,34 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
 	    tg->iops_conf[READ][off] == iops_dft &&
 	    tg->iops_conf[WRITE][off] == iops_dft &&
 	    (off != LIMIT_LOW ||
-	     (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
-	      tg->latency_target == DFL_LATENCY_TARGET)))
+	     (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
+	      tg->latency_target_conf == DFL_LATENCY_TARGET)))
 		return 0;
 
-	if (tg->bps_conf[READ][off] != bps_dft)
+	if (tg->bps_conf[READ][off] != U64_MAX)
 		snprintf(bufs[0], sizeof(bufs[0]), "%llu",
 			tg->bps_conf[READ][off]);
-	if (tg->bps_conf[WRITE][off] != bps_dft)
+	if (tg->bps_conf[WRITE][off] != U64_MAX)
 		snprintf(bufs[1], sizeof(bufs[1]), "%llu",
 			tg->bps_conf[WRITE][off]);
-	if (tg->iops_conf[READ][off] != iops_dft)
+	if (tg->iops_conf[READ][off] != UINT_MAX)
 		snprintf(bufs[2], sizeof(bufs[2]), "%u",
 			tg->iops_conf[READ][off]);
-	if (tg->iops_conf[WRITE][off] != iops_dft)
+	if (tg->iops_conf[WRITE][off] != UINT_MAX)
 		snprintf(bufs[3], sizeof(bufs[3]), "%u",
 			tg->iops_conf[WRITE][off]);
 	if (off == LIMIT_LOW) {
-		if (tg->idletime_threshold == ULONG_MAX)
+		if (tg->idletime_threshold_conf == ULONG_MAX)
 			strcpy(idle_time, " idle=max");
 		else
 			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
-				tg->idletime_threshold);
+				tg->idletime_threshold_conf);
 
-		if (tg->latency_target == ULONG_MAX)
+		if (tg->latency_target_conf == ULONG_MAX)
 			strcpy(latency_time, " latency=max");
 		else
 			snprintf(latency_time, sizeof(latency_time),
-				" latency=%lu", tg->latency_target);
+				" latency=%lu", tg->latency_target_conf);
 	}
 
 	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
@@ -1563,8 +1603,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	v[2] = tg->iops_conf[READ][index];
 	v[3] = tg->iops_conf[WRITE][index];
 
-	idle_time = tg->idletime_threshold;
-	latency_time = tg->latency_target;
+	idle_time = tg->idletime_threshold_conf;
+	latency_time = tg->latency_target_conf;
 	while (true) {
 		char tok[27];	/* wiops=18446744073709551616 */
 		char *p;
@@ -1623,17 +1663,33 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		tg->iops_conf[READ][LIMIT_MAX]);
 	tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
 		tg->iops_conf[WRITE][LIMIT_MAX]);
+	tg->idletime_threshold_conf = idle_time;
+	tg->latency_target_conf = latency_time;
+
+	/* force user to configure all settings for low limit  */
+	if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
+	      tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
+	    tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
+	    tg->latency_target_conf == DFL_LATENCY_TARGET) {
+		tg->bps[READ][LIMIT_LOW] = 0;
+		tg->bps[WRITE][LIMIT_LOW] = 0;
+		tg->iops[READ][LIMIT_LOW] = 0;
+		tg->iops[WRITE][LIMIT_LOW] = 0;
+		tg->idletime_threshold = DFL_IDLE_THRESHOLD;
+		tg->latency_target = DFL_LATENCY_TARGET;
+	} else if (index == LIMIT_LOW) {
+		tg->idletime_threshold = tg->idletime_threshold_conf;
+		tg->latency_target = tg->latency_target_conf;
+	}
 
-	if (index == LIMIT_LOW) {
-		blk_throtl_update_limit_valid(tg->td);
-		if (tg->td->limit_valid[LIMIT_LOW])
+	blk_throtl_update_limit_valid(tg->td);
+	if (tg->td->limit_valid[LIMIT_LOW]) {
+		if (index == LIMIT_LOW)
 			tg->td->limit_index = LIMIT_LOW;
-		tg->idletime_threshold = (idle_time == ULONG_MAX) ?
-			ULONG_MAX : idle_time;
-		tg->latency_target = (latency_time == ULONG_MAX) ?
-			ULONG_MAX : latency_time;
-	}
-	tg_conf_updated(tg);
+	} else
+		tg->td->limit_index = LIMIT_MAX;
+	tg_conf_updated(tg, index == LIMIT_LOW &&
+		tg->td->limit_valid[LIMIT_LOW]);
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
@@ -1722,17 +1778,25 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
 	/*
 	 * cgroup is idle if:
 	 * - single idle is too long, longer than a fixed value (in case user
-	 *   configure a too big threshold) or 4 times of slice
+	 *   configure a too big threshold) or 4 times of idletime threshold
 	 * - average think time is more than threshold
 	 * - IO latency is largely below threshold
 	 */
-	unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
-
-	time = min_t(unsigned long, MAX_IDLE_TIME, time);
-	return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
-	       tg->avg_idletime > tg->idletime_threshold ||
-	       (tg->latency_target && tg->bio_cnt &&
+	unsigned long time;
+	bool ret;
+
+	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
+	ret = tg->latency_target == DFL_LATENCY_TARGET ||
+	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
+	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+	      tg->avg_idletime > tg->idletime_threshold ||
+	      (tg->latency_target && tg->bio_cnt &&
 		tg->bad_bio_cnt * 5 < tg->bio_cnt);
+	throtl_log(&tg->service_queue,
+		"avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
+		tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
+		tg->bio_cnt, ret, tg->td->scale);
+	return ret;
 }
 
 static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
@@ -1828,6 +1892,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	struct cgroup_subsys_state *pos_css;
 	struct blkcg_gq *blkg;
 
+	throtl_log(&td->service_queue, "upgrade to max");
 	td->limit_index = LIMIT_MAX;
 	td->low_upgrade_time = jiffies;
 	td->scale = 0;
@@ -1850,6 +1915,7 @@ static void throtl_downgrade_state(struct throtl_data *td, int new)
 {
 	td->scale /= 2;
 
+	throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
 	if (td->scale) {
 		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
 		return;
@@ -2023,6 +2089,11 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
 		td->avg_buckets[i].valid = true;
 		last_latency = td->avg_buckets[i].latency;
 	}
+
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
+		throtl_log(&td->service_queue,
+			"Latency bucket %d: latency=%ld, valid=%d", i,
+			td->avg_buckets[i].latency, td->avg_buckets[i].valid);
 }
 #else
 static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2218,7 +2289,7 @@ void blk_throtl_bio_endio(struct bio *bio)
 		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
 			bio_op(bio), lat);
 
-	if (tg->latency_target) {
+	if (tg->latency_target && lat >= tg->td->filtered_latency) {
 		int bucket;
 		unsigned int threshold;
 
@@ -2354,18 +2425,19 @@ void blk_throtl_exit(struct request_queue *q)
 void blk_throtl_register_queue(struct request_queue *q)
 {
 	struct throtl_data *td;
-	struct cgroup_subsys_state *pos_css;
-	struct blkcg_gq *blkg;
+	int i;
 
 	td = q->td;
 	BUG_ON(!td);
 
 	if (blk_queue_nonrot(q)) {
 		td->throtl_slice = DFL_THROTL_SLICE_SSD;
-		td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+		td->filtered_latency = LATENCY_FILTERED_SSD;
 	} else {
 		td->throtl_slice = DFL_THROTL_SLICE_HD;
-		td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
+		td->filtered_latency = LATENCY_FILTERED_HD;
+		for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
+			td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
 	}
 #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
 	/* if no low limit, use previous default */
@@ -2375,18 +2447,6 @@ void blk_throtl_register_queue(struct request_queue *q)
 	td->track_bio_latency = !q->mq_ops && !q->request_fn;
 	if (!td->track_bio_latency)
 		blk_stat_enable_accounting(q);
-
-	/*
-	 * some tg are created before queue is fully initialized, eg, nonrot
-	 * isn't initialized yet
-	 */
-	rcu_read_lock();
-	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
-		struct throtl_grp *tg = blkg_to_tg(blkg);
-
-		tg->idletime_threshold = td->dft_idletime_threshold;
-	}
-	rcu_read_unlock();
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index cbff183f3d9f..17ec83bb0900 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -189,13 +189,15 @@ unsigned long blk_rq_timeout(unsigned long timeout)
  * Notes:
  *    Each request has its own timer, and as it is added to the queue, we
  *    set up the timer. When the request completes, we cancel the timer.
- *    Queue lock must be held for the non-mq case, mq case doesn't care.
  */
 void blk_add_timer(struct request *req)
 {
 	struct request_queue *q = req->q;
 	unsigned long expiry;
 
+	if (!q->mq_ops)
+		lockdep_assert_held(q->queue_lock);
+
 	/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
 	if (!q->mq_ops && !q->rq_timed_out_fn)
 		return;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 17676f4d7fd1..6a9a0f03a67b 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -503,7 +503,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 }
 
 static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
-			     wait_queue_t *wait, unsigned long rw)
+			     wait_queue_entry_t *wait, unsigned long rw)
 {
 	/*
 	 * inc it here even if disabled, since we'll dec it at completion.
@@ -520,7 +520,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
 	 * in line to be woken up, wait for our turn.
 	 */
 	if (waitqueue_active(&rqw->wait) &&
-	    rqw->wait.task_list.next != &wait->task_list)
+	    rqw->wait.head.next != &wait->entry)
 		return false;
 
 	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
diff --git a/block/blk.h b/block/blk.h
index 2ed70228e44f..01ebb8185f6b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -59,7 +59,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask);
-void blk_exit_rl(struct request_list *rl);
+void blk_exit_rl(struct request_queue *q, struct request_list *rl);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
 void blk_queue_bypass_start(struct request_queue *q);
@@ -143,6 +143,8 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	struct request *rq;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
 
+	WARN_ON_ONCE(q->mq_ops);
+
 	while (1) {
 		if (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
@@ -334,4 +336,17 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
 static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
 #endif
 
+#ifdef CONFIG_BOUNCE
+extern int init_emergency_isa_pool(void);
+extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
+#else
+static inline int init_emergency_isa_pool(void)
+{
+	return 0;
+}
+static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
+{
+}
+#endif /* CONFIG_BOUNCE */
+
 #endif /* BLK_INTERNAL_H */
diff --git a/block/bounce.c b/block/bounce.c
index 1cb5dd3a5da1..5793c2dc1a15 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -22,10 +22,12 @@
 #include <asm/tlbflush.h>
 
 #include <trace/events/block.h>
+#include "blk.h"
 
 #define POOL_SIZE	64
 #define ISA_POOL_SIZE	16
 
+static struct bio_set *bounce_bio_set, *bounce_bio_split;
 static mempool_t *page_pool, *isa_page_pool;
 
 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
@@ -40,6 +42,14 @@ static __init int init_emergency_pool(void)
 	BUG_ON(!page_pool);
 	pr_info("pool size: %d pages\n", POOL_SIZE);
 
+	bounce_bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	BUG_ON(!bounce_bio_set);
+	if (bioset_integrity_create(bounce_bio_set, BIO_POOL_SIZE))
+		BUG_ON(1);
+
+	bounce_bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
+	BUG_ON(!bounce_bio_split);
+
 	return 0;
 }
 
@@ -143,7 +153,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 		mempool_free(bvec->bv_page, pool);
 	}
 
-	bio_orig->bi_error = bio->bi_error;
+	bio_orig->bi_status = bio->bi_status;
 	bio_endio(bio_orig);
 	bio_put(bio);
 }
@@ -163,7 +173,7 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
 {
 	struct bio *bio_orig = bio->bi_private;
 
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		copy_to_high_bio_irq(bio_orig, bio);
 
 	bounce_end_io(bio, pool);
@@ -186,20 +196,31 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	int rw = bio_data_dir(*bio_orig);
 	struct bio_vec *to, from;
 	struct bvec_iter iter;
-	unsigned i;
-
-	bio_for_each_segment(from, *bio_orig, iter)
-		if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
-			goto bounce;
+	unsigned i = 0;
+	bool bounce = false;
+	int sectors = 0;
+
+	bio_for_each_segment(from, *bio_orig, iter) {
+		if (i++ < BIO_MAX_PAGES)
+			sectors += from.bv_len >> 9;
+		if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn)
+			bounce = true;
+	}
+	if (!bounce)
+		return;
 
-	return;
-bounce:
-	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
+	if (sectors < bio_sectors(*bio_orig)) {
+		bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
+		bio_chain(bio, *bio_orig);
+		generic_make_request(*bio_orig);
+		*bio_orig = bio;
+	}
+	bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
 
 	bio_for_each_segment_all(to, bio, i) {
 		struct page *page = to->bv_page;
 
-		if (page_to_pfn(page) <= queue_bounce_pfn(q))
+		if (page_to_pfn(page) <= q->limits.bounce_pfn)
 			continue;
 
 		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
@@ -251,7 +272,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	 * don't waste time iterating over bio segments
 	 */
 	if (!(q->bounce_gfp & GFP_DMA)) {
-		if (queue_bounce_pfn(q) >= blk_max_pfn)
+		if (q->limits.bounce_pfn >= blk_max_pfn)
 			return;
 		pool = page_pool;
 	} else {
@@ -264,5 +285,3 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	 */
 	__blk_queue_bounce(q, bio_orig, pool);
 }
-
-EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 0a23dbba2d30..c4513b23f57a 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref)
 	struct bsg_job *job = container_of(kref, struct bsg_job, kref);
 	struct request *rq = job->req;
 
-	blk_end_request_all(rq, scsi_req(rq)->result);
+	blk_end_request_all(rq, BLK_STS_OK);
 
 	put_device(job->dev);	/* release reference for the request */
 
@@ -202,7 +202,7 @@ static void bsg_request_fn(struct request_queue *q)
 		ret = bsg_create_job(dev, req);
 		if (ret) {
 			scsi_req(req)->result = ret;
-			blk_end_request_all(req, ret);
+			blk_end_request_all(req, BLK_STS_OK);
 			spin_lock_irq(q->queue_lock);
 			continue;
 		}
@@ -246,6 +246,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, char *name,
 	q->bsg_job_size = dd_job_size;
 	q->bsg_job_fn = job_fn;
 	queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
+	queue_flag_set_unlocked(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
 	blk_queue_softirq_done(q, bsg_softirq_done);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 
diff --git a/block/bsg.c b/block/bsg.c
index 6fd08544d77e..37663b664666 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -236,7 +236,6 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
 	rq = blk_get_request(q, op, GFP_KERNEL);
 	if (IS_ERR(rq))
 		return rq;
-	scsi_req_init(rq);
 
 	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm);
 	if (ret)
@@ -294,14 +293,14 @@ out:
  * async completion call-back from the block layer, when scsi/ide/whatever
  * calls end_that_request_last() on a request
  */
-static void bsg_rq_end_io(struct request *rq, int uptodate)
+static void bsg_rq_end_io(struct request *rq, blk_status_t status)
 {
 	struct bsg_command *bc = rq->end_io_data;
 	struct bsg_device *bd = bc->bd;
 	unsigned long flags;
 
-	dprintk("%s: finished rq %p bc %p, bio %p stat %d\n",
-		bd->name, rq, bc, bc->bio, uptodate);
+	dprintk("%s: finished rq %p bc %p, bio %p\n",
+		bd->name, rq, bc, bc->bio);
 
 	bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration);
 
@@ -750,6 +749,12 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
 #ifdef BSG_DEBUG
 	unsigned char buf[32];
 #endif
+
+	if (!blk_queue_scsi_passthrough(rq)) {
+		WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
+		return ERR_PTR(-EINVAL);
+	}
+
 	if (!blk_get_queue(rq))
 		return ERR_PTR(-ENXIO);
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index da69b079725f..3d5c28945719 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -38,9 +38,13 @@ static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
 static const int cfq_hist_divisor = 4;
 
 /*
- * offset from end of service tree
+ * offset from end of queue service tree for idle class
  */
 #define CFQ_IDLE_DELAY		(NSEC_PER_SEC / 5)
+/* offset from end of group service tree under time slice mode */
+#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
+/* offset from end of group service under IOPS mode */
+#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
 
 /*
  * below this threshold, we consider thinktime immediate
@@ -978,15 +982,6 @@ static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
 	return min_vdisktime;
 }
 
-static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
-{
-	s64 delta = (s64)(vdisktime - min_vdisktime);
-	if (delta < 0)
-		min_vdisktime = vdisktime;
-
-	return min_vdisktime;
-}
-
 static void update_min_vdisktime(struct cfq_rb_root *st)
 {
 	struct cfq_group *cfqg;
@@ -1362,6 +1357,14 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 	cfqg->vfraction = max_t(unsigned, vfr, 1);
 }
 
+static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
+{
+	if (!iops_mode(cfqd))
+		return CFQ_SLICE_MODE_GROUP_DELAY;
+	else
+		return CFQ_IOPS_MODE_GROUP_DELAY;
+}
+
 static void
 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
@@ -1381,7 +1384,8 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 	n = rb_last(&st->rb);
 	if (n) {
 		__cfqg = rb_entry_cfqg(n);
-		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+		cfqg->vdisktime = __cfqg->vdisktime +
+			cfq_get_cfqg_vdisktime_delay(cfqd);
 	} else
 		cfqg->vdisktime = st->min_vdisktime;
 	cfq_group_service_tree_add(st, cfqg);
diff --git a/block/elevator.c b/block/elevator.c
index dac99fbfc273..4bb2f0c93fa6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -681,6 +681,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 		 */
 		if (elv_attempt_insert_merge(q, rq))
 			break;
+		/* fall through */
 	case ELEVATOR_INSERT_SORT:
 		BUG_ON(blk_rq_is_passthrough(rq));
 		rq->rq_flags |= RQF_SORTED;
diff --git a/block/genhd.c b/block/genhd.c
index d252d29fe837..7f520fa25d16 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -36,7 +36,7 @@ struct kobject *block_depr;
 static DEFINE_SPINLOCK(ext_devt_lock);
 static DEFINE_IDR(ext_devt_idr);
 
-static struct device_type disk_type;
+static const struct device_type disk_type;
 
 static void disk_check_events(struct disk_events *ev,
 			      unsigned int *clearing_ptr);
@@ -1183,7 +1183,7 @@ static char *block_devnode(struct device *dev, umode_t *mode,
 	return NULL;
 }
 
-static struct device_type disk_type = {
+static const struct device_type disk_type = {
 	.name		= "disk",
 	.groups		= disk_attr_groups,
 	.release	= disk_release,
diff --git a/block/ioprio.c b/block/ioprio.c
index 4b120c9cf7e8..6f5d0b6625e3 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -75,7 +75,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 		case IOPRIO_CLASS_RT:
 			if (!capable(CAP_SYS_ADMIN))
 				return -EPERM;
-			/* fall through, rt has prio field too */
+			/* fall through */
+			/* rt has prio field too */
 		case IOPRIO_CLASS_BE:
 			if (data >= IOPRIO_BE_NR || data < 0)
 				return -EINVAL;
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b9faabc75fdb..f58cab82105b 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -99,7 +99,7 @@ struct kyber_hctx_data {
 	struct list_head rqs[KYBER_NUM_DOMAINS];
 	unsigned int cur_domain;
 	unsigned int batching;
-	wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
+	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
 	atomic_t wait_index[KYBER_NUM_DOMAINS];
 };
 
@@ -385,7 +385,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		INIT_LIST_HEAD(&khd->rqs[i]);
-		INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
+		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 		atomic_set(&khd->wait_index[i], 0);
 	}
 
@@ -426,33 +426,29 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 	}
 }
 
-static struct request *kyber_get_request(struct request_queue *q,
-					 unsigned int op,
-					 struct blk_mq_alloc_data *data)
+static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 {
-	struct kyber_queue_data *kqd = q->elevator->elevator_data;
-	struct request *rq;
-
 	/*
 	 * We use the scheduler tags as per-hardware queue queueing tokens.
 	 * Async requests can be limited at this stage.
 	 */
-	if (!op_is_sync(op))
+	if (!op_is_sync(op)) {
+		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
+
 		data->shallow_depth = kqd->async_depth;
+	}
+}
 
-	rq = __blk_mq_alloc_request(data, op);
-	if (rq)
-		rq_set_domain_token(rq, -1);
-	return rq;
+static void kyber_prepare_request(struct request *rq, struct bio *bio)
+{
+	rq_set_domain_token(rq, -1);
 }
 
-static void kyber_put_request(struct request *rq)
+static void kyber_finish_request(struct request *rq)
 {
-	struct request_queue *q = rq->q;
-	struct kyber_queue_data *kqd = q->elevator->elevator_data;
+	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
 
 	rq_clear_domain_token(kqd, rq);
-	blk_mq_finish_request(rq);
 }
 
 static void kyber_completed_request(struct request *rq)
@@ -507,12 +503,12 @@ static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
 	}
 }
 
-static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 			     void *key)
 {
 	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
 
-	list_del_init(&wait->task_list);
+	list_del_init(&wait->entry);
 	blk_mq_run_hw_queue(hctx, true);
 	return 1;
 }
@@ -523,7 +519,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 {
 	unsigned int sched_domain = khd->cur_domain;
 	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
-	wait_queue_t *wait = &khd->domain_wait[sched_domain];
+	wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
 	struct sbq_wait_state *ws;
 	int nr;
 
@@ -536,7 +532,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 	 * run when one becomes available. Note that this is serialized on
 	 * khd->lock, but we still need to be careful about the waker.
 	 */
-	if (list_empty_careful(&wait->task_list)) {
+	if (list_empty_careful(&wait->entry)) {
 		init_waitqueue_func_entry(wait, kyber_domain_wake);
 		wait->private = hctx;
 		ws = sbq_wait_ptr(domain_tokens,
@@ -734,9 +730,9 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
 {									\
 	struct blk_mq_hw_ctx *hctx = data;				\
 	struct kyber_hctx_data *khd = hctx->sched_data;			\
-	wait_queue_t *wait = &khd->domain_wait[domain];			\
+	wait_queue_entry_t *wait = &khd->domain_wait[domain];		\
 									\
-	seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list));	\
+	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));	\
 	return 0;							\
 }
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
@@ -815,8 +811,9 @@ static struct elevator_type kyber_sched = {
 		.exit_sched = kyber_exit_sched,
 		.init_hctx = kyber_init_hctx,
 		.exit_hctx = kyber_exit_hctx,
-		.get_request = kyber_get_request,
-		.put_request = kyber_put_request,
+		.limit_depth = kyber_limit_depth,
+		.prepare_request = kyber_prepare_request,
+		.finish_request = kyber_finish_request,
 		.completed_request = kyber_completed_request,
 		.dispatch_request = kyber_dispatch_request,
 		.has_work = kyber_has_work,
diff --git a/block/partition-generic.c b/block/partition-generic.c
index ff07b9143ca4..c5ec8246e25e 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -320,8 +320,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	if (info) {
 		struct partition_meta_info *pinfo = alloc_part_info(disk);
-		if (!pinfo)
+		if (!pinfo) {
+			err = -ENOMEM;
 			goto out_free_stats;
+		}
 		memcpy(pinfo, info, sizeof(*info));
 		p->info = pinfo;
 	}
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index edcea70674c9..2a365c756648 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -115,7 +115,7 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
 		ldm_error("PRIVHEAD disk size doesn't match real disk size");
 		return false;
 	}
-	if (uuid_be_to_bin(data + 0x0030, (uuid_be *)ph->disk_id)) {
+	if (uuid_parse(data + 0x0030, &ph->disk_id)) {
 		ldm_error("PRIVHEAD contains an invalid GUID.");
 		return false;
 	}
@@ -234,7 +234,7 @@ static bool ldm_compare_privheads (const struct privhead *ph1,
 		(ph1->logical_disk_size  == ph2->logical_disk_size)	&&
 		(ph1->config_start       == ph2->config_start)		&&
 		(ph1->config_size        == ph2->config_size)		&&
-		!memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE));
+		uuid_equal(&ph1->disk_id, &ph2->disk_id));
 }
 
 /**
@@ -557,7 +557,7 @@ static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb)
 
 	list_for_each (item, &ldb->v_disk) {
 		struct vblk *v = list_entry (item, struct vblk, list);
-		if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE))
+		if (uuid_equal(&v->vblk.disk.disk_id, &ldb->ph.disk_id))
 			return v;
 	}
 
@@ -892,7 +892,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
 	disk = &vb->vblk.disk;
 	ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name,
 		sizeof (disk->alt_name));
-	if (uuid_be_to_bin(buffer + 0x19 + r_name, (uuid_be *)disk->disk_id))
+	if (uuid_parse(buffer + 0x19 + r_name, &disk->disk_id))
 		return false;
 
 	return true;
@@ -927,7 +927,7 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
 		return false;
 
 	disk = &vb->vblk.disk;
-	memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE);
+	uuid_copy(&disk->disk_id, (uuid_t *)(buffer + 0x18 + r_name));
 	return true;
 }
 
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index 374242c0971a..f4c6055df956 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -112,8 +112,6 @@ struct frag {				/* VBLK Fragment handling */
 
 /* In memory LDM database structures. */
 
-#define GUID_SIZE		16
-
 struct privhead {			/* Offsets and sizes are in sectors. */
 	u16	ver_major;
 	u16	ver_minor;
@@ -121,7 +119,7 @@ struct privhead {			/* Offsets and sizes are in sectors. */
 	u64	logical_disk_size;
 	u64	config_start;
 	u64	config_size;
-	u8	disk_id[GUID_SIZE];
+	uuid_t	disk_id;
 };
 
 struct tocblock {			/* We have exactly two bitmaps. */
@@ -154,7 +152,7 @@ struct vblk_dgrp {			/* VBLK Disk Group */
 };
 
 struct vblk_disk {			/* VBLK Disk */
-	u8	disk_id[GUID_SIZE];
+	uuid_t	disk_id;
 	u8	alt_name[128];
 };
 
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 93e7c1b32edd..5610cd537da7 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -300,6 +300,8 @@ static void parse_bsd(struct parsed_partitions *state,
 			continue;
 		bsd_start = le32_to_cpu(p->p_offset);
 		bsd_size = le32_to_cpu(p->p_size);
+		if (memcmp(flavour, "bsd\0", 4) == 0)
+			bsd_start += offset;
 		if (offset == bsd_start && size == bsd_size)
 			/* full parent partition, we have it already */
 			continue;
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 4a294a5f7fab..7440de44dd85 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -326,7 +326,6 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
 	req = scsi_req(rq);
-	scsi_req_init(rq);
 
 	if (hdr->cmd_len > BLK_MAX_CDB) {
 		req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
@@ -456,7 +455,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		goto error_free_buffer;
 	}
 	req = scsi_req(rq);
-	scsi_req_init(rq);
 
 	cmdlen = COMMAND_SIZE(opcode);
 
@@ -542,7 +540,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	rq = blk_get_request(q, REQ_OP_SCSI_OUT, __GFP_RECLAIM);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
-	scsi_req_init(rq);
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
 	scsi_req(rq)->cmd[0] = cmd;
 	scsi_req(rq)->cmd[4] = data;
@@ -744,10 +741,14 @@ int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
 }
 EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
 
-void scsi_req_init(struct request *rq)
+/**
+ * scsi_req_init - initialize certain fields of a scsi_request structure
+ * @req: Pointer to a scsi_request structure.
+ * Initializes .__cmd[], .cmd, .cmd_len and .sense_len but no other members
+ * of struct scsi_request.
+ */
+void scsi_req_init(struct scsi_request *req)
 {
-	struct scsi_request *req = scsi_req(rq);
-
 	memset(req->__cmd, 0, sizeof(req->__cmd));
 	req->cmd = req->__cmd;
 	req->cmd_len = BLK_MAX_CDB;
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 680c6d636298..3416dadf7b15 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -46,8 +46,8 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len)
  * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref
  * tag.
  */
-static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn,
-			   unsigned int type)
+static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
+		csum_fn *fn, unsigned int type)
 {
 	unsigned int i;
 
@@ -67,11 +67,11 @@ static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn,
 		iter->seed++;
 	}
 
-	return 0;
+	return BLK_STS_OK;
 }
 
-static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn,
-				unsigned int type)
+static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
+		csum_fn *fn, unsigned int type)
 {
 	unsigned int i;
 
@@ -91,7 +91,7 @@ static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn,
 				       "(rcvd %u)\n", iter->disk_name,
 				       (unsigned long long)
 				       iter->seed, be32_to_cpu(pi->ref_tag));
-				return -EILSEQ;
+				return BLK_STS_PROTECTION;
 			}
 			break;
 		case 3:
@@ -108,7 +108,7 @@ static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn,
 			       "(rcvd %04x, want %04x)\n", iter->disk_name,
 			       (unsigned long long)iter->seed,
 			       be16_to_cpu(pi->guard_tag), be16_to_cpu(csum));
-			return -EILSEQ;
+			return BLK_STS_PROTECTION;
 		}
 
 next:
@@ -117,45 +117,45 @@ next:
 		iter->seed++;
 	}
 
-	return 0;
+	return BLK_STS_OK;
 }
 
-static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
 {
 	return t10_pi_generate(iter, t10_pi_crc_fn, 1);
 }
 
-static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
 {
 	return t10_pi_generate(iter, t10_pi_ip_fn, 1);
 }
 
-static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
 {
 	return t10_pi_verify(iter, t10_pi_crc_fn, 1);
 }
 
-static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
 {
 	return t10_pi_verify(iter, t10_pi_ip_fn, 1);
 }
 
-static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
 {
 	return t10_pi_generate(iter, t10_pi_crc_fn, 3);
 }
 
-static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
 {
 	return t10_pi_generate(iter, t10_pi_ip_fn, 3);
 }
 
-static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
 {
 	return t10_pi_verify(iter, t10_pi_crc_fn, 3);
 }
 
-static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
 {
 	return t10_pi_verify(iter, t10_pi_ip_fn, 3);
 }