From 74c450521dd8d245b982da62592a18aa6f88b045 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 29 Oct 2014 11:14:52 -0600
Subject: blk-mq: add a 'list' parameter to ->queue_rq()

Since we have the notion of a 'last' request in a chain, we can use this
to have the hardware optimize the issuing of requests. Add a list_head
parameter to queue_rq that the driver can use to temporarily store hw
commands for issue when 'last' is true. If we are doing a chain of
requests, pass in a NULL list for the first request to force issue of
that immediately, then batch the remainder for deferred issue until the
last request has been sent.

Instead of adding yet another argument to the hot ->queue_rq path,
encapsulate the passed arguments in a blk_mq_queue_data structure.
This is passed as a constant, and has been tested as faster than
passing 4 (or even 3) args through ->queue_rq. Update drivers for
the new ->queue_rq() prototype. There are no functional changes
in this patch for drivers - if they don't use the passed in list,
then they will just queue requests individually like before.

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 68929bad9a6a..7e5303820452 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -680,6 +680,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct request *rq;
 	LIST_HEAD(rq_list);
+	LIST_HEAD(driver_list);
+	struct list_head *dptr;
 	int queued;
 
 	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
@@ -705,17 +707,28 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		spin_unlock(&hctx->lock);
 	}
 
+	/*
+	 * Start off with dptr being NULL, so we start the first request
+	 * immediately, even if we have more pending.
+	 */
+	dptr = NULL;
+
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
 	queued = 0;
 	while (!list_empty(&rq_list)) {
+		struct blk_mq_queue_data bd;
 		int ret;
 
 		rq = list_first_entry(&rq_list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
 
-		ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
+		bd.rq = rq;
+		bd.list = dptr;
+		bd.last = list_empty(&rq_list);
+
+		ret = q->mq_ops->queue_rq(hctx, &bd);
 		switch (ret) {
 		case BLK_MQ_RQ_QUEUE_OK:
 			queued++;
@@ -734,6 +747,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
 			break;
+
+		/*
+		 * We've done the first request. If we have more than 1
+		 * left in the list, set dptr to defer issue.
+		 */
+		if (!dptr && rq_list.next != rq_list.prev)
+			dptr = &driver_list;
 	}
 
 	if (!queued)
@@ -1153,6 +1173,11 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	if (is_sync) {
+		struct blk_mq_queue_data bd = {
+			.rq = rq,
+			.list = NULL,
+			.last = 1
+		};
 		int ret;
 
 		blk_mq_bio_to_request(rq, bio);
@@ -1162,7 +1187,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		 * error (busy), just add it to our list as we previously
 		 * would have done
 		 */
-		ret = q->mq_ops->queue_rq(data.hctx, rq, true);
+		ret = q->mq_ops->queue_rq(data.hctx, &bd);
 		if (ret == BLK_MQ_RQ_QUEUE_OK)
 			goto done;
 		else {
--
cgit v1.2.3
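For illustration only (this sketch is not part of the patch, and the mydev_* names are hypothetical placeholders rather than real kernel APIs): a driver-side ->queue_rq() under the new prototype might look roughly like this, with bd->last deciding when to kick the hardware.

/*
 * Hypothetical sketch of a driver ->queue_rq() using the new
 * blk_mq_queue_data argument.  mydev_submit_cmd() and
 * mydev_ring_doorbell() are made-up helpers, not real kernel APIs.
 */
#include <linux/blk-mq.h>

int mydev_submit_cmd(void *hw, struct request *rq);	/* placeholder */
void mydev_ring_doorbell(void *hw);			/* placeholder */

static int mydev_queue_rq(struct blk_mq_hw_ctx *hctx,
			  const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);

	/* Build and post the hardware command for this request. */
	if (mydev_submit_cmd(hctx->driver_data, rq))
		return BLK_MQ_RQ_QUEUE_BUSY;

	/*
	 * Only notify the hardware once the chain is complete; bd->last
	 * is false for all but the final request of a batch.
	 */
	if (bd->last)
		mydev_ring_doorbell(hctx->driver_data);

	return BLK_MQ_RQ_QUEUE_OK;
}

A driver that ignores bd->list and bd->last behaves exactly as before, which is why the conversion is mechanical for drivers that do not batch.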
From e167dfb53cb85fde7b15f644e9dbef7ba31896b6 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 29 Oct 2014 11:18:26 -0600
Subject: blk-mq: add BLK_MQ_F_DEFER_ISSUE support flag

Drivers can now tell blk-mq if they take advantage of the deferred
issue through 'last' or not. If they do, don't do queue-direct for
sync IO. This is a preparation patch for the nvme conversion.

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7e5303820452..b355b5957cd7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1172,7 +1172,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
-	if (is_sync) {
+	/*
+	 * If the driver supports defer issued based on 'last', then
+	 * queue it up like normal since we can potentially save some
+	 * CPU this way.
+	 */
+	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
 		struct blk_mq_queue_data bd = {
 			.rq = rq,
 			.list = NULL,
--
cgit v1.2.3

From 398205b8391b208f0034a392242867b28ad8af3d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 7 Nov 2014 23:03:59 +0100
Subject: blk_mq: call preempt_disable/enable in blk_mq_run_hw_queue, and only if needed

preempt_disable/enable surrounds every call to blk_mq_run_hw_queue,
except the one in blk-flush.c. In fact that one is always asynchronous,
and it does not need smp_processor_id().

We can do the same for all other calls, avoiding preempt_disable when
async is true. This avoids peppering blk-mq.c with preemption-disabled
regions.

Cc: Jens Axboe
Cc: Thomas Gleixner
Reported-by: Clark Williams
Tested-by: Clark Williams
Signed-off-by: Paolo Bonzini
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b355b5957cd7..8b309e81ed0f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -801,9 +801,18 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
 		return;
 
-	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-		__blk_mq_run_hw_queue(hctx);
-	else if (hctx->queue->nr_hw_queues == 1)
+	if (!async) {
+		preempt_disable();
+		if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+			__blk_mq_run_hw_queue(hctx);
+			preempt_enable();
+			return;
+		}
+
+		preempt_enable();
+	}
+
+	if (hctx->queue->nr_hw_queues == 1)
 		kblockd_schedule_delayed_work(&hctx->run_work, 0);
 	else {
 		unsigned int cpu;
@@ -824,9 +833,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
 		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 			continue;
 
-		preempt_disable();
 		blk_mq_run_hw_queue(hctx, async);
-		preempt_enable();
 	}
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -853,9 +860,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-	preempt_disable();
 	blk_mq_run_hw_queue(hctx, false);
-	preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -880,9 +885,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 			continue;
 
 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-		preempt_disable();
 		blk_mq_run_hw_queue(hctx, async);
-		preempt_enable();
 	}
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
--
cgit v1.2.3

From 2a90d4aae5509e9cf1ba848c5d0b3458201160a0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 7 Nov 2014 23:04:00 +0100
Subject: blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable

blk-mq is using preempt_disable/enable in order to ensure that the
queue runners are placed on the right CPU. This does not work with
the RT patches, because __blk_mq_run_hw_queue takes a non-raw
spinlock within the preemption-disabled region. If there is contention
on the lock, this violates the rules for preemption-disabled regions.

While this should be easily fixable within the RT patches just by doing
migrate_disable/enable, we can do better and document _why_ this
particular region runs with disabled preemption. After the previous
patch, it is trivial to switch it to get/put_cpu; the RT patches can
then change it to get_cpu_light, which lets virtio-blk run under RT
kernels.

Cc: Jens Axboe
Cc: Thomas Gleixner
Reported-by: Clark Williams
Tested-by: Clark Williams
Signed-off-by: Paolo Bonzini
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8b309e81ed0f..06ab0683a1f1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -802,14 +802,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 		return;
 
 	if (!async) {
-		preempt_disable();
-		if (cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) {
+		int cpu = get_cpu();
+		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
 			__blk_mq_run_hw_queue(hctx);
-			preempt_enable();
+			put_cpu();
 			return;
 		}
 
-		preempt_enable();
+		put_cpu();
 	}
 
 	if (hctx->queue->nr_hw_queues == 1)
--
cgit v1.2.3
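As an editorial aside (not part of the patch): get_cpu() is preempt_disable() plus smp_processor_id(), and put_cpu() is preempt_enable(), so the conversion is behaviourally identical; it only documents why preemption is disabled. A minimal sketch of the primitive itself:

#include <linux/smp.h>

/*
 * Illustration of the get_cpu()/put_cpu() pattern, not blk-mq code:
 * everything between the two calls runs with preemption disabled and
 * therefore stays on the returned CPU.
 */
static void run_on_local_cpu(void (*fn)(int cpu, void *arg), void *arg)
{
	int cpu = get_cpu();	/* preempt_disable() + smp_processor_id() */

	fn(cpu, arg);		/* cannot migrate while this runs */

	put_cpu();		/* preempt_enable() */
}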
From 1a3b595a281a44be4074fe33b317a0a4854b4197 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 17 Nov 2014 10:40:48 -0700
Subject: blk-mq: export blk_mq_free_request()

Drivers that know they are blk-mq should just use this function instead
of calling through blk_put_request().

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 06ab0683a1f1..fdf12152946e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -280,6 +280,7 @@ void blk_mq_free_request(struct request *rq)
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 	__blk_mq_free_request(hctx, ctx, rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
--
cgit v1.2.3

From 7c7f2f2bc9a63f9605a16eabac59fc655dfe7c9a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 17 Nov 2014 10:41:57 -0700
Subject: blk-mq: add blk_mq_free_hctx_request()

It's silly to use blk_mq_free_request(), which in turn maps the request
to the hardware queue, for places where we already know what the
hardware queue is. This saves us an extra mapping of a hardware queue
on request completion, if the caller knows this information already.

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fdf12152946e..4347aa2be6ae 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -269,16 +269,23 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx;
-	struct request_queue *q = rq->q;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
-
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
 	__blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct request_queue *q = rq->q;
+
+	hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+	blk_mq_free_hctx_request(hctx, rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
--
cgit v1.2.3
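A hypothetical caller, for illustration only (the mydev_* structure and field names are invented, not taken from the patch or any driver): a driver that cached the hctx, for instance in its per-hardware-queue context, can free a request it owns without the extra map_queue() lookup that blk_mq_free_request() performs.

#include <linux/blk-mq.h>

struct mydev_hw_queue {			/* made-up per-hw-queue driver state */
	struct blk_mq_hw_ctx *hctx;	/* cached when the hctx was set up */
	struct request *special_rq;	/* internal request this driver allocated */
};

static void mydev_put_special_rq(struct mydev_hw_queue *mq)
{
	/*
	 * The hctx is already known here, so skip the per-request
	 * map_queue() that blk_mq_free_request() would redo.
	 */
	blk_mq_free_hctx_request(mq->hctx, mq->special_rq);
	mq->special_rq = NULL;
}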
From b657d7e632e0bc40e5e231332be39d69b2f1a0bb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 24 Nov 2014 09:27:23 +0100
Subject: blk-mq: handle the single queue case in blk_mq_hctx_next_cpu

Don't duplicate the handling of the case where we don't bounce the work
to another CPU in the caller; do it inside blk_mq_hctx_next_cpu instead.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4347aa2be6ae..27a347fe8f5b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -788,10 +788,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
-	int cpu = hctx->next_cpu;
+	if (hctx->queue->nr_hw_queues == 1)
+		return WORK_CPU_UNBOUND;
 
 	if (--hctx->next_cpu_batch <= 0) {
-		int next_cpu;
+		int cpu = hctx->next_cpu, next_cpu;
 
 		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
 		if (next_cpu >= nr_cpu_ids)
@@ -799,9 +800,11 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
 		hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+		return cpu;
 	}
 
-	return cpu;
+	return hctx->next_cpu;
 }
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
@@ -820,14 +823,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 		put_cpu();
 	}
 
-	if (hctx->queue->nr_hw_queues == 1)
-		kblockd_schedule_delayed_work(&hctx->run_work, 0);
-	else {
-		unsigned int cpu;
-
-		cpu = blk_mq_hctx_next_cpu(hctx);
-		kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
-	}
+	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+			&hctx->run_work, 0);
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -919,16 +916,8 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
-	unsigned long tmo = msecs_to_jiffies(msecs);
-
-	if (hctx->queue->nr_hw_queues == 1)
-		kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
-	else {
-		unsigned int cpu;
-
-		cpu = blk_mq_hctx_next_cpu(hctx);
-		kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
-	}
+	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+			&hctx->delay_work, msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
--
cgit v1.2.3

From 6637fadf25657e619a50fde5ff3ae09a98d20eb5 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Sun, 30 Nov 2014 16:00:58 -0800
Subject: blk-mq: move the kdump check to blk_mq_alloc_tag_set

We call blk_mq_alloc_tag_set() first, then blk_mq_init_queue(). The
requests are allocated in the former function, so the kdump check
should be moved there to really save memory.

Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 27a347fe8f5b..4854e709aa34 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1804,16 +1804,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
-	/*
-	 * If a crashdump is active, then we are potentially in a very
-	 * memory constrained environment. Limit us to 1 queue and
-	 * 64 tags to prevent using too much memory.
-	 */
-	if (is_kdump_kernel()) {
-		set->nr_hw_queues = 1;
-		set->queue_depth = min(64U, set->queue_depth);
-	}
-
 	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
 			set->numa_node);
 
@@ -2070,6 +2060,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		set->queue_depth = BLK_MQ_MAX_DEPTH;
 	}
 
+	/*
+	 * If a crashdump is active, then we are potentially in a very
+	 * memory constrained environment. Limit us to 1 queue and
+	 * 64 tags to prevent using too much memory.
+	 */
+	if (is_kdump_kernel()) {
+		set->nr_hw_queues = 1;
+		set->queue_depth = min(64U, set->queue_depth);
+	}
+
 	set->tags = kmalloc_node(set->nr_hw_queues *
				 sizeof(struct blk_mq_tags *),
				 GFP_KERNEL, set->numa_node);
--
cgit v1.2.3
From 19c66e59ce57e7b181625cbb408d48eb10837763 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 3 Dec 2014 19:38:04 +0800
Subject: blk-mq: prevent unmapped hw queue from being scheduled

When one hardware queue has no mapped software queues, it shouldn't
be scheduled. Otherwise a WARNING or OOPS can be triggered. The
blk_mq_hw_queue_mapped() helper is introduced to fix the problem.

Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'block/blk-mq.c')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4854e709aa34..b21a3b6f7b65 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -589,7 +589,7 @@ static void blk_mq_rq_timer(unsigned long priv)
 		 * If not software queues are currently mapped to this
 		 * hardware queue, there's nothing to check
 		 */
-		if (!hctx->nr_ctx || !hctx->tags)
+		if (!blk_mq_hw_queue_mapped(hctx))
 			continue;
 
 		blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
@@ -809,7 +809,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+	    !blk_mq_hw_queue_mapped(hctx)))
 		return;
 
 	if (!async) {
@@ -916,6 +917,9 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
+	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+		return;
+
 	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
 			&hctx->delay_work, msecs_to_jiffies(msecs));
 }
--
cgit v1.2.3
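The blk_mq_hw_queue_mapped() helper itself is added to block/blk-mq.h, which falls outside the hunks shown above; presumably it simply wraps the test that the first hunk used to open-code, along these lines:

/* Sketch of the helper in block/blk-mq.h (not visible in the diff above). */
static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
{
	return hctx->nr_ctx && hctx->tags;
}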