From f0276924fa35a3607920a58cf5d878212824b951 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 31 Dec 2013 11:38:50 +0800
Subject: blk-mq: Don't reserve a tag for flush request

Reserving a tag (request) for flush to avoid deadlock is overkill. A tag
is a valuable resource. We can track the number of flush requests and
disallow having too many pending flush requests allocated. With this
patch, blk_mq_alloc_request_pinned() may briefly busy-loop (but not loop
forever) if too many pending flush requests are already allocated when a
new flush request is allocated. This should not be a problem, though,
since having too many pending flush requests is a very rare case.

I verified this can fix the deadlock caused by too many pending flush
requests.

Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 161b23105b1e..1e8f16f65af4 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,12 +36,15 @@ struct blk_mq_hw_ctx {
 	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
+	atomic_t		pending_flush;
+
 	unsigned long		queued;
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		queue_depth;
+	unsigned int		reserved_tags;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
--
cgit v1.2.3

From 80bfa2f6e2e81049fc6cd3bfaeedcb64db3a9ba6 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne
Date: Tue, 4 Feb 2014 11:26:15 +0100
Subject: xen-blkif: drop struct blkif_request_segment_aligned
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This was wrongly introduced in commit 402b27f9; the only difference
between blkif_request_segment_aligned and blkif_request_segment is that
the former has a named padding field, while both share the same memory
layout.

Also correct a few minor glitches in the description, including making it
no longer assume PAGE_SIZE == 4096.

Signed-off-by: Roger Pau Monné
[Description fix by Jan Beulich]
Signed-off-by: Jan Beulich
Reported-by: Jan Beulich
Cc: Konrad Rzeszutek Wilk
Cc: David Vrabel
Cc: Boris Ostrovsky
Tested-by: Matt Rushton
Cc: Matt Wilson
Signed-off-by: Konrad Rzeszutek Wilk
---
 include/xen/interface/io/blkif.h | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index ae665ac59c36..32ec05a6572f 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -113,13 +113,13 @@ typedef uint64_t blkif_sector_t;
  * it's less than the number provided by the backend. The indirect_grefs field
  * in blkif_request_indirect should be filled by the frontend with the
  * grant references of the pages that are holding the indirect segments.
- * This pages are filled with an array of blkif_request_segment_aligned
- * that hold the information about the segments. The number of indirect
- * pages to use is determined by the maximum number of segments
- * a indirect request contains. Every indirect page can contain a maximum
- * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
- * so to calculate the number of indirect pages to use we have to do
- * ceil(indirect_segments/512).
+ * These pages are filled with an array of blkif_request_segment that hold the
+ * information about the segments. The number of indirect pages to use is
+ * determined by the number of segments an indirect request contains. Every
+ * indirect page can contain a maximum of
+ * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to
+ * calculate the number of indirect pages to use we have to do
+ * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))).
 *
 * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
 * create the "feature-max-indirect-segments" node!
@@ -135,13 +135,12 @@ typedef uint64_t blkif_sector_t;
 
 #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
 
-struct blkif_request_segment_aligned {
-	grant_ref_t gref;	/* reference to I/O buffer frame */
-	/* @first_sect: first sector in frame to transfer (inclusive). */
-	/* @last_sect: last sector in frame to transfer (inclusive). */
-	uint8_t first_sect, last_sect;
-	uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
-} __attribute__((__packed__));
+struct blkif_request_segment {
+	grant_ref_t gref;	/* reference to I/O buffer frame */
+	/* @first_sect: first sector in frame to transfer (inclusive). */
+	/* @last_sect: last sector in frame to transfer (inclusive). */
+	uint8_t first_sect, last_sect;
+};
 
 struct blkif_request_rw {
 	uint8_t nr_segments;	/* number of segments */
@@ -151,12 +150,7 @@ struct blkif_request_rw {
 #endif
 	uint64_t id;		/* private guest value, echoed in resp */
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
-	struct blkif_request_segment {
-		grant_ref_t gref;	/* reference to I/O buffer frame */
-		/* @first_sect: first sector in frame to transfer (inclusive). */
-		/* @last_sect: last sector in frame to transfer (inclusive). */
-		uint8_t first_sect, last_sect;
-	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 } __attribute__((__packed__));
 
 struct blkif_request_discard {
--
cgit v1.2.3
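The page-count rule spelled out in the rewritten blkif.h comment above can be made concrete with a small helper. This is only an illustrative sketch, not code from the patch: the helper name is invented, and it assumes the usual kernel DIV_ROUND_UP() macro plus the struct blkif_request_segment shown in the diff.

#include <linux/kernel.h>		/* DIV_ROUND_UP() */
#include <linux/mm.h>			/* PAGE_SIZE */
#include <xen/interface/io/blkif.h>

/*
 * Illustrative helper (not part of the patch): number of indirect grant
 * pages needed to describe nr_segments segments, i.e.
 * ceil(nr_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))).
 */
static inline unsigned int blkif_indirect_pages(unsigned int nr_segments)
{
	unsigned int segs_per_page =
		PAGE_SIZE / sizeof(struct blkif_request_segment);

	return DIV_ROUND_UP(nr_segments, segs_per_page);
}

With 4 KiB pages this works out to the 512 segments per page that the old comment hard-coded, since the segment struct still occupies 8 bytes; the reworded comment simply avoids baking in PAGE_SIZE == 4096.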
From 72a0a36e2854a6eadb4cf2561858f613f9cd4639 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 7 Feb 2014 10:22:36 -0800
Subject: blk-mq: support at_head insertions for blk_execute_rq

This is needed for proper SG_IO operation as well as various uses of
blk_execute_rq from the SCSI midlayer.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1e8f16f65af4..b7638be58599 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -122,7 +122,8 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request_queue *, struct request *, bool);
+void blk_mq_insert_request(struct request_queue *, struct request *,
+		bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
--
cgit v1.2.3

From c4540a7d8c1e595560e53acedf88901daf15a2b5 Mon Sep 17 00:00:00 2001
From: Rashika Kheria
Date: Sun, 9 Feb 2014 18:30:39 +0530
Subject: fs: Add prototype declaration to appropriate header file include/linux/bio.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a prototype declaration to the header file include/linux/bio.h
because the function is used by more than one file.

This eliminates the following warning in bio-integrity.c:
fs/bio-integrity.c:214:14: warning: no previous prototype for ‘bio_integrity_tag_size’ [-Wmissing-prototypes]

Signed-off-by: Rashika Kheria
Reviewed-by: Josh Triplett
Signed-off-by: Jens Axboe
---
 include/linux/bio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 70654521dab6..d6791bba8264 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -332,6 +332,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
 extern struct bio_set *fs_bio_set;
+unsigned int bio_integrity_tag_size(struct bio *bio);
 
 static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
--
cgit v1.2.3
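To see why the extra bool on blk_mq_insert_request() matters, consider the kind of caller blk_execute_rq() serves. The sketch below is illustrative only and is not from the patch: the function name, the minimal command setup, and the error handling are simplifications, and the new bool argument added above is assumed to carry the at_head flag through to blk-mq.

#include <linux/blkdev.h>

/*
 * Minimal sketch (assumption-laden): a passthrough-style submitter that
 * depends on head-of-queue insertion, e.g. SG_IO or SCSI midlayer helpers.
 */
static int issue_internal_command(struct request_queue *q, struct gendisk *disk)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;
	rq->cmd_type = REQ_TYPE_BLOCK_PC;	/* command bytes elided */

	/*
	 * at_head = 1 asks the block layer to insert the request ahead of
	 * already-queued I/O, so ioctl and error-recovery commands are not
	 * starved behind filesystem writes.
	 */
	err = blk_execute_rq(q, disk, rq, 1);

	blk_put_request(rq);
	return err;
}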
From 30a91cb4ef385fe1b260df204ef314d86fff2850 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 10 Feb 2014 03:24:38 -0800
Subject: blk-mq: rework I/O completions

Rework I/O completions to work more like the old code path.
blk_mq_end_io now stays out of the business of deferring completions to
other CPUs and of calling blk_mark_rq_complete. The latter is very
important to allow completing requests that have timed out and thus are
already marked complete; the former allows using the IPI callout even
for driver-specific completions instead of having to reimplement them.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b7638be58599..468be242db90 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -86,6 +86,8 @@ struct blk_mq_ops {
 	 */
 	rq_timed_out_fn		*timeout;
 
+	softirq_done_fn		*complete;
+
 	/*
 	 * Override for hctx allocations (should probably go)
 	 */
@@ -137,6 +139,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
 void blk_mq_end_io(struct request *rq, int error);
 
+void blk_mq_complete_request(struct request *rq);
+
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
--
cgit v1.2.3
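To make the new hook concrete, here is a rough sketch of how a blk-mq driver might plug into it. This is not from the patch: the driver names are invented and the status handling is a placeholder; only the ->complete field, blk_mq_complete_request() and blk_mq_end_io() come from the header changes above.

#include <linux/blk-mq.h>

/* Hardware interrupt path: hand the finished request back to blk-mq. */
static void mydrv_complete_from_irq(struct request *rq)
{
	/*
	 * blk_mq_complete_request() takes over the work blk_mq_end_io used to
	 * do: marking the request complete and bouncing the completion to the
	 * submitting CPU when rq_affinity asks for it.
	 */
	blk_mq_complete_request(rq);
}

/* ->complete callback: does the actual end-of-I/O work. */
static void mydrv_softirq_done(struct request *rq)
{
	int error = 0;	/* assumption: fetch the real status from the driver */

	blk_mq_end_io(rq, error);
}

static struct blk_mq_ops mydrv_mq_ops = {
	/* .queue_rq, .map_queue, .alloc_hctx, ... omitted */
	.complete	= mydrv_softirq_done,
};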
From 18741986a4b1dc4b1f171634c4191abc3b0fa023 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 10 Feb 2014 09:29:00 -0700
Subject: blk-mq: rework flush sequencing logic

Switch to using a preallocated flush_rq for blk-mq, similar to what's
done with the old request path. This allows us to set up the request
properly with a tag from the actually allowed range and ->rq_disk as
needed by some drivers. To make life easier we also switch to dynamic
allocation of ->flush_rq for the old path.

This effectively reverts most of "blk-mq: fix for flush deadlock" and
"blk-mq: Don't reserve a tag for flush request".

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h |  5 +----
 include/linux/blkdev.h | 11 +++--------
 2 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 468be242db90..18ba8a627f46 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,15 +36,12 @@ struct blk_mq_hw_ctx {
 	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
-	atomic_t		pending_flush;
-
 	unsigned long		queued;
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		queue_depth;
-	unsigned int		reserved_tags;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
@@ -129,7 +126,7 @@ void blk_mq_insert_request(struct request_queue *, struct request *,
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0375654adb28..b2d25ecbcbc1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -101,7 +101,7 @@ struct request {
 	};
 	union {
 		struct call_single_data csd;
-		struct work_struct mq_flush_data;
+		struct work_struct mq_flush_work;
 	};
 
 	struct request_queue *q;
@@ -451,13 +451,8 @@ struct request_queue {
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
-	union {
-		struct request	flush_rq;
-		struct {
-			spinlock_t mq_flush_lock;
-			struct work_struct mq_flush_work;
-		};
-	};
+	struct request		*flush_rq;
+	spinlock_t		mq_flush_lock;
 
 	struct mutex		sysfs_lock;
 
--
cgit v1.2.3

From 8423ae3d7a3cfe084865262cfaeba1359d405182 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 10 Feb 2014 17:45:50 -0800
Subject: block: Fix cloning of discard/write same bios

Immutable biovecs changed the way bio segments are treated, such that
bio_for_each_segment() can no longer do what we want for discard/write
same bios, since bi_size means something completely different for them.

Fortunately discard and write same bios never have more than a single
biovec, so bio_for_each_segment() is unnecessary and not terribly
meaningful for them, but we still have to special-case them in a few
places.

Signed-off-by: Kent Overstreet
Tested-by: Richard W.M. Jones
Signed-off-by: Jens Axboe
---
 include/linux/bio.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index d6791bba8264..5a4d39b4686b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -250,6 +250,17 @@ static inline unsigned bio_segments(struct bio *bio)
 	struct bio_vec bv;
 	struct bvec_iter iter;
 
+	/*
+	 * We special case discard/write same, because they interpret bi_size
+	 * differently:
+	 */
+
+	if (bio->bi_rw & REQ_DISCARD)
+		return 1;
+
+	if (bio->bi_rw & REQ_WRITE_SAME)
+		return 1;
+
 	bio_for_each_segment(bv, bio, iter)
 		segs++;
 
--
cgit v1.2.3
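A short illustration of the invariant the last patch relies on: for discard and write-same bios, bi_size is the number of bytes affected on disk rather than the number of payload bytes carried in biovecs, so counting segments by walking bi_size would be meaningless. This sketch is not part of the patch; the helper name is made up and the pr_info() is purely for demonstration.

#include <linux/kernel.h>
#include <linux/bio.h>

/* Illustration only: discard/write-same bios carry at most one biovec. */
static void show_discard_segment_count(struct bio *bio)
{
	if (bio->bi_rw & (REQ_DISCARD | REQ_WRITE_SAME))
		/* bio_segments() now reports 1 instead of iterating bi_size. */
		pr_info("discard/write-same bio: %u segment(s), %u bytes affected\n",
			bio_segments(bio), bio->bi_iter.bi_size);
}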