From f0276924fa35a3607920a58cf5d878212824b951 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 31 Dec 2013 11:38:50 +0800
Subject: blk-mq: Don't reserve a tag for flush request

Reserving a tag (request) for flush to avoid deadlock is overkill. A tag
is a valuable resource. We can track the number of flush requests and
disallow having too many pending flush requests allocated. With this
patch, blk_mq_alloc_request_pinned() may briefly busy-loop (but not loop
forever) if too many pending flush requests are already allocated when a
new flush request is allocated. This should not be a problem, though,
since having too many pending flush requests is a very rare case.

I verified this can fix the deadlock caused by too many pending flush
requests.

Signed-off-by: Shaohua Li
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 161b23105b1e..1e8f16f65af4 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,12 +36,15 @@ struct blk_mq_hw_ctx {
 	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
+	atomic_t		pending_flush;
+
 	unsigned long		queued;
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		queue_depth;
+	unsigned int		reserved_tags;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
--
cgit v1.2.3

From 80bfa2f6e2e81049fc6cd3bfaeedcb64db3a9ba6 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne
Date: Tue, 4 Feb 2014 11:26:15 +0100
Subject: xen-blkif: drop struct blkif_request_segment_aligned
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This was wrongly introduced in commit 402b27f9; the only difference
between blkif_request_segment_aligned and blkif_request_segment is that
the former has a named padding field, while both share the same memory
layout.

Also correct a few minor glitches in the description, including making it
no longer assume PAGE_SIZE == 4096.

Signed-off-by: Roger Pau Monné
[Description fix by Jan Beulich]
Signed-off-by: Jan Beulich
Reported-by: Jan Beulich
Cc: Konrad Rzeszutek Wilk
Cc: David Vrabel
Cc: Boris Ostrovsky
Tested-by: Matt Rushton
Cc: Matt Wilson
Signed-off-by: Konrad Rzeszutek Wilk
---
 include/xen/interface/io/blkif.h | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
index ae665ac59c36..32ec05a6572f 100644
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -113,13 +113,13 @@ typedef uint64_t blkif_sector_t;
  * it's less than the number provided by the backend. The indirect_grefs field
  * in blkif_request_indirect should be filled by the frontend with the
  * grant references of the pages that are holding the indirect segments.
- * This pages are filled with an array of blkif_request_segment_aligned
- * that hold the information about the segments. The number of indirect
- * pages to use is determined by the maximum number of segments
- * a indirect request contains. Every indirect page can contain a maximum
- * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
- * so to calculate the number of indirect pages to use we have to do
- * ceil(indirect_segments/512).
+ * These pages are filled with an array of blkif_request_segment that hold the
+ * information about the segments. The number of indirect pages to use is
+ * determined by the number of segments an indirect request contains. Every
+ * indirect page can contain a maximum of
+ * (PAGE_SIZE / sizeof(struct blkif_request_segment)) segments, so to
+ * calculate the number of indirect pages to use we have to do
+ * ceil(indirect_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))).
 *
 * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
 * create the "feature-max-indirect-segments" node!
@@ -135,13 +135,12 @@ typedef uint64_t blkif_sector_t;
 
 #define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8
 
-struct blkif_request_segment_aligned {
-	grant_ref_t gref;	/* reference to I/O buffer frame */
-	/* @first_sect: first sector in frame to transfer (inclusive). */
-	/* @last_sect: last sector in frame to transfer (inclusive). */
-	uint8_t first_sect, last_sect;
-	uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */
-} __attribute__((__packed__));
+struct blkif_request_segment {
+	grant_ref_t gref;	/* reference to I/O buffer frame */
+	/* @first_sect: first sector in frame to transfer (inclusive). */
+	/* @last_sect: last sector in frame to transfer (inclusive). */
+	uint8_t first_sect, last_sect;
+};
 
 struct blkif_request_rw {
 	uint8_t nr_segments;	/* number of segments */
@@ -151,12 +150,7 @@ struct blkif_request_rw {
 #endif
 	uint64_t id;		/* private guest value, echoed in resp */
 	blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
-	struct blkif_request_segment {
-		grant_ref_t gref;	/* reference to I/O buffer frame */
-		/* @first_sect: first sector in frame to transfer (inclusive). */
-		/* @last_sect: last sector in frame to transfer (inclusive). */
-		uint8_t first_sect, last_sect;
-	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 } __attribute__((__packed__));
 
 struct blkif_request_discard {
--
cgit v1.2.3
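The page-count rule spelled out in the rewritten blkif.h comment above can be made concrete with a small helper. This is only an illustrative sketch, not code from the patch: the helper name is invented, and it assumes the usual kernel DIV_ROUND_UP() macro plus the struct blkif_request_segment shown in the diff.

#include <linux/kernel.h>		/* DIV_ROUND_UP() */
#include <linux/mm.h>			/* PAGE_SIZE */
#include <xen/interface/io/blkif.h>

/*
 * Illustrative helper (not part of the patch): number of indirect grant
 * pages needed to describe nr_segments segments, i.e.
 * ceil(nr_segments / (PAGE_SIZE / sizeof(struct blkif_request_segment))).
 */
static inline unsigned int blkif_indirect_pages(unsigned int nr_segments)
{
	unsigned int segs_per_page =
		PAGE_SIZE / sizeof(struct blkif_request_segment);

	return DIV_ROUND_UP(nr_segments, segs_per_page);
}

With 4 KiB pages this works out to the 512 segments per page that the old comment hard-coded, since the segment struct still occupies 8 bytes; the reworded comment simply avoids baking in PAGE_SIZE == 4096.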
From 72a0a36e2854a6eadb4cf2561858f613f9cd4639 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 7 Feb 2014 10:22:36 -0800
Subject: blk-mq: support at_head insertions for blk_execute_rq

This is needed for proper SG_IO operation as well as various uses of
blk_execute_rq from the SCSI midlayer.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1e8f16f65af4..b7638be58599 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -122,7 +122,8 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request_queue *, struct request *, bool);
+void blk_mq_insert_request(struct request_queue *, struct request *,
+		bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
--
cgit v1.2.3

From c4540a7d8c1e595560e53acedf88901daf15a2b5 Mon Sep 17 00:00:00 2001
From: Rashika Kheria
Date: Sun, 9 Feb 2014 18:30:39 +0530
Subject: fs: Add prototype declaration to appropriate header file include/linux/bio.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a prototype declaration to the header file include/linux/bio.h
because the function is used by more than one file.

This eliminates the following warning in bio-integrity.c:
fs/bio-integrity.c:214:14: warning: no previous prototype for ‘bio_integrity_tag_size’ [-Wmissing-prototypes]

Signed-off-by: Rashika Kheria
Reviewed-by: Josh Triplett
Signed-off-by: Jens Axboe
---
 include/linux/bio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 70654521dab6..d6791bba8264 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -332,6 +332,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
 extern struct bio_set *fs_bio_set;
+unsigned int bio_integrity_tag_size(struct bio *bio);
 
 static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
--
cgit v1.2.3
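To see why the extra bool on blk_mq_insert_request() matters, consider the kind of caller blk_execute_rq() serves. The sketch below is illustrative only and is not from the patch: the function name, the minimal command setup, and the error handling are simplifications, and the new bool argument added above is assumed to carry the at_head flag through to blk-mq.

#include <linux/blkdev.h>

/*
 * Minimal sketch (assumption-laden): a passthrough-style submitter that
 * depends on head-of-queue insertion, e.g. SG_IO or SCSI midlayer helpers.
 */
static int issue_internal_command(struct request_queue *q, struct gendisk *disk)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;
	rq->cmd_type = REQ_TYPE_BLOCK_PC;	/* command bytes elided */

	/*
	 * at_head = 1 asks the block layer to insert the request ahead of
	 * already-queued I/O, so ioctl and error-recovery commands are not
	 * starved behind filesystem writes.
	 */
	err = blk_execute_rq(q, disk, rq, 1);

	blk_put_request(rq);
	return err;
}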
From 30a91cb4ef385fe1b260df204ef314d86fff2850 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 10 Feb 2014 03:24:38 -0800
Subject: blk-mq: rework I/O completions

Rework I/O completions to work more like the old code path.
blk_mq_end_io now stays out of the business of deferring completions to
other CPUs and of calling blk_mark_rq_complete. The latter is very
important to allow completing requests that have timed out and thus are
already marked complete; the former allows using the IPI callout even
for driver-specific completions instead of having to reimplement them.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b7638be58599..468be242db90 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -86,6 +86,8 @@ struct blk_mq_ops {
 	 */
 	rq_timed_out_fn		*timeout;
 
+	softirq_done_fn		*complete;
+
 	/*
 	 * Override for hctx allocations (should probably go)
 	 */
@@ -137,6 +139,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
 void blk_mq_end_io(struct request *rq, int error);
 
+void blk_mq_complete_request(struct request *rq);
+
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
--
cgit v1.2.3
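To make the new hook concrete, here is a rough sketch of how a blk-mq driver might plug into it. This is not from the patch: the driver names are invented and the status handling is a placeholder; only the ->complete field, blk_mq_complete_request() and blk_mq_end_io() come from the header changes above.

#include <linux/blk-mq.h>

/* Hardware interrupt path: hand the finished request back to blk-mq. */
static void mydrv_complete_from_irq(struct request *rq)
{
	/*
	 * blk_mq_complete_request() takes over the work blk_mq_end_io used to
	 * do: marking the request complete and bouncing the completion to the
	 * submitting CPU when rq_affinity asks for it.
	 */
	blk_mq_complete_request(rq);
}

/* ->complete callback: does the actual end-of-I/O work. */
static void mydrv_softirq_done(struct request *rq)
{
	int error = 0;	/* assumption: fetch the real status from the driver */

	blk_mq_end_io(rq, error);
}

static struct blk_mq_ops mydrv_mq_ops = {
	/* .queue_rq, .map_queue, .alloc_hctx, ... omitted */
	.complete	= mydrv_softirq_done,
};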
From 18741986a4b1dc4b1f171634c4191abc3b0fa023 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 10 Feb 2014 09:29:00 -0700
Subject: blk-mq: rework flush sequencing logic

Switch to using a preallocated flush_rq for blk-mq, similar to what's
done with the old request path. This allows us to set up the request
properly with a tag from the actually allowed range and ->rq_disk as
needed by some drivers. To make life easier we also switch to dynamic
allocation of ->flush_rq for the old path.

This effectively reverts most of "blk-mq: fix for flush deadlock" and
"blk-mq: Don't reserve a tag for flush request".

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 include/linux/blk-mq.h |  5 +----
 include/linux/blkdev.h | 11 +++--------
 2 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 468be242db90..18ba8a627f46 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,15 +36,12 @@ struct blk_mq_hw_ctx {
 	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
-	atomic_t		pending_flush;
-
 	unsigned long		queued;
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
 	unsigned int		queue_depth;
-	unsigned int		reserved_tags;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
@@ -129,7 +126,7 @@ void blk_mq_insert_request(struct request_queue *, struct request *,
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0375654adb28..b2d25ecbcbc1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -101,7 +101,7 @@ struct request {
 	};
 	union {
 		struct call_single_data csd;
-		struct work_struct mq_flush_data;
+		struct work_struct mq_flush_work;
 	};
 
 	struct request_queue *q;
@@ -451,13 +451,8 @@ struct request_queue {
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
-	union {
-		struct request	flush_rq;
-		struct {
-			spinlock_t mq_flush_lock;
-			struct work_struct mq_flush_work;
-		};
-	};
+	struct request		*flush_rq;
+	spinlock_t		mq_flush_lock;
 
 	struct mutex		sysfs_lock;
 
--
cgit v1.2.3

From 8423ae3d7a3cfe084865262cfaeba1359d405182 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 10 Feb 2014 17:45:50 -0800
Subject: block: Fix cloning of discard/write same bios

Immutable biovecs changed the way bio segments are treated, such that
bio_for_each_segment() can no longer do what we want for discard/write
same bios, since bi_size means something completely different for them.

Fortunately discard and write same bios never have more than a single
biovec, so bio_for_each_segment() is unnecessary and not terribly
meaningful for them, but we still have to special-case them in a few
places.

Signed-off-by: Kent Overstreet
Tested-by: Richard W.M. Jones
Signed-off-by: Jens Axboe
---
 include/linux/bio.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index d6791bba8264..5a4d39b4686b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -250,6 +250,17 @@ static inline unsigned bio_segments(struct bio *bio)
 	struct bio_vec bv;
 	struct bvec_iter iter;
 
+	/*
+	 * We special case discard/write same, because they interpret bi_size
+	 * differently:
+	 */
+
+	if (bio->bi_rw & REQ_DISCARD)
+		return 1;
+
+	if (bio->bi_rw & REQ_WRITE_SAME)
+		return 1;
+
 	bio_for_each_segment(bv, bio, iter)
 		segs++;
 
--
cgit v1.2.3
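A short illustration of the invariant the last patch relies on: for discard and write-same bios, bi_size is the number of bytes affected on disk rather than the number of payload bytes carried in biovecs, so counting segments by walking bi_size would be meaningless. This sketch is not part of the patch; the helper name is made up and the pr_info() is purely for demonstration.

#include <linux/kernel.h>
#include <linux/bio.h>

/* Illustration only: discard/write-same bios carry at most one biovec. */
static void show_discard_segment_count(struct bio *bio)
{
	if (bio->bi_rw & (REQ_DISCARD | REQ_WRITE_SAME))
		/* bio_segments() now reports 1 instead of iterating bi_size. */
		pr_info("discard/write-same bio: %u segment(s), %u bytes affected\n",
			bio_segments(bio), bio->bi_iter.bi_size);
}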