From 395c72a707d966b36d5a42fe12c3a237ded3a0d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:34:55 -0700 Subject: block: Generalized bio pool freeing With the old code, when you allocate a bio from a bio pool you have to implement your own destructor that knows how to find the bio pool the bio was originally allocated from. This adds a new field to struct bio (bi_pool) and changes bio_alloc_bioset() to use it. This makes various bio destructors unnecessary, so they're then deleted. v6: Explain the temporary if statement in bio_put Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown CC: Alasdair Kergon CC: Nicholas Bellinger CC: Lars Ellenberg Acked-by: Tejun Heo Acked-by: Nicholas Bellinger Signed-off-by: Jens Axboe --- fs/bio.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 71072ab99128..e017f7a5cdc6 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -272,10 +272,6 @@ EXPORT_SYMBOL(bio_init); * bio_alloc_bioset will try its own mempool to satisfy the allocation. * If %__GFP_WAIT is set then we will block on the internal pool waiting * for a &struct bio to become free. - * - * Note that the caller must set ->bi_destructor on successful return - * of a bio, to do the appropriate freeing of the bio once the reference - * count drops to zero. **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { @@ -290,6 +286,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio = p + bs->front_pad; bio_init(bio); + bio->bi_pool = bs; if (unlikely(!nr_iovecs)) goto out_set; @@ -316,11 +313,6 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); -static void bio_fs_destructor(struct bio *bio) -{ - bio_free(bio, fs_bio_set); -} - /** * bio_alloc - allocate a new bio, memory pool backed * @gfp_mask: allocation mask to use @@ -342,12 +334,7 @@ static void bio_fs_destructor(struct bio *bio) */ struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); - - if (bio) - bio->bi_destructor = bio_fs_destructor; - - return bio; + return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); } EXPORT_SYMBOL(bio_alloc); @@ -423,7 +410,16 @@ void bio_put(struct bio *bio) if (atomic_dec_and_test(&bio->bi_cnt)) { bio_disassociate_task(bio); bio->bi_next = NULL; - bio->bi_destructor(bio); + + /* + * This if statement is temporary - bi_pool is replacing + * bi_destructor, but bi_destructor will be taken out in another + * patch. + */ + if (bio->bi_pool) + bio_free(bio, bio->bi_pool); + else + bio->bi_destructor(bio); } } EXPORT_SYMBOL(bio_put); @@ -474,12 +470,11 @@ EXPORT_SYMBOL(__bio_clone); */ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) { - struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); + struct bio *b = bio_alloc(gfp_mask, bio->bi_max_vecs); if (!b) return NULL; - b->bi_destructor = bio_fs_destructor; __bio_clone(b, bio); if (bio_integrity(bio)) { -- cgit v1.2.3 From 1e2a410ff71504a64d1af2e354287ac51aeac1b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:34:56 -0700 Subject: block: Ues bi_pool for bio_integrity_alloc() Now that bios keep track of where they were allocated from, bio_integrity_alloc_bioset() becomes redundant. Remove bio_integrity_alloc_bioset() and drop bio_set argument from the related functions and make them use bio->bi_pool. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. 
Petersen Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 44 +++++++++++++++----------------------------- fs/bio.c | 6 +++--- 2 files changed, 18 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index e85c04b9f61c..a3f28f331b2b 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -70,23 +70,25 @@ static inline int use_bip_pool(unsigned int idx) } /** - * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to * @gfp_mask: Memory allocation mask * @nr_vecs: Number of integrity metadata scatter-gather elements - * @bs: bio_set to allocate from * * Description: This function prepares a bio for attaching integrity * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs, - struct bio_set *bs) +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) { struct bio_integrity_payload *bip; unsigned int idx = vecs_to_idx(nr_vecs); + struct bio_set *bs = bio->bi_pool; + + if (!bs) + bs = fs_bio_set; BUG_ON(bio == NULL); bip = NULL; @@ -114,37 +116,22 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, return bip; } -EXPORT_SYMBOL(bio_integrity_alloc_bioset); - -/** - * bio_integrity_alloc - Allocate integrity payload and attach it to bio - * @bio: bio to attach integrity metadata to - * @gfp_mask: Memory allocation mask - * @nr_vecs: Number of integrity metadata scatter-gather elements - * - * Description: This function prepares a bio for attaching integrity - * metadata. nr_vecs specifies the maximum number of pages containing - * integrity metadata that can be attached. - */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs) -{ - return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); -} EXPORT_SYMBOL(bio_integrity_alloc); /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed - * @bs: bio_set this bio was allocated from * * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). 
*/ -void bio_integrity_free(struct bio *bio, struct bio_set *bs) +void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_set *bs = bio->bi_pool; + + if (!bs) + bs = fs_bio_set; BUG_ON(bip == NULL); @@ -730,19 +717,18 @@ EXPORT_SYMBOL(bio_integrity_split); * @bio: New bio * @bio_src: Original bio * @gfp_mask: Memory allocation mask - * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - gfp_t gfp_mask, struct bio_set *bs) + gfp_t gfp_mask) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); if (bip == NULL) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index e017f7a5cdc6..b14f71adff4a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -241,7 +241,7 @@ void bio_free(struct bio *bio, struct bio_set *bs) bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); if (bio_integrity(bio)) - bio_integrity_free(bio, bs); + bio_integrity_free(bio); /* * If we have front padding, adjust the bio pointer before freeing @@ -341,7 +341,7 @@ EXPORT_SYMBOL(bio_alloc); static void bio_kmalloc_destructor(struct bio *bio) { if (bio_integrity(bio)) - bio_integrity_free(bio, fs_bio_set); + bio_integrity_free(bio); kfree(bio); } @@ -480,7 +480,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask); if (ret < 0) { bio_put(b); -- cgit v1.2.3 From f44b48c7691be7643877d1f881b5eeace654d05d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:34:58 -0700 Subject: block: Add bio_reset() Reusing bios is something that's been highly frowned upon in the past, but driver code keeps doing it anyways. If it's going to happen anyways, we should provide a generic method. This'll help with getting rid of bi_destructor - drivers/block/pktcdvd.c was open coding it, by doing a bio_init() and resetting bi_destructor. This required reordering struct bio, but the block layer is not yet nearly fast enough for any cacheline effects to matter here. v5: Add a define BIO_RESET_BITS, to be very explicit about what parts of bio->bi_flags are saved. v6: Further commenting verbosity, per Tejun v9: Add a function comment Signed-off-by: Kent Overstreet CC: Jens Axboe Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index b14f71adff4a..919ee9aa5c57 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -262,6 +262,30 @@ void bio_init(struct bio *bio) } EXPORT_SYMBOL(bio_init); +/** + * bio_reset - reinitialize a bio + * @bio: bio to reset + * + * Description: + * After calling bio_reset(), @bio will be in the same state as a freshly + * allocated bio returned bio bio_alloc_bioset() - the only fields that are + * preserved are the ones that are initialized by bio_alloc_bioset(). See + * comment in struct bio. 
+ */ +void bio_reset(struct bio *bio) +{ + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + + if (bio_integrity(bio)) + bio_integrity_free(bio); + + bio_disassociate_task(bio); + + memset(bio, 0, BIO_RESET_BYTES); + bio->bi_flags = flags|(1 << BIO_UPTODATE); +} +EXPORT_SYMBOL(bio_reset); + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator -- cgit v1.2.3 From 4254bba17d92d53822a56ebc2a0c1eb7e2a71155 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:35:00 -0700 Subject: block: Kill bi_destructor Now that we've got generic code for freeing bios allocated from bio pools, this isn't needed anymore. This patch also makes bio_free() static, since without bi_destructor there should be no need for it to be called anywhere else. bio_free() is now only called from bio_put, so we can refactor those a bit - move some code from bio_put() to bio_free() and kill the redundant bio->bi_next = NULL. v5: Switch to BIO_KMALLOC_POOL ((void *)~0), per Boaz v6: BIO_KMALLOC_POOL now NULL, drop bio_free's EXPORT_SYMBOL v7: No #define BIO_KMALLOC_POOL anymore Signed-off-by: Kent Overstreet CC: Jens Axboe Signed-off-by: Jens Axboe --- fs/bio.c | 64 ++++++++++++++++++++++++++-------------------------------------- 1 file changed, 26 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 919ee9aa5c57..736ef12f5191 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -233,26 +233,37 @@ fallback: return bvl; } -void bio_free(struct bio *bio, struct bio_set *bs) +static void __bio_free(struct bio *bio) { - void *p; - - if (bio_has_allocated_vec(bio)) - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + bio_disassociate_task(bio); if (bio_integrity(bio)) bio_integrity_free(bio); +} - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - if (bs->front_pad) +static void bio_free(struct bio *bio) +{ + struct bio_set *bs = bio->bi_pool; + void *p; + + __bio_free(bio); + + if (bs) { + if (bio_has_allocated_vec(bio)) + bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; p -= bs->front_pad; - mempool_free(p, bs->bio_pool); + mempool_free(p, bs->bio_pool); + } else { + /* Bio was allocated by bio_kmalloc() */ + kfree(bio); + } } -EXPORT_SYMBOL(bio_free); void bio_init(struct bio *bio) { @@ -276,10 +287,7 @@ void bio_reset(struct bio *bio) { unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - if (bio_integrity(bio)) - bio_integrity_free(bio); - - bio_disassociate_task(bio); + __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); bio->bi_flags = flags|(1 << BIO_UPTODATE); @@ -362,13 +370,6 @@ struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) } EXPORT_SYMBOL(bio_alloc); -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_integrity(bio)) - bio_integrity_free(bio); - kfree(bio); -} - /** * bio_kmalloc - allocate a bio for I/O using kmalloc() * @gfp_mask: the GFP_ mask given to the slab allocator @@ -395,7 +396,6 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bio->bi_inline_vecs; - bio->bi_destructor = bio_kmalloc_destructor; return bio; } @@ -431,20 +431,8 @@ void bio_put(struct bio *bio) /* * last put frees it */ - if (atomic_dec_and_test(&bio->bi_cnt)) { - bio_disassociate_task(bio); - bio->bi_next = NULL; - - /* - * This if statement is 
temporary - bi_pool is replacing - * bi_destructor, but bi_destructor will be taken out in another - * patch. - */ - if (bio->bi_pool) - bio_free(bio, bio->bi_pool); - else - bio->bi_destructor(bio); - } + if (atomic_dec_and_test(&bio->bi_cnt)) + bio_free(bio); } EXPORT_SYMBOL(bio_put); -- cgit v1.2.3 From 3f86a82aeb03e6100f7ab39f4702e033a5e38166 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:35:01 -0700 Subject: block: Consolidate bio_alloc_bioset(), bio_kmalloc() Previously, bio_kmalloc() and bio_alloc_bioset() behaved slightly different because there was some almost-duplicated code - this fixes some of that. The important change is that previously bio_kmalloc() always set bi_io_vec = bi_inline_vecs, even if nr_iovecs == 0 - unlike bio_alloc_bioset(). This would cause bio_has_data() to return true; I don't know if this resulted in any actual bugs but it was certainly wrong. bio_kmalloc() and bio_alloc_bioset() also have different arbitrary limits on nr_iovecs - 1024 (UIO_MAXIOV) for bio_kmalloc(), 256 (BIO_MAX_PAGES) for bio_alloc_bioset(). This patch doesn't fix that, but at least they're enforced closer together and hopefully they will be fixed in a later patch. This'll also help with some future cleanups - there are a fair number of functions that allocate bios (e.g. bio_clone()), and now they don't have to be duplicated for bio_alloc(), bio_alloc_bioset(), and bio_kmalloc(). Signed-off-by: Kent Overstreet CC: Jens Axboe v7: Re-add dropped comments, improv patch description Signed-off-by: Jens Axboe --- fs/bio.c | 110 +++++++++++++++++++++------------------------------------------ 1 file changed, 37 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 736ef12f5191..191b9b86c272 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { * IO code that does not need private memory pools. */ struct bio_set *fs_bio_set; +EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management @@ -301,39 +302,58 @@ EXPORT_SYMBOL(bio_reset); * @bs: the bio_set to allocate from. * * Description: - * bio_alloc_bioset will try its own mempool to satisfy the allocation. - * If %__GFP_WAIT is set then we will block on the internal pool waiting - * for a &struct bio to become free. - **/ + * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + * backed by the @bs's mempool. + * + * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be + * able to allocate a bio. This is due to the mempool guarantees. To make this + * work, callers must never allocate more than 1 bio at a time from this pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. 
+ */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned front_pad; + unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; struct bio *bio; void *p; - p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!bs) { + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + p = kmalloc(sizeof(struct bio) + + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + front_pad = 0; + inline_vecs = nr_iovecs; + } else { + p = mempool_alloc(bs->bio_pool, gfp_mask); + front_pad = bs->front_pad; + inline_vecs = BIO_INLINE_VECS; + } + if (unlikely(!p)) return NULL; - bio = p + bs->front_pad; + bio = p + front_pad; bio_init(bio); - bio->bi_pool = bs; - - if (unlikely(!nr_iovecs)) - goto out_set; - if (nr_iovecs <= BIO_INLINE_VECS) { - bvl = bio->bi_inline_vecs; - nr_iovecs = BIO_INLINE_VECS; - } else { + if (nr_iovecs > inline_vecs) { bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) goto err_free; - - nr_iovecs = bvec_nr_vecs(idx); + } else if (nr_iovecs) { + bvl = bio->bi_inline_vecs; } -out_set: + + bio->bi_pool = bs; bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; bio->bi_io_vec = bvl; @@ -345,62 +365,6 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); -/** - * bio_alloc - allocate a new bio, memory pool backed - * @gfp_mask: allocation mask to use - * @nr_iovecs: number of iovecs - * - * bio_alloc will allocate a bio and associated bio_vec array that can hold - * at least @nr_iovecs entries. Allocations will be done from the - * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. - * - * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate - * a bio. This is due to the mempool guarantees. To make this work, callers - * must never allocate more than 1 bio at a time from this pool. Callers - * that need to allocate more than 1 bio must always submit the previously - * allocated bio for IO before attempting to allocate a new one. Failure to - * do so can cause livelocks under memory pressure. - * - * RETURNS: - * Pointer to new bio on success, NULL on failure. - */ -struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); -} -EXPORT_SYMBOL(bio_alloc); - -/** - * bio_kmalloc - allocate a bio for I/O using kmalloc() - * @gfp_mask: the GFP_ mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate - * - * Description: - * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains - * %__GFP_WAIT, the allocation is guaranteed to succeed. - * - **/ -struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) -{ - struct bio *bio; - - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - if (unlikely(!bio)) - return NULL; - - bio_init(bio); - bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bio->bi_inline_vecs; - - return bio; -} -EXPORT_SYMBOL(bio_kmalloc); - void zero_fill_bio(struct bio *bio) { unsigned long flags; -- cgit v1.2.3 From bf800ef1816b4283a885e55ad38068aec9711e4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 15:35:02 -0700 Subject: block: Add bio_clone_bioset(), bio_clone_kmalloc() Previously, there was bio_clone() but it only allocated from the fs bio set; as a result various users were open coding it and using __bio_clone(). 
This changes bio_clone() to become bio_clone_bioset(), and then we add bio_clone() and bio_clone_kmalloc() as wrappers around it, making use of the functionality the last patch adedd. This will also help in a later patch changing how bio cloning works. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown CC: Alasdair Kergon CC: Boaz Harrosh CC: Jeff Garzik Acked-by: Jeff Garzik Signed-off-by: Jens Axboe --- fs/bio.c | 11 +++++++---- fs/exofs/ore.c | 5 ++--- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 191b9b86c272..13e956779e10 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -438,16 +438,19 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) EXPORT_SYMBOL(__bio_clone); /** - * bio_clone - clone a bio + * bio_clone_bioset - clone a bio * @bio: bio to clone * @gfp_mask: allocation priority + * @bs: bio_set to allocate from * * Like __bio_clone, only also allocates the returned bio */ -struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) +struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, + struct bio_set *bs) { - struct bio *b = bio_alloc(gfp_mask, bio->bi_max_vecs); + struct bio *b; + b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); if (!b) return NULL; @@ -466,7 +469,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) return b; } -EXPORT_SYMBOL(bio_clone); +EXPORT_SYMBOL(bio_clone_bioset); /** * bio_get_nr_vecs - return approx number of vecs diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 1585db1aa365..f936cb50dc0d 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -814,8 +814,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) struct bio *bio; if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_KERNEL, - master_dev->bio->bi_max_vecs); + bio = bio_clone_kmalloc(master_dev->bio, + GFP_KERNEL); if (unlikely(!bio)) { ORE_DBGMSG( "Failed to allocate BIO size=%u\n", @@ -824,7 +824,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } - __bio_clone(bio, master_dev->bio); bio->bi_bdev = NULL; bio->bi_next = NULL; per_dev->offset = master_dev->offset; -- cgit v1.2.3 From 4363ac7c13a9a4b763c6e8d9fdbfc2468f3b8ca4 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 18 Sep 2012 12:19:27 -0400 Subject: block: Implement support for WRITE SAME The WRITE SAME command supported on some SCSI devices allows the same block to be efficiently replicated throughout a block range. Only a single logical block is transferred from the host and the storage device writes the same data to all blocks described by the I/O. This patch implements support for WRITE SAME in the block layer. The blkdev_issue_write_same() function can be used by filesystems and block drivers to replicate a buffer across a block range. This can be used to efficiently initialize software RAID devices, etc. Signed-off-by: Martin K. 
Petersen
Acked-by: Mike Snitzer
Signed-off-by: Jens Axboe
---
 fs/bio.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 13e956779e10..f855e0e1869c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1487,9 +1487,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)

 	bp->bv1 = bi->bi_io_vec[0];
 	bp->bv2 = bi->bi_io_vec[0];
-	bp->bv2.bv_offset += first_sectors << 9;
-	bp->bv2.bv_len -= first_sectors << 9;
-	bp->bv1.bv_len = first_sectors << 9;
+
+	if (bio_is_rw(bi)) {
+		bp->bv2.bv_offset += first_sectors << 9;
+		bp->bv2.bv_len -= first_sectors << 9;
+		bp->bv1.bv_len = first_sectors << 9;
+	}

 	bp->bio1.bi_io_vec = &bp->bv1;
 	bp->bio2.bi_io_vec = &bp->bv2;
-- cgit v1.2.3

From b87570f5d349661814b262dd5fc40787700f80d6 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Wed, 26 Sep 2012 07:46:40 +0200
Subject: Fix a crash when block device is read and block size is changed at the same time

The kernel may crash when the block size is changed and I/O is issued
simultaneously. Because some subsystems (udev or lvm) may read any block
device at any time, the bug actually puts any code that changes a block
device's block size in jeopardy.

The crash can be reproduced if you place msleep(1000) in blkdev_get_blocks
just before "bh->b_size = max_blocks << inode->i_blkbits;" and then run

	dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct

While it is waiting in msleep, run

	blockdev --setbsz 2048 /dev/ram0

You get a BUG.

Direct and non-direct I/O is written with the assumption that the block
size does not change. It doesn't seem practical to fix these crashes one
by one: there may be many places that can crash when the block size
changes under them, and it is impossible to find them all and verify the
code.

This patch introduces a new rw-lock, bd_block_size_semaphore. The lock is
taken for read during I/O and for write when changing the block size.
Consequently, the block size can't be changed while I/O is being
submitted.

For asynchronous I/O, the patch only prevents a block size change while
the I/O is being submitted. The block size can change while the I/O is in
progress or being finished; this is acceptable because there are no
accesses to the block size when asynchronous I/O is being finished.

The patch also prevents the block size from changing while the device is
mapped with mmap.
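In outline, the locking rule the patch introduces looks like this (a
condensed sketch of the change below, not the literal diff; error
handling, the mmap check and the other I/O entry points are omitted):

/* Readers: each I/O entry point brackets its work with the new lock. */
ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
			unsigned long nr_segs, loff_t pos)
{
	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
	ssize_t ret;

	down_read(&bdev->bd_block_size_semaphore);
	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
	up_read(&bdev->bd_block_size_semaphore);

	return ret;
}

/* Writer: set_blocksize() takes the same lock for write, so the block
 * size cannot change while any such I/O is in flight. */
int set_blocksize(struct block_device *bdev, int size)
{
	int ret = 0;

	down_write(&bdev->bd_block_size_semaphore);
	/* ... validate size, sync, update bd_block_size and i_blkbits ... */
	up_write(&bdev->bd_block_size_semaphore);

	return ret;
}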
Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- fs/block_dev.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 38e721b35d45..cdfb625824e2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev); int set_blocksize(struct block_device *bdev, int size) { + struct address_space *mapping; + /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; @@ -124,6 +126,20 @@ int set_blocksize(struct block_device *bdev, int size) if (size < bdev_logical_block_size(bdev)) return -EINVAL; + /* Prevent starting I/O or mapping the device */ + down_write(&bdev->bd_block_size_semaphore); + + /* Check that the block device is not memory mapped */ + mapping = bdev->bd_inode->i_mapping; + mutex_lock(&mapping->i_mmap_mutex); + if (!prio_tree_empty(&mapping->i_mmap) || + !list_empty(&mapping->i_mmap_nonlinear)) { + mutex_unlock(&mapping->i_mmap_mutex); + up_write(&bdev->bd_block_size_semaphore); + return -EBUSY; + } + mutex_unlock(&mapping->i_mmap_mutex); + /* Don't change the size if it is same as current */ if (bdev->bd_block_size != size) { sync_blockdev(bdev); @@ -131,6 +147,9 @@ int set_blocksize(struct block_device *bdev, int size) bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } + + up_write(&bdev->bd_block_size_semaphore); + return 0; } @@ -472,6 +491,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); + init_rwsem(&bdev->bd_block_size_semaphore); } static inline void __bd_forget(struct inode *inode) @@ -1567,6 +1587,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + ssize_t ret; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + + down_read(&bdev->bd_block_size_semaphore); + + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + + up_read(&bdev->bd_block_size_semaphore); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_read); + /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. 
@@ -1578,12 +1614,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct blk_plug plug;
 	ssize_t ret;

 	BUG_ON(iocb->ki_pos != pos);

 	blk_start_plug(&plug);
+
+	down_read(&bdev->bd_block_size_semaphore);
+
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1592,11 +1632,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+
+	up_read(&bdev->bd_block_size_semaphore);
+
 	blk_finish_plug(&plug);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);

+int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int ret;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+
+	down_read(&bdev->bd_block_size_semaphore);
+
+	ret = generic_file_mmap(file, vma);
+
+	up_read(&bdev->bd_block_size_semaphore);
+
+	return ret;
+}
+
 /*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
@@ -1627,9 +1685,9 @@ const struct file_operations def_blk_fops = {
 	.llseek		= block_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.aio_read	= generic_file_aio_read,
+	.aio_read	= blkdev_aio_read,
 	.aio_write	= blkdev_aio_write,
-	.mmap		= generic_file_mmap,
+	.mmap		= blkdev_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
-- cgit v1.2.3

From 62ac665ff9fc07497ca524bd20d6a96893d11071 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Wed, 26 Sep 2012 07:46:43 +0200
Subject: blockdev: turn a rw semaphore into a percpu rw semaphore

This avoids cache line bouncing when many processes lock the semaphore
for read.

New percpu lock implementation:

The lock consists of an array of percpu unsigned integers, a boolean
variable and a mutex.

When we take the lock for read, we enter an RCU read section and check
the "locked" variable. If it is false, we increment a percpu counter on
the current cpu and exit the RCU section. If "locked" is true, we exit
the RCU section, take the mutex and drop it (this waits until a writer
has finished) and retry.

Unlocking for read just decrements the percpu counter. Note that we can
unlock on a different cpu than where we locked; in that case the counter
underflows. The sum of all percpu counters represents the number of
processes that hold the lock for read.

When we need to lock for write, we take the mutex, set the "locked"
variable to true and synchronize RCU. Since RCU has been synchronized,
no process can create a new read lock. We then wait until the sum of the
percpu counters is zero - when it is, there are no readers in the
critical section.
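For illustration, the algorithm just described can be sketched roughly as
follows (illustrative names, simplified: the memory barriers and the exact
layout of the real linux/percpu-rwsem.h implementation are omitted):

struct percpu_rw_semaphore {
	unsigned __percpu *counters;	/* per-cpu reader counts */
	bool locked;			/* a writer holds the lock */
	struct mutex mtx;		/* serializes writers */
};

static unsigned sketch_read_count(struct percpu_rw_semaphore *p)
{
	unsigned sum = 0;
	int cpu;

	/* Individual counters may have underflowed; only the sum matters. */
	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(p->counters, cpu);
	return sum;
}

static void sketch_down_read(struct percpu_rw_semaphore *p)
{
	rcu_read_lock();
	if (unlikely(p->locked)) {
		/* A writer is active: wait for it by taking and dropping
		 * the writer mutex, then count ourselves as a reader. */
		rcu_read_unlock();
		mutex_lock(&p->mtx);
		this_cpu_inc(*p->counters);
		mutex_unlock(&p->mtx);
		return;
	}
	this_cpu_inc(*p->counters);
	rcu_read_unlock();
}

static void sketch_up_read(struct percpu_rw_semaphore *p)
{
	/* May run on a different cpu than the down_read. */
	this_cpu_dec(*p->counters);
}

static void sketch_down_write(struct percpu_rw_semaphore *p)
{
	mutex_lock(&p->mtx);
	p->locked = true;
	synchronize_rcu();		/* no new readers on the fast path */
	while (sketch_read_count(p))	/* wait for existing readers */
		msleep(1);
}

static void sketch_up_write(struct percpu_rw_semaphore *p)
{
	p->locked = false;
	mutex_unlock(&p->mtx);
}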
Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- fs/block_dev.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index cdfb625824e2..7eeb0635338b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size) return -EINVAL; /* Prevent starting I/O or mapping the device */ - down_write(&bdev->bd_block_size_semaphore); + percpu_down_write(&bdev->bd_block_size_semaphore); /* Check that the block device is not memory mapped */ mapping = bdev->bd_inode->i_mapping; @@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size) if (!prio_tree_empty(&mapping->i_mmap) || !list_empty(&mapping->i_mmap_nonlinear)) { mutex_unlock(&mapping->i_mmap_mutex); - up_write(&bdev->bd_block_size_semaphore); + percpu_up_write(&bdev->bd_block_size_semaphore); return -EBUSY; } mutex_unlock(&mapping->i_mmap_mutex); @@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size) kill_bdev(bdev); } - up_write(&bdev->bd_block_size_semaphore); + percpu_up_write(&bdev->bd_block_size_semaphore); return 0; } @@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; + + if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) { + kmem_cache_free(bdev_cachep, ei); + return NULL; + } + return &ei->vfs_inode; } @@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct bdev_inode *bdi = BDEV_I(inode); + percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore); + kmem_cache_free(bdev_cachep, bdi); } @@ -491,7 +499,6 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. 
 */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
-	init_rwsem(&bdev->bd_block_size_semaphore);
 }

 static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t ret;
 	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);

-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);

 	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);

-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);

 	return ret;
 }
@@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,

 	blk_start_plug(&plug);

-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);

 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		ret = err;
 	}

-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);

 	blk_finish_plug(&plug);

@@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret;
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);

-	down_read(&bdev->bd_block_size_semaphore);
+	percpu_down_read(&bdev->bd_block_size_semaphore);

 	ret = generic_file_mmap(file, vma);

-	up_read(&bdev->bd_block_size_semaphore);
+	percpu_up_read(&bdev->bd_block_size_semaphore);

 	return ret;
 }
-- cgit v1.2.3

From 3eab7315c8dd6685f58acba00319dd8b80a21d7a Mon Sep 17 00:00:00 2001
From: Fengguang Wu
Date: Wed, 26 Sep 2012 09:57:55 +0200
Subject: fs/block_dev.c:1644:5: sparse: symbol 'blkdev_mmap' was not declared

blkdev_mmap() isn't used outside of fs/block_dev.c, so mark it static.

Reported-by: Fengguang Wu
Signed-off-by: Jens Axboe
---
 fs/block_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7eeb0635338b..37967bcea05c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1648,7 +1648,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);

-int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	int ret;
 	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-- cgit v1.2.3

From 02f3939e1a9357b7c370a4a69717cf9c02452737 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 28 Sep 2012 10:38:48 +0200
Subject: block: makes bio_split support bio without data

A discard bio has no data attached, and we hit a BUG_ON with such a bio.
This makes bio_split() work for bios without data.
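For context, the caller pattern that has to keep working looks roughly
like this (a sketch of what md/raid0 does when a request straddles a
chunk boundary; submit_split() and sectors_to_boundary are illustrative
names, not code from this patch):

/* Split 'bio' at the given sector count, submit both halves, and drop
 * the pair. With a discard bio, bi_vcnt is 0, which bio_split() must
 * now tolerate. */
static void submit_split(struct bio *bio, int sectors_to_boundary)
{
	struct bio_pair *bp = bio_split(bio, sectors_to_boundary);

	if (!bp)
		return;

	generic_make_request(&bp->bio1);
	generic_make_request(&bp->bio2);
	bio_pair_release(bp);
}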
Signed-off-by: Shaohua Li Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- fs/bio.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index f855e0e1869c..9298c65ad9c7 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1475,7 +1475,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_sector + first_sectors); - BUG_ON(bi->bi_vcnt != 1); + BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0); BUG_ON(bi->bi_idx != 0); atomic_set(&bp->cnt, 3); bp->error = 0; @@ -1485,20 +1485,22 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) bp->bio2.bi_size -= first_sectors << 9; bp->bio1.bi_size = first_sectors << 9; - bp->bv1 = bi->bi_io_vec[0]; - bp->bv2 = bi->bi_io_vec[0]; + if (bi->bi_vcnt != 0) { + bp->bv1 = bi->bi_io_vec[0]; + bp->bv2 = bi->bi_io_vec[0]; - if (bio_is_rw(bi)) { - bp->bv2.bv_offset += first_sectors << 9; - bp->bv2.bv_len -= first_sectors << 9; - bp->bv1.bv_len = first_sectors << 9; - } + if (bio_is_rw(bi)) { + bp->bv2.bv_offset += first_sectors << 9; + bp->bv2.bv_len -= first_sectors << 9; + bp->bv1.bv_len = first_sectors << 9; + } - bp->bio1.bi_io_vec = &bp->bv1; - bp->bio2.bi_io_vec = &bp->bv2; + bp->bio1.bi_io_vec = &bp->bv1; + bp->bio2.bi_io_vec = &bp->bv2; - bp->bio1.bi_max_vecs = 1; - bp->bio2.bi_max_vecs = 1; + bp->bio1.bi_max_vecs = 1; + bp->bio2.bi_max_vecs = 1; + } bp->bio1.bi_end_io = bio_pair_end_1; bp->bio2.bi_end_io = bio_pair_end_2; -- cgit v1.2.3