summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-10-11 09:04:23 +0900
committerLinus Torvalds <torvalds@linux-foundation.org>2012-10-11 09:04:23 +0900
commitce40be7a820bb393ac4ac69865f018d2f4038cf0 (patch)
treeb1fe5a93346eb06f22b1c303d63ec5456d7212ab
parentba0a5a36f60e4c1152af3a2ae2813251974405bf (diff)
parent02f3939e1a9357b7c370a4a69717cf9c02452737 (diff)
downloadlinux-ce40be7a820bb393ac4ac69865f018d2f4038cf0.tar.gz
linux-ce40be7a820bb393ac4ac69865f018d2f4038cf0.tar.bz2
linux-ce40be7a820bb393ac4ac69865f018d2f4038cf0.zip
Merge branch 'for-3.7/core' of git://git.kernel.dk/linux-block
Pull block IO update from Jens Axboe: "Core block IO bits for 3.7. Not a huge round this time, it contains: - First series from Kent cleaning up and generalizing bio allocation and freeing. - WRITE_SAME support from Martin. - Mikulas patches to prevent O_DIRECT crashes when someone changes the block size of a device. - Make bio_split() work on data-less bio's (like trim/discards). - A few other minor fixups." Fixed up silent semantic mis-merge as per Mikulas Patocka and Andrew Morton. It is due to the VM no longer using a prio-tree (see commit 6b2dbba8b6ac: "mm: replace vma prio_tree with an interval tree"). So make set_blocksize() use mapping_mapped() instead of open-coding the internal VM knowledge that has changed. * 'for-3.7/core' of git://git.kernel.dk/linux-block: (26 commits) block: makes bio_split support bio without data scatterlist: refactor the sg_nents scatterlist: add sg_nents fs: fix include/percpu-rwsem.h export error percpu-rw-semaphore: fix documentation typos fs/block_dev.c:1644:5: sparse: symbol 'blkdev_mmap' was not declared blockdev: turn a rw semaphore into a percpu rw semaphore Fix a crash when block device is read and block size is changed at the same time block: fix request_queue->flags initialization block: lift the initial queue bypass mode on blk_register_queue() instead of blk_init_allocated_queue() block: ioctl to zero block ranges block: Make blkdev_issue_zeroout use WRITE SAME block: Implement support for WRITE SAME block: Consolidate command flag and queue limit checks for merges block: Clean up special command handling logic block/blk-tag.c: Remove useless kfree block: remove the duplicated setting for congestion_threshold block: reject invalid queue attribute values block: Add bio_clone_bioset(), bio_clone_kmalloc() block: Consolidate bio_alloc_bioset(), bio_kmalloc() ...
-rw-r--r--Documentation/ABI/testing/sysfs-block14
-rw-r--r--Documentation/block/biodoc.txt5
-rw-r--r--Documentation/percpu-rw-semaphore.txt27
-rw-r--r--block/blk-core.c51
-rw-r--r--block/blk-lib.c104
-rw-r--r--block/blk-merge.c53
-rw-r--r--block/blk-settings.c16
-rw-r--r--block/blk-sysfs.c44
-rw-r--r--block/blk-tag.c6
-rw-r--r--block/blk.h5
-rw-r--r--block/elevator.c6
-rw-r--r--block/ioctl.c27
-rw-r--r--drivers/block/drbd/drbd_main.c13
-rw-r--r--drivers/block/osdblk.c3
-rw-r--r--drivers/block/pktcdvd.c52
-rw-r--r--drivers/char/raw.c2
-rw-r--r--drivers/md/dm-crypt.c16
-rw-r--r--drivers/md/dm-io.c11
-rw-r--r--drivers/md/dm.c74
-rw-r--r--drivers/md/md.c44
-rw-r--r--drivers/md/raid0.c1
-rw-r--r--drivers/target/target_core_iblock.c9
-rw-r--r--fs/bio-integrity.c44
-rw-r--r--fs/bio.c231
-rw-r--r--fs/block_dev.c68
-rw-r--r--fs/exofs/ore.c5
-rw-r--r--include/linux/bio.h70
-rw-r--r--include/linux/blk_types.h36
-rw-r--r--include/linux/blkdev.h82
-rw-r--r--include/linux/fs.h6
-rw-r--r--include/linux/percpu-rwsem.h89
-rw-r--r--include/linux/scatterlist.h1
-rw-r--r--lib/scatterlist.c19
33 files changed, 770 insertions, 464 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index c1eb41cb9876..279da08f7541 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -206,3 +206,17 @@ Description:
when a discarded area is read the discard_zeroes_data
parameter will be set to one. Otherwise it will be 0 and
the result of reading a discarded area is undefined.
+
+What: /sys/block/<disk>/queue/write_same_max_bytes
+Date: January 2012
+Contact: Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+ Some devices support a write same operation in which a
+ single data block can be written to a range of several
+ contiguous blocks on storage. This can be used to wipe
+ areas on disk or to initialize drives in a RAID
+ configuration. write_same_max_bytes indicates how many
+ bytes can be written in a single write same command. If
+ write_same_max_bytes is 0, write same is not supported
+ by the device.
+
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index e418dc0a7086..8df5e8e6dceb 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -465,7 +465,6 @@ struct bio {
bio_end_io_t *bi_end_io; /* bi_end_io (bio) */
atomic_t bi_cnt; /* pin count: free when it hits zero */
void *bi_private;
- bio_destructor_t *bi_destructor; /* bi_destructor (bio) */
};
With this multipage bio design:
@@ -647,10 +646,6 @@ for a non-clone bio. There are the 6 pools setup for different size biovecs,
so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
given size from these slabs.
-The bi_destructor() routine takes into account the possibility of the bio
-having originated from a different source (see later discussions on
-n/w to block transfers and kvec_cb)
-
The bio_get() routine may be used to hold an extra reference on a bio prior
to i/o submission, if the bio fields are likely to be accessed after the
i/o is issued (since the bio may otherwise get freed in case i/o completion
diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt
new file mode 100644
index 000000000000..7d3c82431909
--- /dev/null
+++ b/Documentation/percpu-rw-semaphore.txt
@@ -0,0 +1,27 @@
+Percpu rw semaphores
+--------------------
+
+Percpu rw semaphores is a new read-write semaphore design that is
+optimized for locking for reading.
+
+The problem with traditional read-write semaphores is that when multiple
+cores take the lock for reading, the cache line containing the semaphore
+is bouncing between L1 caches of the cores, causing performance
+degradation.
+
+Locking for reading is very fast, it uses RCU and it avoids any atomic
+instruction in the lock and unlock path. On the other hand, locking for
+writing is very expensive, it calls synchronize_rcu() that can take
+hundreds of milliseconds.
+
+The lock is declared with "struct percpu_rw_semaphore" type.
+The lock is initialized percpu_init_rwsem, it returns 0 on success and
+-ENOMEM on allocation failure.
+The lock must be freed with percpu_free_rwsem to avoid memory leak.
+
+The lock is locked for read with percpu_down_read, percpu_up_read and
+for write with percpu_down_write, percpu_up_write.
+
+The idea of using RCU for optimized rw-lock was introduced by
+Eric Dumazet <eric.dumazet@gmail.com>.
+The code was written by Mikulas Patocka <mpatocka@redhat.com>
diff --git a/block/blk-core.c b/block/blk-core.c
index d2da64170513..a33870b1847b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -606,8 +606,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
/*
* A queue starts its life with bypass turned on to avoid
* unnecessary bypass on/off overhead and nasty surprises during
- * init. The initial bypass will be finished at the end of
- * blk_init_allocated_queue().
+ * init. The initial bypass will be finished when the queue is
+ * registered by blk_register_queue().
*/
q->bypass_depth = 1;
__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
@@ -694,7 +694,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unprep_rq_fn = NULL;
- q->queue_flags = QUEUE_FLAG_DEFAULT;
+ q->queue_flags |= QUEUE_FLAG_DEFAULT;
/* Override internal queue lock with supplied lock pointer */
if (lock)
@@ -710,11 +710,6 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
/* init elevator */
if (elevator_init(q, NULL))
return NULL;
-
- blk_queue_congestion_threshold(q);
-
- /* all done, end the initial bypass */
- blk_queue_bypass_end(q);
return q;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1657,8 +1652,8 @@ generic_make_request_checks(struct bio *bio)
goto end_io;
}
- if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
- nr_sectors > queue_max_hw_sectors(q))) {
+ if (likely(bio_is_rw(bio) &&
+ nr_sectors > queue_max_hw_sectors(q))) {
printk(KERN_ERR "bio too big device %s (%u > %u)\n",
bdevname(bio->bi_bdev, b),
bio_sectors(bio),
@@ -1699,8 +1694,12 @@ generic_make_request_checks(struct bio *bio)
if ((bio->bi_rw & REQ_DISCARD) &&
(!blk_queue_discard(q) ||
- ((bio->bi_rw & REQ_SECURE) &&
- !blk_queue_secdiscard(q)))) {
+ ((bio->bi_rw & REQ_SECURE) && !blk_queue_secdiscard(q)))) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }
+
+ if (bio->bi_rw & REQ_WRITE_SAME && !bdev_write_same(bio->bi_bdev)) {
err = -EOPNOTSUPP;
goto end_io;
}
@@ -1810,15 +1809,20 @@ EXPORT_SYMBOL(generic_make_request);
*/
void submit_bio(int rw, struct bio *bio)
{
- int count = bio_sectors(bio);
-
bio->bi_rw |= rw;
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
- if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
+ if (bio_has_data(bio)) {
+ unsigned int count;
+
+ if (unlikely(rw & REQ_WRITE_SAME))
+ count = bdev_logical_block_size(bio->bi_bdev) >> 9;
+ else
+ count = bio_sectors(bio);
+
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
@@ -1864,11 +1868,10 @@ EXPORT_SYMBOL(submit_bio);
*/
int blk_rq_check_limits(struct request_queue *q, struct request *rq)
{
- if (rq->cmd_flags & REQ_DISCARD)
+ if (!rq_mergeable(rq))
return 0;
- if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
- blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
+ if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) {
printk(KERN_ERR "%s: over max size limit.\n", __func__);
return -EIO;
}
@@ -2340,7 +2343,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
req->buffer = bio_data(req->bio);
/* update sector only for requests with clear definition of sector */
- if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD))
+ if (req->cmd_type == REQ_TYPE_FS)
req->__sector += total_bytes >> 9;
/* mixed attributes always follow the first bio */
@@ -2781,16 +2784,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
blk_rq_init(NULL, rq);
__rq_for_each_bio(bio_src, rq_src) {
- bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
+ bio = bio_clone_bioset(bio_src, gfp_mask, bs);
if (!bio)
goto free_and_out;
- __bio_clone(bio, bio_src);
-
- if (bio_integrity(bio_src) &&
- bio_integrity_clone(bio, bio_src, gfp_mask, bs))
- goto free_and_out;
-
if (bio_ctr && bio_ctr(bio, bio_src, data))
goto free_and_out;
@@ -2807,7 +2804,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
free_and_out:
if (bio)
- bio_free(bio, bs);
+ bio_put(bio);
blk_rq_unprep_clone(rq);
return -ENOMEM;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 19cc761cacb2..9373b58dfab1 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -130,6 +130,80 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
EXPORT_SYMBOL(blkdev_issue_discard);
/**
+ * blkdev_issue_write_same - queue a write same operation
+ * @bdev: target blockdev
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @page: page containing data to write
+ *
+ * Description:
+ * Issue a write same request for the sectors in question.
+ */
+int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask,
+ struct page *page)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ struct request_queue *q = bdev_get_queue(bdev);
+ unsigned int max_write_same_sectors;
+ struct bio_batch bb;
+ struct bio *bio;
+ int ret = 0;
+
+ if (!q)
+ return -ENXIO;
+
+ max_write_same_sectors = q->limits.max_write_same_sectors;
+
+ if (max_write_same_sectors == 0)
+ return -EOPNOTSUPP;
+
+ atomic_set(&bb.done, 1);
+ bb.flags = 1 << BIO_UPTODATE;
+ bb.wait = &wait;
+
+ while (nr_sects) {
+ bio = bio_alloc(gfp_mask, 1);
+ if (!bio) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ bio->bi_sector = sector;
+ bio->bi_end_io = bio_batch_end_io;
+ bio->bi_bdev = bdev;
+ bio->bi_private = &bb;
+ bio->bi_vcnt = 1;
+ bio->bi_io_vec->bv_page = page;
+ bio->bi_io_vec->bv_offset = 0;
+ bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
+
+ if (nr_sects > max_write_same_sectors) {
+ bio->bi_size = max_write_same_sectors << 9;
+ nr_sects -= max_write_same_sectors;
+ sector += max_write_same_sectors;
+ } else {
+ bio->bi_size = nr_sects << 9;
+ nr_sects = 0;
+ }
+
+ atomic_inc(&bb.done);
+ submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio);
+ }
+
+ /* Wait for bios in-flight */
+ if (!atomic_dec_and_test(&bb.done))
+ wait_for_completion(&wait);
+
+ if (!test_bit(BIO_UPTODATE, &bb.flags))
+ ret = -ENOTSUPP;
+
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_write_same);
+
+/**
* blkdev_issue_zeroout - generate number of zero filed write bios
* @bdev: blockdev to issue
* @sector: start sector
@@ -140,7 +214,7 @@ EXPORT_SYMBOL(blkdev_issue_discard);
* Generate and issue number of bios with zerofiled pages.
*/
-int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask)
{
int ret;
@@ -190,4 +264,32 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
return ret;
}
+
+/**
+ * blkdev_issue_zeroout - zero-fill a block range
+ * @bdev: blockdev to write
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ *
+ * Description:
+ * Generate and issue number of bios with zerofiled pages.
+ */
+
+int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask)
+{
+ if (bdev_write_same(bdev)) {
+ unsigned char bdn[BDEVNAME_SIZE];
+
+ if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
+ ZERO_PAGE(0)))
+ return 0;
+
+ bdevname(bdev, bdn);
+ pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
+ }
+
+ return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
+}
EXPORT_SYMBOL(blkdev_issue_zeroout);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e76279e41162..936a110de0b9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -275,14 +275,8 @@ no_merge:
int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
- unsigned short max_sectors;
-
- if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
- max_sectors = queue_max_hw_sectors(q);
- else
- max_sectors = queue_max_sectors(q);
-
- if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
+ if (blk_rq_sectors(req) + bio_sectors(bio) >
+ blk_rq_get_max_sectors(req)) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
@@ -299,15 +293,8 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
- unsigned short max_sectors;
-
- if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
- max_sectors = queue_max_hw_sectors(q);
- else
- max_sectors = queue_max_sectors(q);
-
-
- if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
+ if (blk_rq_sectors(req) + bio_sectors(bio) >
+ blk_rq_get_max_sectors(req)) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
@@ -338,7 +325,8 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
/*
* Will it become too large?
*/
- if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q))
+ if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
+ blk_rq_get_max_sectors(req))
return 0;
total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
@@ -417,16 +405,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
if (!rq_mergeable(req) || !rq_mergeable(next))
return 0;
- /*
- * Don't merge file system requests and discard requests
- */
- if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD))
- return 0;
-
- /*
- * Don't merge discard requests and secure discard requests
- */
- if ((req->cmd_flags & REQ_SECURE) != (next->cmd_flags & REQ_SECURE))
+ if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags))
return 0;
/*
@@ -440,6 +419,10 @@ static int attempt_merge(struct request_queue *q, struct request *req,
|| next->special)
return 0;
+ if (req->cmd_flags & REQ_WRITE_SAME &&
+ !blk_write_same_mergeable(req->bio, next->bio))
+ return 0;
+
/*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
@@ -521,15 +504,10 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
- if (!rq_mergeable(rq))
+ if (!rq_mergeable(rq) || !bio_mergeable(bio))
return false;
- /* don't merge file system requests and discard requests */
- if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
- return false;
-
- /* don't merge discard requests and secure discard requests */
- if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
+ if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw))
return false;
/* different data direction or already started, don't merge */
@@ -544,6 +522,11 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_integrity(bio) != blk_integrity_rq(rq))
return false;
+ /* must be using the same buffer */
+ if (rq->cmd_flags & REQ_WRITE_SAME &&
+ !blk_write_same_mergeable(rq->bio, bio))
+ return false;
+
return true;
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 565a6786032f..779bb7646bcd 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -113,6 +113,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+ lim->max_write_same_sectors = 0;
lim->max_discard_sectors = 0;
lim->discard_granularity = 0;
lim->discard_alignment = 0;
@@ -144,6 +145,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_segments = USHRT_MAX;
lim->max_hw_sectors = UINT_MAX;
lim->max_sectors = UINT_MAX;
+ lim->max_write_same_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -286,6 +288,18 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
/**
+ * blk_queue_max_write_same_sectors - set max sectors for a single write same
+ * @q: the request queue for the device
+ * @max_write_same_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_write_same_sectors(struct request_queue *q,
+ unsigned int max_write_same_sectors)
+{
+ q->limits.max_write_same_sectors = max_write_same_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
+
+/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
@@ -510,6 +524,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+ t->max_write_same_sectors = min(t->max_write_same_sectors,
+ b->max_write_same_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9628b291f960..ce6204608822 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -26,9 +26,15 @@ queue_var_show(unsigned long var, char *page)
static ssize_t
queue_var_store(unsigned long *var, const char *page, size_t count)
{
- char *p = (char *) page;
+ int err;
+ unsigned long v;
+
+ err = strict_strtoul(page, 10, &v);
+ if (err || v > UINT_MAX)
+ return -EINVAL;
+
+ *var = v;
- *var = simple_strtoul(p, &p, 10);
return count;
}
@@ -48,6 +54,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
return -EINVAL;
ret = queue_var_store(&nr, page, count);
+ if (ret < 0)
+ return ret;
+
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
@@ -102,6 +111,9 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
unsigned long ra_kb;
ssize_t ret = queue_var_store(&ra_kb, page, count);
+ if (ret < 0)
+ return ret;
+
q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
return ret;
@@ -168,6 +180,13 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
return queue_var_show(queue_discard_zeroes_data(q), page);
}
+static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)q->limits.max_write_same_sectors << 9);
+}
+
+
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
@@ -176,6 +195,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
+ if (ret < 0)
+ return ret;
+
if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
return -EINVAL;
@@ -236,6 +258,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
unsigned long nm;
ssize_t ret = queue_var_store(&nm, page, count);
+ if (ret < 0)
+ return ret;
+
spin_lock_irq(q->queue_lock);
queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
@@ -264,6 +289,9 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
unsigned long val;
ret = queue_var_store(&val, page, count);
+ if (ret < 0)
+ return ret;
+
spin_lock_irq(q->queue_lock);
if (val == 2) {
queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
@@ -364,6 +392,11 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
.show = queue_discard_zeroes_data_show,
};
+static struct queue_sysfs_entry queue_write_same_max_entry = {
+ .attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
+ .show = queue_write_same_max_show,
+};
+
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_nonrot,
@@ -411,6 +444,7 @@ static struct attribute *default_attrs[] = {
&queue_discard_granularity_entry.attr,
&queue_discard_max_entry.attr,
&queue_discard_zeroes_data_entry.attr,
+ &queue_write_same_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
@@ -527,6 +561,12 @@ int blk_register_queue(struct gendisk *disk)
if (WARN_ON(!q))
return -ENXIO;
+ /*
+ * Initialization must be complete by now. Finish the initial
+ * bypass from queue allocation.
+ */
+ blk_queue_bypass_end(q);
+
ret = blk_trace_init_sysfs(dev);
if (ret)
return ret;
diff --git a/block/blk-tag.c b/block/blk-tag.c
index 4af6f5cc1167..cc345e1d8d4e 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -186,7 +186,8 @@ int blk_queue_init_tags(struct request_queue *q, int depth,
tags = __blk_queue_init_tags(q, depth);
if (!tags)
- goto fail;
+ return -ENOMEM;
+
} else if (q->queue_tags) {
rc = blk_queue_resize_tags(q, depth);
if (rc)
@@ -203,9 +204,6 @@ int blk_queue_init_tags(struct request_queue *q, int depth,
queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
INIT_LIST_HEAD(&q->tag_busy_list);
return 0;
-fail:
- kfree(tags);
- return -ENOMEM;
}
EXPORT_SYMBOL(blk_queue_init_tags);
diff --git a/block/blk.h b/block/blk.h
index 2a0ea32d249f..ca51543b248c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -171,14 +171,13 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
*
* a) it's attached to a gendisk, and
* b) the queue had IO stats enabled when this request was started, and
- * c) it's a file system request or a discard request
+ * c) it's a file system request
*/
static inline int blk_do_io_stat(struct request *rq)
{
return rq->rq_disk &&
(rq->cmd_flags & REQ_IO_STAT) &&
- (rq->cmd_type == REQ_TYPE_FS ||
- (rq->cmd_flags & REQ_DISCARD));
+ (rq->cmd_type == REQ_TYPE_FS);
}
/*
diff --git a/block/elevator.c b/block/elevator.c
index 6a55d418896f..9b1d42b62f20 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -562,8 +562,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
if (rq->cmd_flags & REQ_SOFTBARRIER) {
/* barriers are scheduling boundary, update end_sector */
- if (rq->cmd_type == REQ_TYPE_FS ||
- (rq->cmd_flags & REQ_DISCARD)) {
+ if (rq->cmd_type == REQ_TYPE_FS) {
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
}
@@ -605,8 +604,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
if (elv_attempt_insert_merge(q, rq))
break;
case ELEVATOR_INSERT_SORT:
- BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
- !(rq->cmd_flags & REQ_DISCARD));
+ BUG_ON(rq->cmd_type != REQ_TYPE_FS);
rq->cmd_flags |= REQ_SORTED;
q->nr_sorted++;
if (rq_mergeable(rq)) {
diff --git a/block/ioctl.c b/block/ioctl.c
index 4a85096f5410..a31d91d9bc5a 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -185,6 +185,22 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
}
+static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
+ uint64_t len)
+{
+ if (start & 511)
+ return -EINVAL;
+ if (len & 511)
+ return -EINVAL;
+ start >>= 9;
+ len >>= 9;
+
+ if (start + len > (i_size_read(bdev->bd_inode) >> 9))
+ return -EINVAL;
+
+ return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL);
+}
+
static int put_ushort(unsigned long arg, unsigned short val)
{
return put_user(val, (unsigned short __user *)arg);
@@ -300,6 +316,17 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return blk_ioctl_discard(bdev, range[0], range[1],
cmd == BLKSECDISCARD);
}
+ case BLKZEROOUT: {
+ uint64_t range[2];
+
+ if (!(mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(range, (void __user *)arg, sizeof(range)))
+ return -EFAULT;
+
+ return blk_ioctl_zeroout(bdev, range[0], range[1]);
+ }
case HDIO_GETGEO: {
struct hd_geometry geo;
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index f93a0320e952..f55683ad4ffa 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -162,23 +162,12 @@ static const struct block_device_operations drbd_ops = {
.release = drbd_release,
};
-static void bio_destructor_drbd(struct bio *bio)
-{
- bio_free(bio, drbd_md_io_bio_set);
-}
-
struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
- struct bio *bio;
-
if (!drbd_md_io_bio_set)
return bio_alloc(gfp_mask, 1);
- bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
- if (!bio)
- return NULL;
- bio->bi_destructor = bio_destructor_drbd;
- return bio;
+ return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
}
#ifdef __CHECKER__
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 87311ebac0db..1bbc681688e4 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -266,11 +266,10 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
struct bio *tmp, *new_chain = NULL, *tail = NULL;
while (old_chain) {
- tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+ tmp = bio_clone_kmalloc(old_chain, gfpmask);
if (!tmp)
goto err_out;
- __bio_clone(tmp, old_chain);
tmp->bi_bdev = NULL;
gfpmask &= ~__GFP_WAIT;
tmp->bi_next = NULL;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index ba66e4445f41..2e7de7a59bfc 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -522,38 +522,6 @@ static void pkt_bio_finished(struct pktcdvd_device *pd)
}
}
-static void pkt_bio_destructor(struct bio *bio)
-{
- kfree(bio->bi_io_vec);
- kfree(bio);
-}
-
-static struct bio *pkt_bio_alloc(int nr_iovecs)
-{
- struct bio_vec *bvl = NULL;
- struct bio *bio;
-
- bio = kmalloc(sizeof(struct bio), GFP_KERNEL);
- if (!bio)
- goto no_bio;
- bio_init(bio);
-
- bvl = kcalloc(nr_iovecs, sizeof(struct bio_vec), GFP_KERNEL);
- if (!bvl)
- goto no_bvl;
-
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bvl;
- bio->bi_destructor = pkt_bio_destructor;
-
- return bio;
-
- no_bvl:
- kfree(bio);
- no_bio:
- return NULL;
-}
-
/*
* Allocate a packet_data struct
*/
@@ -567,7 +535,7 @@ static struct packet_data *pkt_alloc_packet_data(int frames)
goto no_pkt;
pkt->frames = frames;
- pkt->w_bio = pkt_bio_alloc(frames);
+ pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames);
if (!pkt->w_bio)
goto no_bio;
@@ -581,9 +549,10 @@ static struct packet_data *pkt_alloc_packet_data(int frames)
bio_list_init(&pkt->orig_bios);
for (i = 0; i < frames; i++) {
- struct bio *bio = pkt_bio_alloc(1);
+ struct bio *bio = bio_kmalloc(GFP_KERNEL, 1);
if (!bio)
goto no_rd_bio;
+
pkt->r_bios[i] = bio;
}
@@ -1111,21 +1080,17 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
* Schedule reads for missing parts of the packet.
*/
for (f = 0; f < pkt->frames; f++) {
- struct bio_vec *vec;
-
int p, offset;
+
if (written[f])
continue;
+
bio = pkt->r_bios[f];
- vec = bio->bi_io_vec;
- bio_init(bio);
- bio->bi_max_vecs = 1;
+ bio_reset(bio);
bio->bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
bio->bi_bdev = pd->bdev;
bio->bi_end_io = pkt_end_io_read;
bio->bi_private = pkt;
- bio->bi_io_vec = vec;
- bio->bi_destructor = pkt_bio_destructor;
p = (f * CD_FRAMESIZE) / PAGE_SIZE;
offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
@@ -1418,14 +1383,11 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
}
/* Start the write request */
- bio_init(pkt->w_bio);
- pkt->w_bio->bi_max_vecs = PACKET_MAX_SIZE;
+ bio_reset(pkt->w_bio);
pkt->w_bio->bi_sector = pkt->sector;
pkt->w_bio->bi_bdev = pd->bdev;
pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
pkt->w_bio->bi_private = pkt;
- pkt->w_bio->bi_io_vec = bvec;
- pkt->w_bio->bi_destructor = pkt_bio_destructor;
for (f = 0; f < pkt->frames; f++)
if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
BUG();
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 54a3a6d09819..0bb207eaef2f 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd,
static const struct file_operations raw_fops = {
.read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .aio_read = blkdev_aio_read,
.write = do_sync_write,
.aio_write = blkdev_aio_write,
.fsync = blkdev_fsync,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 664743d6a6cd..bbf459bca61d 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -798,14 +798,6 @@ static int crypt_convert(struct crypt_config *cc,
return 0;
}
-static void dm_crypt_bio_destructor(struct bio *bio)
-{
- struct dm_crypt_io *io = bio->bi_private;
- struct crypt_config *cc = io->cc;
-
- bio_free(bio, cc->bs);
-}
-
/*
* Generate a new unfragmented bio with the given size
* This should never violate the device limitations
@@ -974,7 +966,6 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
clone->bi_end_io = crypt_endio;
clone->bi_bdev = cc->dev->bdev;
clone->bi_rw = io->base_bio->bi_rw;
- clone->bi_destructor = dm_crypt_bio_destructor;
}
static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
@@ -988,19 +979,14 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
* copy the required bvecs because we need the original
* one in order to decrypt the whole bio data *afterwards*.
*/
- clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
+ clone = bio_clone_bioset(base_bio, gfp, cc->bs);
if (!clone)
return 1;
crypt_inc_pending(io);
clone_init(io, clone);
- clone->bi_idx = 0;
- clone->bi_vcnt = bio_segments(base_bio);
- clone->bi_size = base_bio->bi_size;
clone->bi_sector = cc->start + io->sector;
- memcpy(clone->bi_io_vec, bio_iovec(base_bio),
- sizeof(struct bio_vec) * clone->bi_vcnt);
generic_make_request(clone);
return 0;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ea5dd289fe2a..1c46f97d6664 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -249,16 +249,6 @@ static void vm_dp_init(struct dpages *dp, void *data)
dp->context_ptr = data;
}
-static void dm_bio_destructor(struct bio *bio)
-{
- unsigned region;
- struct io *io;
-
- retrieve_io_and_region_from_bio(bio, &io, &region);
-
- bio_free(bio, io->client->bios);
-}
-
/*
* Functions for getting the pages from kernel memory.
*/
@@ -317,7 +307,6 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
- bio->bi_destructor = dm_bio_destructor;
store_io_and_region_in_bio(bio, io, region);
if (rw & REQ_DISCARD) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 67ffa391edcf..66ceaff6455c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -86,12 +86,17 @@ struct dm_rq_target_io {
};
/*
- * For request-based dm.
- * One of these is allocated per bio.
+ * For request-based dm - the bio clones we allocate are embedded in these
+ * structs.
+ *
+ * We allocate these with bio_alloc_bioset, using the front_pad parameter when
+ * the bioset is created - this means the bio has to come at the end of the
+ * struct.
*/
struct dm_rq_clone_bio_info {
struct bio *orig;
struct dm_rq_target_io *tio;
+ struct bio clone;
};
union map_info *dm_get_mapinfo(struct bio *bio)
@@ -211,6 +216,11 @@ struct dm_md_mempools {
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
static struct kmem_cache *_rq_tio_cache;
+
+/*
+ * Unused now, and needs to be deleted. But since io_pool is overloaded and it's
+ * still used for _io_cache, I'm leaving this for a later cleanup
+ */
static struct kmem_cache *_rq_bio_info_cache;
static int __init local_init(void)
@@ -467,16 +477,6 @@ static void free_rq_tio(struct dm_rq_target_io *tio)
mempool_free(tio, tio->md->tio_pool);
}
-static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
-{
- return mempool_alloc(md->io_pool, GFP_ATOMIC);
-}
-
-static void free_bio_info(struct dm_rq_clone_bio_info *info)
-{
- mempool_free(info, info->tio->md->io_pool);
-}
-
static int md_in_flight(struct mapped_device *md)
{
return atomic_read(&md->pending[READ]) +
@@ -681,11 +681,6 @@ static void clone_endio(struct bio *bio, int error)
}
}
- /*
- * Store md for cleanup instead of tio which is about to get freed.
- */
- bio->bi_private = md->bs;
-
free_tio(md, tio);
bio_put(bio);
dec_pending(io, error);
@@ -1036,11 +1031,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
/* error the io and bail out, or requeue it if needed */
md = tio->io->md;
dec_pending(tio->io, r);
- /*
- * Store bio_set for cleanup.
- */
- clone->bi_end_io = NULL;
- clone->bi_private = md->bs;
bio_put(clone);
free_tio(md, tio);
} else if (r) {
@@ -1059,13 +1049,6 @@ struct clone_info {
unsigned short idx;
};
-static void dm_bio_destructor(struct bio *bio)
-{
- struct bio_set *bs = bio->bi_private;
-
- bio_free(bio, bs);
-}
-
/*
* Creates a little bio that just does part of a bvec.
*/
@@ -1077,7 +1060,6 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
struct bio_vec *bv = bio->bi_io_vec + idx;
clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
- clone->bi_destructor = dm_bio_destructor;
*clone->bi_io_vec = *bv;
clone->bi_sector = sector;
@@ -1090,7 +1072,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
clone->bi_flags |= 1 << BIO_CLONED;
if (bio_integrity(bio)) {
- bio_integrity_clone(clone, bio, GFP_NOIO, bs);
+ bio_integrity_clone(clone, bio, GFP_NOIO);
bio_integrity_trim(clone,
bio_sector_offset(bio, idx, offset), len);
}
@@ -1109,7 +1091,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
- clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
clone->bi_vcnt = idx + bv_count;
@@ -1117,7 +1098,7 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
if (bio_integrity(bio)) {
- bio_integrity_clone(clone, bio, GFP_NOIO, bs);
+ bio_integrity_clone(clone, bio, GFP_NOIO);
if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
bio_integrity_trim(clone,
@@ -1152,9 +1133,8 @@ static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
* and discard, so no need for concern about wasted bvec allocations.
*/
- clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
- __bio_clone(clone, ci->bio);
- clone->bi_destructor = dm_bio_destructor;
+ clone = bio_clone_bioset(ci->bio, GFP_NOIO, ci->md->bs);
+
if (len) {
clone->bi_sector = ci->sector;
clone->bi_size = to_bytes(len);
@@ -1484,30 +1464,17 @@ void dm_dispatch_request(struct request *rq)
}
EXPORT_SYMBOL_GPL(dm_dispatch_request);
-static void dm_rq_bio_destructor(struct bio *bio)
-{
- struct dm_rq_clone_bio_info *info = bio->bi_private;
- struct mapped_device *md = info->tio->md;
-
- free_bio_info(info);
- bio_free(bio, md->bs);
-}
-
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
void *data)
{
struct dm_rq_target_io *tio = data;
- struct mapped_device *md = tio->md;
- struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
-
- if (!info)
- return -ENOMEM;
+ struct dm_rq_clone_bio_info *info =
+ container_of(bio, struct dm_rq_clone_bio_info, clone);
info->orig = bio_orig;
info->tio = tio;
bio->bi_end_io = end_clone_bio;
bio->bi_private = info;
- bio->bi_destructor = dm_rq_bio_destructor;
return 0;
}
@@ -2771,7 +2738,10 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
if (!pools->tio_pool)
goto free_io_pool_and_out;
- pools->bs = bioset_create(pool_size, 0);
+ pools->bs = (type == DM_TYPE_BIO_BASED) ?
+ bioset_create(pool_size, 0) :
+ bioset_create(pool_size,
+ offsetof(struct dm_rq_clone_bio_info, clone));
if (!pools->bs)
goto free_tio_pool_and_out;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 308e87b417e0..95c88012a3b9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -155,32 +155,17 @@ static int start_readonly;
* like bio_clone, but with a local bio set
*/
-static void mddev_bio_destructor(struct bio *bio)
-{
- struct mddev *mddev, **mddevp;
-
- mddevp = (void*)bio;
- mddev = mddevp[-1];
-
- bio_free(bio, mddev->bio_set);
-}
-
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev)
{
struct bio *b;
- struct mddev **mddevp;
if (!mddev || !mddev->bio_set)
return bio_alloc(gfp_mask, nr_iovecs);
- b = bio_alloc_bioset(gfp_mask, nr_iovecs,
- mddev->bio_set);
+ b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
if (!b)
return NULL;
- mddevp = (void*)b;
- mddevp[-1] = mddev;
- b->bi_destructor = mddev_bio_destructor;
return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);
@@ -188,32 +173,10 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev);
struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
struct mddev *mddev)
{
- struct bio *b;
- struct mddev **mddevp;
-
if (!mddev || !mddev->bio_set)
return bio_clone(bio, gfp_mask);
- b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
- mddev->bio_set);
- if (!b)
- return NULL;
- mddevp = (void*)b;
- mddevp[-1] = mddev;
- b->bi_destructor = mddev_bio_destructor;
- __bio_clone(b, bio);
- if (bio_integrity(bio)) {
- int ret;
-
- ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
-
- if (ret < 0) {
- bio_put(b);
- return NULL;
- }
- }
-
- return b;
+ return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);
@@ -5006,8 +4969,7 @@ int md_run(struct mddev *mddev)
}
if (mddev->bio_set == NULL)
- mddev->bio_set = bioset_create(BIO_POOL_SIZE,
- sizeof(struct mddev *));
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index de63a1fc3737..a9e4fa95dfaa 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -422,6 +422,7 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index 29408d46a6d9..57d7674c5013 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -553,14 +553,6 @@ static void iblock_complete_cmd(struct se_cmd *cmd)
kfree(ibr);
}
-static void iblock_bio_destructor(struct bio *bio)
-{
- struct se_cmd *cmd = bio->bi_private;
- struct iblock_dev *ib_dev = cmd->se_dev->dev_ptr;
-
- bio_free(bio, ib_dev->ibd_bio_set);
-}
-
static struct bio *
iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num)
{
@@ -582,7 +574,6 @@ iblock_get_bio(struct se_cmd *cmd, sector_t lba, u32 sg_num)
bio->bi_bdev = ib_dev->ibd_bd;
bio->bi_private = cmd;
- bio->bi_destructor = iblock_bio_destructor;
bio->bi_end_io = &iblock_bio_done;
bio->bi_sector = lba;
return bio;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e85c04b9f61c..a3f28f331b2b 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -70,23 +70,25 @@ static inline int use_bip_pool(unsigned int idx)
}
/**
- * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
* @bio: bio to attach integrity metadata to
* @gfp_mask: Memory allocation mask
* @nr_vecs: Number of integrity metadata scatter-gather elements
- * @bs: bio_set to allocate from
*
* Description: This function prepares a bio for attaching integrity
* metadata. nr_vecs specifies the maximum number of pages containing
* integrity metadata that can be attached.
*/
-struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
- gfp_t gfp_mask,
- unsigned int nr_vecs,
- struct bio_set *bs)
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs)
{
struct bio_integrity_payload *bip;
unsigned int idx = vecs_to_idx(nr_vecs);
+ struct bio_set *bs = bio->bi_pool;
+
+ if (!bs)
+ bs = fs_bio_set;
BUG_ON(bio == NULL);
bip = NULL;
@@ -114,37 +116,22 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
return bip;
}
-EXPORT_SYMBOL(bio_integrity_alloc_bioset);
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
- gfp_t gfp_mask,
- unsigned int nr_vecs)
-{
- return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
-}
EXPORT_SYMBOL(bio_integrity_alloc);
/**
* bio_integrity_free - Free bio integrity payload
* @bio: bio containing bip to be freed
- * @bs: bio_set this bio was allocated from
*
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
-void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct bio_set *bs = bio->bi_pool;
+
+ if (!bs)
+ bs = fs_bio_set;
BUG_ON(bip == NULL);
@@ -730,19 +717,18 @@ EXPORT_SYMBOL(bio_integrity_split);
* @bio: New bio
* @bio_src: Original bio
* @gfp_mask: Memory allocation mask
- * @bs: bio_set to allocate bip from
*
* Description: Called to allocate a bip when cloning a bio
*/
int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
- gfp_t gfp_mask, struct bio_set *bs)
+ gfp_t gfp_mask)
{
struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
struct bio_integrity_payload *bip;
BUG_ON(bip_src == NULL);
- bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs);
+ bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
if (bip == NULL)
return -EIO;
diff --git a/fs/bio.c b/fs/bio.c
index 71072ab99128..9298c65ad9c7 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -55,6 +55,7 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
* IO code that does not need private memory pools.
*/
struct bio_set *fs_bio_set;
+EXPORT_SYMBOL(fs_bio_set);
/*
* Our slab pool management
@@ -233,26 +234,37 @@ fallback:
return bvl;
}
-void bio_free(struct bio *bio, struct bio_set *bs)
+static void __bio_free(struct bio *bio)
{
+ bio_disassociate_task(bio);
+
+ if (bio_integrity(bio))
+ bio_integrity_free(bio);
+}
+
+static void bio_free(struct bio *bio)
+{
+ struct bio_set *bs = bio->bi_pool;
void *p;
- if (bio_has_allocated_vec(bio))
- bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+ __bio_free(bio);
- if (bio_integrity(bio))
- bio_integrity_free(bio, bs);
+ if (bs) {
+ if (bio_has_allocated_vec(bio))
+ bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
- /*
- * If we have front padding, adjust the bio pointer before freeing
- */
- p = bio;
- if (bs->front_pad)
+ /*
+ * If we have front padding, adjust the bio pointer before freeing
+ */
+ p = bio;
p -= bs->front_pad;
- mempool_free(p, bs->bio_pool);
+ mempool_free(p, bs->bio_pool);
+ } else {
+ /* Bio was allocated by bio_kmalloc() */
+ kfree(bio);
+ }
}
-EXPORT_SYMBOL(bio_free);
void bio_init(struct bio *bio)
{
@@ -263,48 +275,85 @@ void bio_init(struct bio *bio)
EXPORT_SYMBOL(bio_init);
/**
+ * bio_reset - reinitialize a bio
+ * @bio: bio to reset
+ *
+ * Description:
+ * After calling bio_reset(), @bio will be in the same state as a freshly
+ * allocated bio returned bio bio_alloc_bioset() - the only fields that are
+ * preserved are the ones that are initialized by bio_alloc_bioset(). See
+ * comment in struct bio.
+ */
+void bio_reset(struct bio *bio)
+{
+ unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
+
+ __bio_free(bio);
+
+ memset(bio, 0, BIO_RESET_BYTES);
+ bio->bi_flags = flags|(1 << BIO_UPTODATE);
+}
+EXPORT_SYMBOL(bio_reset);
+
+/**
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_ mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
* Description:
- * bio_alloc_bioset will try its own mempool to satisfy the allocation.
- * If %__GFP_WAIT is set then we will block on the internal pool waiting
- * for a &struct bio to become free.
+ * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
+ * backed by the @bs's mempool.
*
- * Note that the caller must set ->bi_destructor on successful return
- * of a bio, to do the appropriate freeing of the bio once the reference
- * count drops to zero.
- **/
+ * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
+ * able to allocate a bio. This is due to the mempool guarantees. To make this
+ * work, callers must never allocate more than 1 bio at a time from this pool.
+ * Callers that need to allocate more than 1 bio must always submit the
+ * previously allocated bio for IO before attempting to allocate a new one.
+ * Failure to do so can cause deadlocks under memory pressure.
+ *
+ * RETURNS:
+ * Pointer to new bio on success, NULL on failure.
+ */
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
+ unsigned front_pad;
+ unsigned inline_vecs;
unsigned long idx = BIO_POOL_NONE;
struct bio_vec *bvl = NULL;
struct bio *bio;
void *p;
- p = mempool_alloc(bs->bio_pool, gfp_mask);
+ if (!bs) {
+ if (nr_iovecs > UIO_MAXIOV)
+ return NULL;
+
+ p = kmalloc(sizeof(struct bio) +
+ nr_iovecs * sizeof(struct bio_vec),
+ gfp_mask);
+ front_pad = 0;
+ inline_vecs = nr_iovecs;
+ } else {
+ p = mempool_alloc(bs->bio_pool, gfp_mask);
+ front_pad = bs->front_pad;
+ inline_vecs = BIO_INLINE_VECS;
+ }
+
if (unlikely(!p))
return NULL;
- bio = p + bs->front_pad;
+ bio = p + front_pad;
bio_init(bio);
- if (unlikely(!nr_iovecs))
- goto out_set;
-
- if (nr_iovecs <= BIO_INLINE_VECS) {
- bvl = bio->bi_inline_vecs;
- nr_iovecs = BIO_INLINE_VECS;
- } else {
+ if (nr_iovecs > inline_vecs) {
bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
if (unlikely(!bvl))
goto err_free;
-
- nr_iovecs = bvec_nr_vecs(idx);
+ } else if (nr_iovecs) {
+ bvl = bio->bi_inline_vecs;
}
-out_set:
+
+ bio->bi_pool = bs;
bio->bi_flags |= idx << BIO_POOL_OFFSET;
bio->bi_max_vecs = nr_iovecs;
bio->bi_io_vec = bvl;
@@ -316,80 +365,6 @@ err_free:
}
EXPORT_SYMBOL(bio_alloc_bioset);
-static void bio_fs_destructor(struct bio *bio)
-{
- bio_free(bio, fs_bio_set);
-}
-
-/**
- * bio_alloc - allocate a new bio, memory pool backed
- * @gfp_mask: allocation mask to use
- * @nr_iovecs: number of iovecs
- *
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
- *
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
-
- if (bio)
- bio->bi_destructor = bio_fs_destructor;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_alloc);
-
-static void bio_kmalloc_destructor(struct bio *bio)
-{
- if (bio_integrity(bio))
- bio_integrity_free(bio, fs_bio_set);
- kfree(bio);
-}
-
-/**
- * bio_kmalloc - allocate a bio for I/O using kmalloc()
- * @gfp_mask: the GFP_ mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
- *
- * Description:
- * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
- * %__GFP_WAIT, the allocation is guaranteed to succeed.
- *
- **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
-{
- struct bio *bio;
-
- if (nr_iovecs > UIO_MAXIOV)
- return NULL;
-
- bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
- if (unlikely(!bio))
- return NULL;
-
- bio_init(bio);
- bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bio->bi_inline_vecs;
- bio->bi_destructor = bio_kmalloc_destructor;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_kmalloc);
-
void zero_fill_bio(struct bio *bio)
{
unsigned long flags;
@@ -420,11 +395,8 @@ void bio_put(struct bio *bio)
/*
* last put frees it
*/
- if (atomic_dec_and_test(&bio->bi_cnt)) {
- bio_disassociate_task(bio);
- bio->bi_next = NULL;
- bio->bi_destructor(bio);
- }
+ if (atomic_dec_and_test(&bio->bi_cnt))
+ bio_free(bio);
}
EXPORT_SYMBOL(bio_put);
@@ -466,26 +438,28 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
EXPORT_SYMBOL(__bio_clone);
/**
- * bio_clone - clone a bio
+ * bio_clone_bioset - clone a bio
* @bio: bio to clone
* @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
*
* Like __bio_clone, only also allocates the returned bio
*/
-struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
+ struct bio_set *bs)
{
- struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
+ struct bio *b;
+ b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
if (!b)
return NULL;
- b->bi_destructor = bio_fs_destructor;
__bio_clone(b, bio);
if (bio_integrity(bio)) {
int ret;
- ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set);
+ ret = bio_integrity_clone(b, bio, gfp_mask);
if (ret < 0) {
bio_put(b);
@@ -495,7 +469,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
return b;
}
-EXPORT_SYMBOL(bio_clone);
+EXPORT_SYMBOL(bio_clone_bioset);
/**
* bio_get_nr_vecs - return approx number of vecs
@@ -1501,7 +1475,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
bi->bi_sector + first_sectors);
- BUG_ON(bi->bi_vcnt != 1);
+ BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
BUG_ON(bi->bi_idx != 0);
atomic_set(&bp->cnt, 3);
bp->error = 0;
@@ -1511,17 +1485,22 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
bp->bio2.bi_size -= first_sectors << 9;
bp->bio1.bi_size = first_sectors << 9;
- bp->bv1 = bi->bi_io_vec[0];
- bp->bv2 = bi->bi_io_vec[0];
- bp->bv2.bv_offset += first_sectors << 9;
- bp->bv2.bv_len -= first_sectors << 9;
- bp->bv1.bv_len = first_sectors << 9;
+ if (bi->bi_vcnt != 0) {
+ bp->bv1 = bi->bi_io_vec[0];
+ bp->bv2 = bi->bi_io_vec[0];
+
+ if (bio_is_rw(bi)) {
+ bp->bv2.bv_offset += first_sectors << 9;
+ bp->bv2.bv_len -= first_sectors << 9;
+ bp->bv1.bv_len = first_sectors << 9;
+ }
- bp->bio1.bi_io_vec = &bp->bv1;
- bp->bio2.bi_io_vec = &bp->bv2;
+ bp->bio1.bi_io_vec = &bp->bv1;
+ bp->bio2.bi_io_vec = &bp->bv2;
- bp->bio1.bi_max_vecs = 1;
- bp->bio2.bi_max_vecs = 1;
+ bp->bio1.bi_max_vecs = 1;
+ bp->bio2.bi_max_vecs = 1;
+ }
bp->bio1.bi_end_io = bio_pair_end_1;
bp->bio2.bi_end_io = bio_pair_end_2;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38e721b35d45..b3c1d3dae77d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -116,6 +116,8 @@ EXPORT_SYMBOL(invalidate_bdev);
int set_blocksize(struct block_device *bdev, int size)
{
+ struct address_space *mapping;
+
/* Size must be a power of two, and between 512 and PAGE_SIZE */
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
return -EINVAL;
@@ -124,6 +126,19 @@ int set_blocksize(struct block_device *bdev, int size)
if (size < bdev_logical_block_size(bdev))
return -EINVAL;
+ /* Prevent starting I/O or mapping the device */
+ percpu_down_write(&bdev->bd_block_size_semaphore);
+
+ /* Check that the block device is not memory mapped */
+ mapping = bdev->bd_inode->i_mapping;
+ mutex_lock(&mapping->i_mmap_mutex);
+ if (mapping_mapped(mapping)) {
+ mutex_unlock(&mapping->i_mmap_mutex);
+ percpu_up_write(&bdev->bd_block_size_semaphore);
+ return -EBUSY;
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+
/* Don't change the size if it is same as current */
if (bdev->bd_block_size != size) {
sync_blockdev(bdev);
@@ -131,6 +146,9 @@ int set_blocksize(struct block_device *bdev, int size)
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
+
+ percpu_up_write(&bdev->bd_block_size_semaphore);
+
return 0;
}
@@ -441,6 +459,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
+
+ if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+ kmem_cache_free(bdev_cachep, ei);
+ return NULL;
+ }
+
return &ei->vfs_inode;
}
@@ -449,6 +473,8 @@ static void bdev_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);
+ percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
kmem_cache_free(bdev_cachep, bdi);
}
@@ -1567,6 +1593,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return blkdev_ioctl(bdev, mode, cmd, arg);
}
+ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ ssize_t ret;
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+
+ percpu_down_read(&bdev->bd_block_size_semaphore);
+
+ ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+ percpu_up_read(&bdev->bd_block_size_semaphore);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_read);
+
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -1578,12 +1620,16 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
blk_start_plug(&plug);
+
+ percpu_down_read(&bdev->bd_block_size_semaphore);
+
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
@@ -1592,11 +1638,29 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0 && ret > 0)
ret = err;
}
+
+ percpu_up_read(&bdev->bd_block_size_semaphore);
+
blk_finish_plug(&plug);
+
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_aio_write);
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int ret;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
+
+ percpu_down_read(&bdev->bd_block_size_semaphore);
+
+ ret = generic_file_mmap(file, vma);
+
+ percpu_up_read(&bdev->bd_block_size_semaphore);
+
+ return ret;
+}
+
/*
* Try to release a page associated with block device when the system
* is under memory pressure.
@@ -1627,9 +1691,9 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read = do_sync_read,
.write = do_sync_write,
- .aio_read = generic_file_aio_read,
+ .aio_read = blkdev_aio_read,
.aio_write = blkdev_aio_write,
- .mmap = generic_file_mmap,
+ .mmap = blkdev_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 1585db1aa365..f936cb50dc0d 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -814,8 +814,8 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
struct bio *bio;
if (per_dev != master_dev) {
- bio = bio_kmalloc(GFP_KERNEL,
- master_dev->bio->bi_max_vecs);
+ bio = bio_clone_kmalloc(master_dev->bio,
+ GFP_KERNEL);
if (unlikely(!bio)) {
ORE_DBGMSG(
"Failed to allocate BIO size=%u\n",
@@ -824,7 +824,6 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
goto out;
}
- __bio_clone(bio, master_dev->bio);
bio->bi_bdev = NULL;
bio->bi_next = NULL;
per_dev->offset = master_dev->offset;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 26435890dc87..820e7aaad4fd 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -212,20 +212,41 @@ extern void bio_pair_release(struct bio_pair *dbio);
extern struct bio_set *bioset_create(unsigned int, unsigned int);
extern void bioset_free(struct bio_set *);
-extern struct bio *bio_alloc(gfp_t, unsigned int);
-extern struct bio *bio_kmalloc(gfp_t, unsigned int);
extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
extern void bio_put(struct bio *);
-extern void bio_free(struct bio *, struct bio_set *);
+
+extern void __bio_clone(struct bio *, struct bio *);
+extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
+
+extern struct bio_set *fs_bio_set;
+
+static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+}
+
+static inline struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+ return bio_clone_bioset(bio, gfp_mask, fs_bio_set);
+}
+
+static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+ return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+}
+
+static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
+{
+ return bio_clone_bioset(bio, gfp_mask, NULL);
+
+}
extern void bio_endio(struct bio *, int);
struct request_queue;
extern int bio_phys_segments(struct request_queue *, struct bio *);
-extern void __bio_clone(struct bio *, struct bio *);
-extern struct bio *bio_clone(struct bio *, gfp_t);
-
extern void bio_init(struct bio *);
+extern void bio_reset(struct bio *);
extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
@@ -304,8 +325,6 @@ struct biovec_slab {
struct kmem_cache *slab;
};
-extern struct bio_set *fs_bio_set;
-
/*
* a small number of entries is fine, not going to be performance critical.
* basically we just need to survive
@@ -367,9 +386,31 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
/*
* Check whether this bio carries any data or not. A NULL bio is allowed.
*/
-static inline int bio_has_data(struct bio *bio)
+static inline bool bio_has_data(struct bio *bio)
{
- return bio && bio->bi_io_vec != NULL;
+ if (bio && bio->bi_vcnt)
+ return true;
+
+ return false;
+}
+
+static inline bool bio_is_rw(struct bio *bio)
+{
+ if (!bio_has_data(bio))
+ return false;
+
+ if (bio->bi_rw & REQ_WRITE_SAME)
+ return false;
+
+ return true;
+}
+
+static inline bool bio_mergeable(struct bio *bio)
+{
+ if (bio->bi_rw & REQ_NOMERGE_FLAGS)
+ return false;
+
+ return true;
}
/*
@@ -505,9 +546,8 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
#define bio_integrity(bio) (bio->bi_integrity != NULL)
-extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *);
extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
-extern void bio_integrity_free(struct bio *, struct bio_set *);
+extern void bio_integrity_free(struct bio *);
extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
extern int bio_integrity_enabled(struct bio *bio);
extern int bio_integrity_set_tag(struct bio *, void *, unsigned int);
@@ -517,7 +557,7 @@ extern void bio_integrity_endio(struct bio *, int);
extern void bio_integrity_advance(struct bio *, unsigned int);
extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
extern void bio_integrity_split(struct bio *, struct bio_pair *, int);
-extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t, struct bio_set *);
+extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t);
extern int bioset_integrity_create(struct bio_set *, int);
extern void bioset_integrity_free(struct bio_set *);
extern void bio_integrity_init(void);
@@ -549,13 +589,13 @@ static inline int bio_integrity_prep(struct bio *bio)
return 0;
}
-static inline void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+static inline void bio_integrity_free(struct bio *bio)
{
return;
}
static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
- gfp_t gfp_mask, struct bio_set *bs)
+ gfp_t gfp_mask)
{
return 0;
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 7b7ac9ccec7a..cdf11191e645 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -59,12 +59,6 @@ struct bio {
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
- unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
-
- atomic_t bi_cnt; /* pin count */
-
- struct bio_vec *bi_io_vec; /* the actual vec list */
-
bio_end_io_t *bi_end_io;
void *bi_private;
@@ -80,7 +74,17 @@ struct bio {
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
- bio_destructor_t *bi_destructor; /* destructor */
+ /*
+ * Everything starting with bi_max_vecs will be preserved by bio_reset()
+ */
+
+ unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
+
+ atomic_t bi_cnt; /* pin count */
+
+ struct bio_vec *bi_io_vec; /* the actual vec list */
+
+ struct bio_set *bi_pool;
/*
* We can inline a number of vecs at the end of the bio, to avoid
@@ -90,6 +94,8 @@ struct bio {
struct bio_vec bi_inline_vecs[0];
};
+#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
+
/*
* bio flags
*/
@@ -105,6 +111,13 @@ struct bio {
#define BIO_FS_INTEGRITY 9 /* fs owns integrity data, not block layer */
#define BIO_QUIET 10 /* Make BIO Quiet */
#define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * BIO_POOL_IDX()
+ */
+#define BIO_RESET_BITS 12
+
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
/*
@@ -134,6 +147,7 @@ enum rq_flag_bits {
__REQ_PRIO, /* boost priority in cfq */
__REQ_DISCARD, /* request to discard sectors */
__REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */
+ __REQ_WRITE_SAME, /* write same block many times */
__REQ_NOIDLE, /* don't anticipate more IO after this one */
__REQ_FUA, /* forced unit access */
@@ -172,15 +186,21 @@ enum rq_flag_bits {
#define REQ_META (1 << __REQ_META)
#define REQ_PRIO (1 << __REQ_PRIO)
#define REQ_DISCARD (1 << __REQ_DISCARD)
+#define REQ_WRITE_SAME (1 << __REQ_WRITE_SAME)
#define REQ_NOIDLE (1 << __REQ_NOIDLE)
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
#define REQ_COMMON_MASK \
(REQ_WRITE | REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | \
- REQ_DISCARD | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | REQ_SECURE)
+ REQ_DISCARD | REQ_WRITE_SAME | REQ_NOIDLE | REQ_FLUSH | REQ_FUA | \
+ REQ_SECURE)
#define REQ_CLONE_MASK REQ_COMMON_MASK
+/* This mask is used for both bio and request merge checking */
+#define REQ_NOMERGE_FLAGS \
+ (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+
#define REQ_RAHEAD (1 << __REQ_RAHEAD)
#define REQ_THROTTLED (1 << __REQ_THROTTLED)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4a2ab7c85393..1756001210d2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -270,6 +270,7 @@ struct queue_limits {
unsigned int io_min;
unsigned int io_opt;
unsigned int max_discard_sectors;
+ unsigned int max_write_same_sectors;
unsigned int discard_granularity;
unsigned int discard_alignment;
@@ -540,8 +541,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
#define blk_account_rq(rq) \
(((rq)->cmd_flags & REQ_STARTED) && \
- ((rq)->cmd_type == REQ_TYPE_FS || \
- ((rq)->cmd_flags & REQ_DISCARD)))
+ ((rq)->cmd_type == REQ_TYPE_FS))
#define blk_pm_request(rq) \
((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \
@@ -595,17 +595,39 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
rl->flags &= ~flag;
}
+static inline bool rq_mergeable(struct request *rq)
+{
+ if (rq->cmd_type != REQ_TYPE_FS)
+ return false;
-/*
- * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may
- * it already be started by driver.
- */
-#define RQ_NOMERGE_FLAGS \
- (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD)
-#define rq_mergeable(rq) \
- (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
- (((rq)->cmd_flags & REQ_DISCARD) || \
- (rq)->cmd_type == REQ_TYPE_FS))
+ if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
+ return false;
+
+ return true;
+}
+
+static inline bool blk_check_merge_flags(unsigned int flags1,
+ unsigned int flags2)
+{
+ if ((flags1 & REQ_DISCARD) != (flags2 & REQ_DISCARD))
+ return false;
+
+ if ((flags1 & REQ_SECURE) != (flags2 & REQ_SECURE))
+ return false;
+
+ if ((flags1 & REQ_WRITE_SAME) != (flags2 & REQ_WRITE_SAME))
+ return false;
+
+ return true;
+}
+
+static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
+{
+ if (bio_data(a) == bio_data(b))
+ return true;
+
+ return false;
+}
/*
* q->prep_rq_fn return values
@@ -802,6 +824,28 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
return blk_rq_cur_bytes(rq) >> 9;
}
+static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
+ unsigned int cmd_flags)
+{
+ if (unlikely(cmd_flags & REQ_DISCARD))
+ return q->limits.max_discard_sectors;
+
+ if (unlikely(cmd_flags & REQ_WRITE_SAME))
+ return q->limits.max_write_same_sectors;
+
+ return q->limits.max_sectors;
+}
+
+static inline unsigned int blk_rq_get_max_sectors(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+
+ if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
+ return q->limits.max_hw_sectors;
+
+ return blk_queue_get_max_sectors(q, rq->cmd_flags);
+}
+
/*
* Request issue related functions.
*/
@@ -857,6 +901,8 @@ extern void blk_queue_max_segments(struct request_queue *, unsigned short);
extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
extern void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors);
+extern void blk_queue_max_write_same_sectors(struct request_queue *q,
+ unsigned int max_write_same_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
@@ -987,6 +1033,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct page *page);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask);
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
@@ -1164,6 +1212,16 @@ static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
return queue_discard_zeroes_data(bdev_get_queue(bdev));
}
+static inline unsigned int bdev_write_same(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q)
+ return q->limits.max_write_same_sectors;
+
+ return 0;
+}
+
static inline int queue_dma_alignment(struct request_queue *q)
{
return q ? q->dma_alignment : 511;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c617ed024df8..39f3e12ca752 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -335,6 +335,7 @@ struct inodes_stat_t {
#define BLKDISCARDZEROES _IO(0x12,124)
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
+#define BLKZEROOUT _IO(0x12,127)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
@@ -415,6 +416,7 @@ struct inodes_stat_t {
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
+#include <linux/percpu-rwsem.h>
#include <asm/byteorder.h>
@@ -724,6 +726,8 @@ struct block_device {
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
+ /* A semaphore that prevents I/O while block size is being changed */
+ struct percpu_rw_semaphore bd_block_size_semaphore;
};
/*
@@ -2570,6 +2574,8 @@ extern int generic_segment_checks(const struct iovec *iov,
unsigned long *nr_segs, size_t *count, int access_flags);
/* fs/block_dev.c */
+extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos);
extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
new file mode 100644
index 000000000000..cf80f7e5277f
--- /dev/null
+++ b/include/linux/percpu-rwsem.h
@@ -0,0 +1,89 @@
+#ifndef _LINUX_PERCPU_RWSEM_H
+#define _LINUX_PERCPU_RWSEM_H
+
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/delay.h>
+
+struct percpu_rw_semaphore {
+ unsigned __percpu *counters;
+ bool locked;
+ struct mutex mtx;
+};
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *p)
+{
+ rcu_read_lock();
+ if (unlikely(p->locked)) {
+ rcu_read_unlock();
+ mutex_lock(&p->mtx);
+ this_cpu_inc(*p->counters);
+ mutex_unlock(&p->mtx);
+ return;
+ }
+ this_cpu_inc(*p->counters);
+ rcu_read_unlock();
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *p)
+{
+ /*
+ * On X86, write operation in this_cpu_dec serves as a memory unlock
+ * barrier (i.e. memory accesses may be moved before the write, but
+ * no memory accesses are moved past the write).
+ * On other architectures this may not be the case, so we need smp_mb()
+ * there.
+ */
+#if defined(CONFIG_X86) && (!defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE))
+ barrier();
+#else
+ smp_mb();
+#endif
+ this_cpu_dec(*p->counters);
+}
+
+static inline unsigned __percpu_count(unsigned __percpu *counters)
+{
+ unsigned total = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ total += ACCESS_ONCE(*per_cpu_ptr(counters, cpu));
+
+ return total;
+}
+
+static inline void percpu_down_write(struct percpu_rw_semaphore *p)
+{
+ mutex_lock(&p->mtx);
+ p->locked = true;
+ synchronize_rcu();
+ while (__percpu_count(p->counters))
+ msleep(1);
+ smp_rmb(); /* paired with smp_mb() in percpu_sem_up_read() */
+}
+
+static inline void percpu_up_write(struct percpu_rw_semaphore *p)
+{
+ p->locked = false;
+ mutex_unlock(&p->mtx);
+}
+
+static inline int percpu_init_rwsem(struct percpu_rw_semaphore *p)
+{
+ p->counters = alloc_percpu(unsigned);
+ if (unlikely(!p->counters))
+ return -ENOMEM;
+ p->locked = false;
+ mutex_init(&p->mtx);
+ return 0;
+}
+
+static inline void percpu_free_rwsem(struct percpu_rw_semaphore *p)
+{
+ free_percpu(p->counters);
+ p->counters = NULL; /* catch use after free bugs */
+}
+
+#endif
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 7b600da9a635..4bd6c06eb28e 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -201,6 +201,7 @@ static inline void *sg_virt(struct scatterlist *sg)
return page_address(sg_page(sg)) + sg->offset;
}
+int sg_nents(struct scatterlist *sg);
struct scatterlist *sg_next(struct scatterlist *);
struct scatterlist *sg_last(struct scatterlist *s, unsigned int);
void sg_init_table(struct scatterlist *, unsigned int);
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index e76d85cf3175..3675452b23ca 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -39,6 +39,25 @@ struct scatterlist *sg_next(struct scatterlist *sg)
EXPORT_SYMBOL(sg_next);
/**
+ * sg_nents - return total count of entries in scatterlist
+ * @sg: The scatterlist
+ *
+ * Description:
+ * Allows to know how many entries are in sg, taking into acount
+ * chaining as well
+ *
+ **/
+int sg_nents(struct scatterlist *sg)
+{
+ int nents;
+ for (nents = 0; sg; sg = sg_next(sg))
+ nents++;
+ return nents;
+}
+EXPORT_SYMBOL(sg_nents);
+
+
+/**
* sg_last - return the last scatterlist entry in a list
* @sgl: First entry in the scatterlist
* @nents: Number of entries in the scatterlist