From d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 29 Aug 2018 10:36:56 -0600
Subject: block: add REQ_HIPRI and inherit it from IOCB_HIPRI

We use IOCB_HIPRI to poll for IO in the caller instead of scheduling.
This information is not available to the lower layers at (or after) IO
submission. The driver may make different queue choices based on the
type of IO, so make the fact that we will poll for this IO known to the
lower layers as well.

Reviewed-by: Hannes Reinecke
Reviewed-by: Keith Busch
Reviewed-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 fs/block_dev.c | 2 ++
 fs/direct-io.c | 2 ++
 fs/iomap.c     | 9 ++++++++-
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a80b4f0ee7c4..c039abfb2052 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -232,6 +232,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
         bio.bi_opf = dio_bio_write_op(iocb);
         task_io_account_write(ret);
     }
+    if (iocb->ki_flags & IOCB_HIPRI)
+        bio.bi_opf |= REQ_HIPRI;
 
     qc = submit_bio(&bio);
     for (;;) {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 722d17c88edb..ea07d5a34317 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1265,6 +1265,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
     } else {
         dio->op = REQ_OP_READ;
     }
+    if (iocb->ki_flags & IOCB_HIPRI)
+        dio->op_flags |= REQ_HIPRI;
 
     /*
      * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
diff --git a/fs/iomap.c b/fs/iomap.c
index 64ce240217a1..f61d13dfdf09 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1553,6 +1553,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
         unsigned len)
 {
     struct page *page = ZERO_PAGE(0);
+    int flags = REQ_SYNC | REQ_IDLE;
     struct bio *bio;
 
     bio = bio_alloc(GFP_KERNEL, 1);
@@ -1561,9 +1562,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
     bio->bi_private = dio;
     bio->bi_end_io = iomap_dio_bio_end_io;
 
+    if (dio->iocb->ki_flags & IOCB_HIPRI)
+        flags |= REQ_HIPRI;
+
     get_page(page);
     __bio_add_page(bio, page, len, 0);
-    bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+    bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
 
     atomic_inc(&dio->ref);
     return submit_bio(bio);
@@ -1662,6 +1666,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
         bio_set_pages_dirty(bio);
     }
 
+    if (dio->iocb->ki_flags & IOCB_HIPRI)
+        bio->bi_opf |= REQ_HIPRI;
+
     iov_iter_advance(dio->submit.iter, n);
 
     dio->size += n;
--
cgit v1.2.3

From 0619317ff8baa2da9238191ad5167ed3618c16d9 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 13 Nov 2018 21:16:54 -0700
Subject: block: add polled wakeup task helper

If we're polling for IO on a device that doesn't use interrupts, then
the IO completion loop (and the wakeup of the waiting task) is done by
the submitting task itself. In that case we don't need to go through
wake_up_process() at all; we can simply mark ourselves as TASK_RUNNING.
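
The blk_wake_io_task() helper that the hunks below switch to is added on
the block side of this patch, which this fs-only listing does not show.
A minimal sketch of what such a helper could look like (its exact home,
presumably include/linux/blkdev.h, is an assumption here):

static inline void blk_wake_io_task(struct task_struct *waiter)
{
    /*
     * Sketch of the helper added on the block side of this patch (not
     * shown in the fs-only diff). If the waiter is the task running
     * right now, it is busy polling for its own completion and is
     * already running, so just fix up the task state instead of doing
     * a full wakeup.
     */
    if (waiter == current)
        __set_current_state(TASK_RUNNING);
    else
        wake_up_process(waiter);
}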
Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 fs/block_dev.c | 4 ++--
 fs/iomap.c     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index c039abfb2052..9fe56672cfe5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -181,7 +181,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
     struct task_struct *waiter = bio->bi_private;
 
     WRITE_ONCE(bio->bi_private, NULL);
-    wake_up_process(waiter);
+    blk_wake_io_task(waiter);
 }
 
 static ssize_t
@@ -305,7 +305,7 @@ static void blkdev_bio_end_io(struct bio *bio)
             struct task_struct *waiter = dio->waiter;
 
             WRITE_ONCE(dio->waiter, NULL);
-            wake_up_process(waiter);
+            blk_wake_io_task(waiter);
         }
     }
 
diff --git a/fs/iomap.c b/fs/iomap.c
index f61d13dfdf09..b0462b363bad 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1525,7 +1525,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
     if (dio->wait_for_completion) {
         struct task_struct *waiter = dio->submit.waiter;
         WRITE_ONCE(dio->submit.waiter, NULL);
-        wake_up_process(waiter);
+        blk_wake_io_task(waiter);
     } else if (dio->flags & IOMAP_DIO_WRITE) {
         struct inode *inode = file_inode(dio->iocb->ki_filp);
 
--
cgit v1.2.3

From d34513d384487e8022f143a3a6b791e6d7f0dad6 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 6 Nov 2018 14:29:11 -0700
Subject: block: for async O_DIRECT, mark us as polling if asked to

Inherit the iocb IOCB_HIPRI flag, and pass on REQ_HIPRI for those
kinds of requests.

Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 fs/block_dev.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9fe56672cfe5..e72b119ede84 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -383,6 +383,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 
         nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
         if (!nr_pages) {
+            if (iocb->ki_flags & IOCB_HIPRI)
+                bio->bi_opf |= REQ_HIPRI;
+
             qc = submit_bio(bio);
             break;
         }
--
cgit v1.2.3

From cb700eb3faa488fbb4b60689adec84032d7cf24a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 15 Nov 2018 19:56:53 -0700
Subject: block: don't plug for aio/O_DIRECT HIPRI IO

Those will go straight to issue inside blk-mq, so don't bother
setting up a block plug for them.
Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 fs/block_dev.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index e72b119ede84..4d79bc80fb41 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -330,6 +330,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
     struct blk_plug plug;
     struct blkdev_dio *dio;
     struct bio *bio;
+    bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
     bool is_read = (iov_iter_rw(iter) == READ), is_sync;
     loff_t pos = iocb->ki_pos;
     blk_qc_t qc = BLK_QC_T_NONE;
@@ -353,7 +354,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
     dio->multi_bio = false;
     dio->should_dirty = is_read && iter_is_iovec(iter);
 
-    blk_start_plug(&plug);
+    /*
+     * Don't plug for HIPRI/polled IO, as those should go straight
+     * to issue
+     */
+    if (!is_poll)
+        blk_start_plug(&plug);
+
     for (;;) {
         bio_set_dev(bio, bdev);
         bio->bi_iter.bi_sector = pos >> 9;
@@ -400,7 +407,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
             submit_bio(bio);
             bio = bio_alloc(GFP_KERNEL, nr_pages);
         }
-    blk_finish_plug(&plug);
+
+    if (!is_poll)
+        blk_finish_plug(&plug);
 
     if (!is_sync)
         return -EIOCBQUEUED;
--
cgit v1.2.3

From 849a370016a5489c49253338507ee6cc4a08df4b Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 16 Nov 2018 08:37:34 -0700
Subject: block: avoid ordered task state change for polled IO

For the core poll helper, the task state setting doesn't need to imply
any atomics, as it's the current task itself that is being modified and
we're not going to sleep. For IRQ-driven IO, the wakeup path already has
the necessary barriers, so we don't need the heavy-handed version of the
task state setting there either.

Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 fs/block_dev.c | 7 +++++--
 fs/iomap.c     | 3 ++-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4d79bc80fb41..64ba27b8b754 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -237,9 +237,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 
     qc = submit_bio(&bio);
     for (;;) {
-        set_current_state(TASK_UNINTERRUPTIBLE);
+        __set_current_state(TASK_UNINTERRUPTIBLE);
+
         if (!READ_ONCE(bio.bi_private))
             break;
+
         if (!(iocb->ki_flags & IOCB_HIPRI) ||
             !blk_poll(bdev_get_queue(bdev), qc))
             io_schedule();
@@ -415,7 +417,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
         return -EIOCBQUEUED;
 
     for (;;) {
-        set_current_state(TASK_UNINTERRUPTIBLE);
+        __set_current_state(TASK_UNINTERRUPTIBLE);
+
         if (!READ_ONCE(dio->waiter))
             break;
 
diff --git a/fs/iomap.c b/fs/iomap.c
index b0462b363bad..c5df035ace6f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1888,7 +1888,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
             return -EIOCBQUEUED;
 
         for (;;) {
-            set_current_state(TASK_UNINTERRUPTIBLE);
+            __set_current_state(TASK_UNINTERRUPTIBLE);
+
             if (!READ_ONCE(dio->submit.waiter))
                 break;
 
--
cgit v1.2.3

From 76dc891395dc61e92e2ff31b6161815ce5eb715b Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Tue, 20 Nov 2018 10:52:36 +0900
Subject: aio: Fix fallback I/O priority value

For cases where the application does not specify aio_reqprio for an aio,
fall back to get_current_ioprio() to obtain the task I/O priority last
set using ioprio_set(), rather than the hardcoded IOPRIO_CLASS_NONE
value.
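
The fs/aio.c hunk below switches to get_current_ioprio(), which this
series adds outside the fs-only listing shown here. A rough sketch of
the intended semantics, assuming the priority set via ioprio_set() is
cached in the task's io_context:

static inline int get_current_ioprio(void)
{
    /*
     * Sketch of the helper added on the block/ioprio side (not shown
     * in the fs-only diff). Tasks that never called ioprio_set() have
     * no io_context priority and keep the old default.
     */
    struct io_context *ioc = current->io_context;

    if (ioc)
        return ioc->ioprio;
    return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
}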
Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Reviewed-by: Adam Manzanares
Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 301e6314183b..b984918be4b7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1441,7 +1441,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
 
         req->ki_ioprio = iocb->aio_reqprio;
     } else
-        req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+        req->ki_ioprio = get_current_ioprio();
 
     ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
     if (unlikely(ret))
--
cgit v1.2.3

From 0a1b8b87d064a47fad9ec475316002da28559207 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 26 Nov 2018 08:24:43 -0700
Subject: block: make blk_poll() take a parameter on whether to spin or not

blk_poll() has always kept spinning until it found an IO. This is fine
for SYNC polling, since we need to find one request we have pending,
but in preparation for ASYNC polling it can be beneficial to just check
if we have any entries available or not.

Existing callers are converted to pass in 'spin == true', to retain
the old behavior.

Signed-off-by: Jens Axboe
---
 fs/block_dev.c | 4 ++--
 fs/direct-io.c | 2 +-
 fs/iomap.c     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 64ba27b8b754..d233a59ea364 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -243,7 +243,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
             break;
 
         if (!(iocb->ki_flags & IOCB_HIPRI) ||
-            !blk_poll(bdev_get_queue(bdev), qc))
+            !blk_poll(bdev_get_queue(bdev), qc, true))
             io_schedule();
     }
     __set_current_state(TASK_RUNNING);
@@ -423,7 +423,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
             break;
 
         if (!(iocb->ki_flags & IOCB_HIPRI) ||
-            !blk_poll(bdev_get_queue(bdev), qc))
+            !blk_poll(bdev_get_queue(bdev), qc, true))
             io_schedule();
     }
     __set_current_state(TASK_RUNNING);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ea07d5a34317..a5a4e5a1423e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -518,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio)
         dio->waiter = current;
         spin_unlock_irqrestore(&dio->bio_lock, flags);
         if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-            !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
+            !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
             io_schedule();
         /* wake up sets us TASK_RUNNING */
         spin_lock_irqsave(&dio->bio_lock, flags);
diff --git a/fs/iomap.c b/fs/iomap.c
index c5df035ace6f..74c1f37f0fd6 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1896,7 +1896,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
             if (!(iocb->ki_flags & IOCB_HIPRI) ||
                 !dio->submit.last_queue ||
                 !blk_poll(dio->submit.last_queue,
-                    dio->submit.cookie))
+                    dio->submit.cookie, true))
                 io_schedule();
         }
         __set_current_state(TASK_RUNNING);
--
cgit v1.2.3

From 531724abc3bfb556c1dd68086cf9cb51f76464e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 30 Nov 2018 09:23:48 +0100
Subject: block: avoid extra bio reference for async O_DIRECT

The bio referencing has a trick that avoids any actual atomic inc/dec
on the reference count until we have to elevate it to > 1. For the
async IO O_DIRECT case we can't use the simple DIO variants, so we use
__blkdev_direct_IO(). It always grabs an extra reference to the bio
after allocation, which means we then enter the slower path of actually
having to do atomic_inc/dec on the count.
We don't need to do that for the async case, unless we end up going multi-bio, in which case we're already doing huge amounts of IO. For the smaller IO case (< BIO_MAX_PAGES), we can do without the extra ref. Based on an earlier patch (and commit log) from Jens Axboe. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index d233a59ea364..e1886cc7048f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -302,7 +302,8 @@ static void blkdev_bio_end_io(struct bio *bio) } dio->iocb->ki_complete(iocb, ret, 0); - bio_put(&dio->bio); + if (dio->multi_bio) + bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; @@ -343,14 +344,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return -EINVAL; bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); - bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) + if (dio->is_sync) { dio->waiter = current; - else + bio_get(bio); + } else { dio->iocb = iocb; + } dio->size = 0; dio->multi_bio = false; @@ -400,6 +402,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) } if (!dio->multi_bio) { + /* + * AIO needs an extra reference to ensure the dio + * structure which is embedded into the first bio + * stays around. + */ + if (!is_sync) + bio_get(bio); dio->multi_bio = true; atomic_set(&dio->ref, 2); } else { -- cgit v1.2.3 From 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Nov 2018 16:44:07 +0100 Subject: aio: clear IOCB_HIPRI No one is going to poll for aio (yet), so we must clear the HIPRI flag, as we would otherwise send it down the poll queues, where no one will be polling for completions. Signed-off-by: Christoph Hellwig IOCB_HIPRI, not RWF_HIPRI. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- fs/aio.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 205390c0c1bb..05647d352bf3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1436,8 +1436,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - fput(req->ki_filp); - return ret; + goto out_fput; } req->ki_ioprio = iocb->aio_reqprio; @@ -1446,7 +1445,13 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - fput(req->ki_filp); + goto out_fput; + + req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ + return 0; + +out_fput: + fput(req->ki_filp); return ret; } -- cgit v1.2.3 From fd42df305f804ddc0d5ac028e944784283b2f92d Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 5 Dec 2018 12:10:34 -0500 Subject: blkcg: associate writeback bios with a blkg One of the goals of this series is to remove a separate reference to the css of the bio. This can and should be accessed via bio_blkcg(). In this patch, wbc_init_bio() now requires a bio to have a device associated with it. 
Signed-off-by: Dennis Zhou
Reviewed-by: Josef Bacik
Acked-by: Tejun Heo
Signed-off-by: Jens Axboe
---
 fs/buffer.c       | 10 +++++-----
 fs/ext4/page-io.c |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 1286c2b95498..d60d61e8ed7d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
      */
     bio = bio_alloc(GFP_NOIO, 1);
 
-    if (wbc) {
-        wbc_init_bio(wbc, bio);
-        wbc_account_io(wbc, bh->b_page, bh->b_size);
-    }
-
     bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
     bio_set_dev(bio, bh->b_bdev);
     bio->bi_write_hint = write_hint;
@@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
         op_flags |= REQ_PRIO;
     bio_set_op_attrs(bio, op, op_flags);
 
+    if (wbc) {
+        wbc_init_bio(wbc, bio);
+        wbc_account_io(wbc, bh->b_page, bh->b_size);
+    }
+
     submit_bio(bio);
     return 0;
 }
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index db7590178dfc..2aa62d58d8dd 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
 
     bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
     if (!bio)
         return -ENOMEM;
-    wbc_init_bio(io->io_wbc, bio);
     bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
     bio_set_dev(bio, bh->b_bdev);
     bio->bi_end_io = ext4_end_bio;
     bio->bi_private = ext4_get_io_end(io->io_end);
     io->io_bio = bio;
     io->io_next_block = bh->b_blocknr;
+    wbc_init_bio(io->io_wbc, bio);
     return 0;
 }
--
cgit v1.2.3