From 1311326cf4755c7ffefd20f576144ecf46d9906b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:49 +0800 Subject: blk-mq: avoid to synchronize rcu inside blk_cleanup_queue() SCSI probing may synchronously create and destroy a lot of request_queues for non-existent devices. Any synchronize_rcu() in queue creation or destroy path may introduce long latency during booting, see detailed description in comment of blk_register_queue(). This patch removes one synchronize_rcu() inside blk_cleanup_queue() for this case, commit c2856ae2f315d75(blk-mq: quiesce queue before freeing queue) needs synchronize_rcu() for implementing blk_mq_quiesce_queue(), but when queue isn't initialized, it isn't necessary to do that since only pass-through requests are involved, no original issue in scsi_execute() at all. Without this patch and previous one, it may take more 20+ seconds for virtio-scsi to complete disk probe. With the two patches, the time becomes less than 100ms. Fixes: c2856ae2f315d75 ("blk-mq: quiesce queue before freeing queue") Reported-by: Andrew Jones Cc: Omar Sandoval Cc: Bart Van Assche Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" Cc: Christoph Hellwig Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index f84a9b7b6f5a..947e7a4abd8c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -762,9 +762,13 @@ void blk_cleanup_queue(struct request_queue *q) * make sure all in-progress dispatch are completed because * blk_freeze_queue() can only complete all requests, and * dispatch may still be in-progress since we dispatch requests - * from more than one contexts + * from more than one contexts. + * + * No need to quiesce queue if it isn't initialized yet since + * blk_freeze_queue() should be enough for cases of passthrough + * request. */ - if (q->mq_ops) + if (q->mq_ops && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ -- cgit v1.2.3 From 1954e9a998d59d08520d7d4bebeafb8f66ba0d0f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 27 Jun 2018 13:09:05 -0700 Subject: block: Document how blk_update_request() handles RQF_SPECIAL_PAYLOAD requests The payload of struct request is stored in the request.bio chain if the RQF_SPECIAL_PAYLOAD flag is not set and in request.special_vec if RQF_SPECIAL_PAYLOAD has been set. However, blk_update_request() iterates over req->bio whether or not RQF_SPECIAL_PAYLOAD has been set. Additionally, the RQF_SPECIAL_PAYLOAD flag is ignored by blk_rq_bytes() which means that the value returned by that function is incorrect if the RQF_SPECIAL_PAYLOAD flag has been set. It is not clear to me whether this is an oversight or whether this happened on purpose. Anyway, document that it is known that both functions ignore RQF_SPECIAL_PAYLOAD. See also commit f9d03f96b988 ("block: improve handling of the magic discard payload"). Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 947e7a4abd8c..2ff8e131a892 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3056,6 +3056,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios); * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both + * blk_rq_bytes() and in blk_update_request(). + * * Return: * %false - this request doesn't have any more data * %true - this request has more data -- cgit v1.2.3 From a79050434b45959f397042080fd1d70ffa9bd9df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 09:32:35 -0600 Subject: blk-rq-qos: refactor out common elements of blk-wbt blkcg-qos is going to do essentially what wbt does, only on a cgroup basis. Break out the common code that will be shared between blkcg-qos and wbt into blk-rq-qos.* so they can both utilize the same infrastructure. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 2ff8e131a892..b33a73bcf2d0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1645,7 +1645,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, rq); + rq_qos_requeue(q, rq); if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); @@ -1752,7 +1752,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); - wbt_done(q->rq_wb, req); + rq_qos_done(q, req); /* * Request may not have originated from ll_rw_blk. if not, @@ -2044,7 +2044,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); + wb_acct = rq_qos_throttle(q, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. @@ -2054,7 +2054,7 @@ get_rq: req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); - __wbt_done(q->rq_wb, wb_acct); + rq_qos_cleanup(q, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; else @@ -2983,7 +2983,7 @@ void blk_start_request(struct request *req) req->throtl_size = blk_rq_sectors(req); #endif req->rq_flags |= RQF_STATS; - wbt_issue(req->q->rq_wb, req); + rq_qos_issue(req->q, req); } BUG_ON(blk_rq_is_complete(req)); @@ -3207,7 +3207,7 @@ void blk_finish_request(struct request *req, blk_status_t error) blk_account_io_done(req, now); if (req->end_io) { - wbt_done(req->q->rq_wb, req); + rq_qos_done(q, req); req->end_io(req, error); } else { if (blk_bidi_rq(req)) -- cgit v1.2.3 From c1c80384c8f47021a01a0cc42894a06bed2b801b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 Jul 2018 11:14:59 -0400 Subject: block: remove external dependency on wbt_flags We don't really need to save this stuff in the core block code, we can just pass the bio back into the helpers later on to derive the same flags and update the rq->wbt_flags appropriately. Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index b33a73bcf2d0..687d7732f23a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -42,7 +42,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" -#include "blk-wbt.h" +#include "blk-rq-qos.h" #ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; @@ -1986,7 +1986,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; struct request *req, *free; unsigned int request_count = 0; - unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -2044,7 +2043,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - wb_acct = rq_qos_throttle(q, bio, q->queue_lock); + rq_qos_throttle(q, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. @@ -2054,7 +2053,7 @@ get_rq: req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); - rq_qos_cleanup(q, wb_acct); + rq_qos_cleanup(q, bio); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; else @@ -2063,7 +2062,7 @@ get_rq: goto out_unlock; } - wbt_track(req, wb_acct); + rq_qos_track(q, req, bio); /* * After dropping the lock and possibly sleeping here, our request -- cgit v1.2.3 From e9a83853302b339e63dea4072f6210e5a88ab4bb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 6 Jul 2018 10:49:35 +0200 Subject: block: Add default switch case to blk_pm_allow_request() to kill warning With gcc 4.9.0 and 7.3.0: block/blk-core.c: In function 'blk_pm_allow_request': block/blk-core.c:2747:2: warning: enumeration value 'RPM_ACTIVE' not handled in switch [-Wswitch] switch (rq->q->rpm_status) { ^ Convert the return statement below the switch() block into a default case to fix this. Fixes: e4f36b249b4d4e75 ("block: fix peeking requests during PM") Signed-off-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 687d7732f23a..c4b57d8806fe 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2753,9 +2753,9 @@ static bool blk_pm_allow_request(struct request *rq) return rq->rq_flags & RQF_PM; case RPM_SUSPENDED: return false; + default: + return true; } - - return true; } #else static bool blk_pm_allow_request(struct request *rq) -- cgit v1.2.3 From ddcf35d397976421a4ec1d0d00fbcc027a8cb034 Mon Sep 17 00:00:00 2001 From: Michael Callahan Date: Wed, 18 Jul 2018 04:47:39 -0700 Subject: block: Add and use op_stat_group() for indexing disk_stat fields. Add and use a new op_stat_group() function for indexing partition stat fields rather than indexing them by rq_data_dir() or bio_data_dir(). This function works similarly to op_is_sync() in that it takes the request::cmd_flags or bio::bi_opf flags and determines which stats should et updated. In addition, the second parameter to generic_start_io_acct() and generic_end_io_acct() is now a REQ_OP rather than simply a read or write bit and it uses op_stat_group() on the parameter to determine the stat group. Note that the partition in_flight counts are not part of the per-cpu statistics and as such are not indexed via this function. It's now indexed by op_is_write(). tj: Refreshed on top of v4.17. Updated to pass around REQ_OP. Signed-off-by: Michael Callahan Signed-off-by: Tejun Heo Cc: Minchan Kim Cc: Dan Williams Cc: Joshua Morris Cc: Philipp Reisner Cc: Matias Bjorling Cc: Kent Overstreet Cc: Alasdair Kergon Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index c4b57d8806fe..03a4ea93a5f3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2702,13 +2702,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes); void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { - const int rw = rq_data_dir(req); + const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; cpu = part_stat_lock(); part = req->part; - part_stat_add(cpu, part, sectors[rw], bytes >> 9); + part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -2722,7 +2722,7 @@ void blk_account_io_done(struct request *req, u64 now) */ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { unsigned long duration; - const int rw = rq_data_dir(req); + const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; @@ -2730,10 +2730,10 @@ void blk_account_io_done(struct request *req, u64 now) cpu = part_stat_lock(); part = req->part; - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); + part_stat_inc(cpu, part, ios[sgrp]); + part_stat_add(cpu, part, ticks[sgrp], duration); part_round_stats(req->q, cpu, part); - part_dec_in_flight(req->q, part, rw); + part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); part_stat_unlock(); -- cgit v1.2.3 From 54648cf1ec2d7f4b6a71767799c45676a138ca24 Mon Sep 17 00:00:00 2001 From: xiao jin Date: Mon, 30 Jul 2018 14:11:12 +0800 Subject: block: blk_init_allocated_queue() set q->fq as NULL in the fail case We find the memory use-after-free issue in __blk_drain_queue() on the kernel 4.14. After read the latest kernel 4.18-rc6 we think it has the same problem. Memory is allocated for q->fq in the blk_init_allocated_queue(). If the elevator init function called with error return, it will run into the fail case to free the q->fq. Then the __blk_drain_queue() uses the same memory after the free of the q->fq, it will lead to the unpredictable event. The patch is to set q->fq as NULL in the fail case of blk_init_allocated_queue(). Fixes: commit 7c94e1c157a2 ("block: introduce blk_flush_queue to drive flush machinery") Cc: Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Signed-off-by: xiao jin Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 03a4ea93a5f3..23cd1b7770e7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1184,6 +1184,7 @@ out_exit_flush_rq: q->exit_rq_fn(q, q->fq->flush_rq); out_free_flush_queue: blk_free_flush_queue(q->fq); + q->fq = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_init_allocated_queue); -- cgit v1.2.3 From b233f127042dba991229e3882c6217c80492f6ef Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 30 Jul 2018 20:02:19 +0800 Subject: block: really disable runtime-pm for blk-mq Runtime PM isn't ready for blk-mq yet, and commit 765e40b675a9 ("block: disable runtime-pm for blk-mq") tried to disable it. Unfortunately, it can't take effect in that way since user space still can switch it on via 'echo auto > /sys/block/sdN/device/power/control'. This patch disables runtime-pm for blk-mq really by pm_runtime_disable() and fixes all kinds of PM related kernel crash. Cc: Tomas Janousek Cc: Przemek Socha Cc: Alan Stern Cc: Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Tested-by: Patrick Steinhardt Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index 23cd1b7770e7..f9ad73d8573c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3770,9 +3770,11 @@ EXPORT_SYMBOL(blk_finish_plug); */ void blk_pm_runtime_init(struct request_queue *q, struct device *dev) { - /* not support for RQF_PM and ->rpm_status in blk-mq yet */ - if (q->mq_ops) + /* Don't enable runtime PM for blk-mq until it is ready */ + if (q->mq_ops) { + pm_runtime_disable(dev); return; + } q->dev = dev; q->rpm_status = RPM_ACTIVE; -- cgit v1.2.3 From 4cf6324b17e96b7b7ab4021c6929500934d46750 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Aug 2018 07:53:37 -0700 Subject: block: Introduce blk_exit_queue() This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Cc: Christoph Hellwig Cc: Ming Lei Cc: Omar Sandoval Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Joseph Qi Cc: Signed-off-by: Jens Axboe --- block/blk-core.c | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) (limited to 'block/blk-core.c') diff --git a/block/blk-core.c b/block/blk-core.c index f9ad73d8573c..49af34bf2119 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -715,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_set_queue_dying); +/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ +void blk_exit_queue(struct request_queue *q) +{ + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + q->elevator = NULL; + } + + /* + * Remove all references to @q from the block cgroup controller before + * restoring @q->queue_lock to avoid that restoring this pointer causes + * e.g. blkcg_print_blkgs() to crash. + */ + blkcg_exit_queue(q); + + /* + * Since the cgroup code may dereference the @q->backing_dev_info + * pointer, only decrease its reference count after having removed the + * association with the block cgroup controller. + */ + bdi_put(q->backing_dev_info); +} + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -784,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q) */ WARN_ON_ONCE(q->kobj.state_in_sysfs); - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q, q->elevator); - q->elevator = NULL; - } - - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. - */ - bdi_put(q->backing_dev_info); + blk_exit_queue(q); if (q->mq_ops) blk_mq_free_queue(q); -- cgit v1.2.3