diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 10:23:25 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 10:23:25 -0700 |
commit | 73ba2fb33c492916853dfe63e3b3163da0be661d (patch) | |
tree | c2fda8ca1273744d2e884d24189a15ac1a7d63c2 /block | |
parent | 958f338e96f874a0d29442396d6adf9c1e17aa2d (diff) | |
parent | b86d865cb1cae1e61527ea0b8977078bbf694328 (diff) | |
download | linux-73ba2fb33c492916853dfe63e3b3163da0be661d.tar.gz linux-73ba2fb33c492916853dfe63e3b3163da0be661d.tar.bz2 linux-73ba2fb33c492916853dfe63e3b3163da0be661d.zip |
Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"First pull request for this merge window, there will also be a
followup request with some stragglers.
This pull request contains:
- Fix for a thundering heard issue in the wbt block code (Anchal
Agarwal)
- A few NVMe pull requests:
* Improved tracepoints (Keith)
* Larger inline data support for RDMA (Steve Wise)
* RDMA setup/teardown fixes (Sagi)
* Effects log suppor for NVMe target (Chaitanya Kulkarni)
* Buffered IO suppor for NVMe target (Chaitanya Kulkarni)
* TP4004 (ANA) support (Christoph)
* Various NVMe fixes
- Block io-latency controller support. Much needed support for
properly containing block devices. (Josef)
- Series improving how we handle sense information on the stack
(Kees)
- Lightnvm fixes and updates/improvements (Mathias/Javier et al)
- Zoned device support for null_blk (Matias)
- AIX partition fixes (Mauricio Faria de Oliveira)
- DIF checksum code made generic (Max Gurtovoy)
- Add support for discard in iostats (Michael Callahan / Tejun)
- Set of updates for BFQ (Paolo)
- Removal of async write support for bsg (Christoph)
- Bio page dirtying and clone fixups (Christoph)
- Set of bcache fix/changes (via Coly)
- Series improving blk-mq queue setup/teardown speed (Ming)
- Series improving merging performance on blk-mq (Ming)
- Lots of other fixes and cleanups from a slew of folks"
* tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block: (190 commits)
blkcg: Make blkg_root_lookup() work for queues in bypass mode
bcache: fix error setting writeback_rate through sysfs interface
null_blk: add lock drop/acquire annotation
Blk-throttle: reduce tail io latency when iops limit is enforced
block: paride: pd: mark expected switch fall-throughs
block: Ensure that a request queue is dissociated from the cgroup controller
block: Introduce blk_exit_queue()
blkcg: Introduce blkg_root_lookup()
block: Remove two superfluous #include directives
blk-mq: count the hctx as active before allocating tag
block: bvec_nr_vecs() returns value for wrong slab
bcache: trivial - remove tailing backslash in macro BTREE_FLAG
bcache: make the pr_err statement used for ENOENT only in sysfs_attatch section
bcache: set max writeback rate when I/O request is idle
bcache: add code comments for bset.c
bcache: fix mistaken comments in request.c
bcache: fix mistaken code comments in bcache.h
bcache: add a comment in super.c
bcache: avoid unncessary cache prefetch bch_btree_node_get()
bcache: display rate debug parameters to 0 when writeback is not running
...
Diffstat (limited to 'block')
40 files changed, 2563 insertions, 1220 deletions
diff --git a/block/Kconfig b/block/Kconfig index eb50fd4977c2..1f2469a0123c 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -149,6 +149,18 @@ config BLK_WBT dynamically on an algorithm loosely based on CoDel, factoring in the realtime performance of the disk. +config BLK_CGROUP_IOLATENCY + bool "Enable support for latency based cgroup IO protection" + depends on BLK_CGROUP=y + default n + ---help--- + Enabling this option enables the .latency interface for IO throttling. + The IO controller will attempt to maintain average IO latencies below + the configured latency target, throttling anybody with a higher latency + target than the victimized group. + + Note, this is an experimental interface and could be changed someday. + config BLK_WBT_SQ bool "Single queue writeback throttling" default n @@ -177,6 +189,10 @@ config BLK_DEBUG_FS Unless you are building a kernel for a tiny system, you should say Y here. +config BLK_DEBUG_FS_ZONED + bool + default BLK_DEBUG_FS && BLK_DEV_ZONED + config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" ---help--- diff --git a/block/Makefile b/block/Makefile index 6a56303b9925..572b33f32c07 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o partition-generic.o ioprio.o \ - badblocks.o partitions/ + badblocks.o partitions/ blk-rq-qos.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o @@ -17,6 +17,7 @@ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o +obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o @@ -34,4 +35,5 @@ obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 495b9ddb3355..41d9036b1822 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) * The following function returns true if every queue must receive the * same share of the throughput (this condition is used when deciding * whether idling may be disabled, see the comments in the function - * bfq_bfqq_may_idle()). + * bfq_better_to_idle()). * * Such a scenario occurs when: * 1) all active queues have the same weight, @@ -742,8 +742,9 @@ inc_counter: * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */ -void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, - struct rb_root *root) +void __bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) { if (!entity->weight_counter) return; @@ -760,6 +761,43 @@ reset_entity_pointer: } /* + * Invoke __bfq_weights_tree_remove on bfqq and all its inactive + * parent entities. + */ +void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = bfqq->entity.parent; + + __bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->my_sched_data; + + if (sd->next_in_service || sd->in_service_entity) { + /* + * entity is still active, because either + * next_in_service or in_service_entity is not + * NULL (see the comments on the definition of + * next_in_service for details on why + * in_service_entity must be checked too). + * + * As a consequence, the weight of entity is + * not to be removed. In addition, if entity + * is active, then its parent entities are + * active as well, and thus their weights are + * not to be removed either. In the end, this + * loop must stop here. + */ + break; + } + __bfq_weights_tree_remove(bfqd, entity, + &bfqd->group_weights_tree); + } +} + +/* * Return expired entry, or NULL to just start from scratch in rbtree. */ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, @@ -1344,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, * remain unchanged after such an expiration, and the * following statement therefore assigns to * entity->budget the remaining budget on such an - * expiration. For clarity, entity->service is not - * updated on expiration in any case, and, in normal - * operation, is reset only when bfqq is selected for - * service (see bfq_get_next_queue). + * expiration. */ entity->budget = min_t(unsigned long, bfq_bfqq_budget_left(bfqq), bfqq->max_budget); + /* + * At this point, we have used entity->service to get + * the budget left (needed for updating + * entity->budget). Thus we finally can, and have to, + * reset entity->service. The latter must be reset + * because bfqq would otherwise be charged again for + * the service it has received during its previous + * service slot(s). + */ + entity->service = 0; + return true; } + /* + * We can finally complete expiration, by setting service to 0. + */ + entity->service = 0; entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(bfqq->next_rq, bfqq)); bfq_clear_bfqq_non_blocking_wait_rq(bfqq); @@ -3233,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, ref = bfqq->ref; __bfq_bfqq_expire(bfqd, bfqq); + if (ref == 1) /* bfqq is gone, no more actions on it */ + return; + /* mark bfqq as waiting a request only if a bic still points to it */ - if (ref > 1 && !bfq_bfqq_busy(bfqq) && + if (!bfq_bfqq_busy(bfqq) && reason != BFQQE_BUDGET_TIMEOUT && - reason != BFQQE_BUDGET_EXHAUSTED) + reason != BFQQE_BUDGET_EXHAUSTED) { bfq_mark_bfqq_non_blocking_wait_rq(bfqq); + /* + * Not setting service to 0, because, if the next rq + * arrives in time, the queue will go on receiving + * service with this same budget (as if it never expired) + */ + } else + entity->service = 0; } /* @@ -3295,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) * issues taken into account are not trivial. We discuss these issues * individually while introducing the variables. */ -static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) +static bool bfq_better_to_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; bool rot_without_queueing = @@ -3528,19 +3588,19 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) } /* - * If the in-service queue is empty but the function bfq_bfqq_may_idle + * If the in-service queue is empty but the function bfq_better_to_idle * returns true, then: * 1) the queue must remain in service and cannot be expired, and * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments on the function bfq_bfqq_may_idle for the reasons + * See the comments on the function bfq_better_to_idle for the reasons * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself + * and preserve service guarantees when bfq_better_to_idle itself * returns true. */ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); } /* @@ -3559,8 +3619,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + /* + * Do not expire bfqq for budget timeout if bfqq may be about + * to enjoy device idling. The reason why, in this case, we + * prevent bfqq from expiring is the same as in the comments + * on the case where bfq_bfqq_must_idle() returns true, in + * bfq_completed_request(). + */ if (bfq_may_expire_for_budg_timeout(bfqq) && - !bfq_bfqq_wait_request(bfqq) && !bfq_bfqq_must_idle(bfqq)) goto expire; @@ -3620,7 +3686,7 @@ check_queue: * may idle after their completion, then keep it anyway. */ if (bfq_bfqq_wait_request(bfqq) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { bfqq = NULL; goto keep_queue; } @@ -4582,8 +4648,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) */ bfqq->budget_timeout = jiffies; - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); + bfq_weights_tree_remove(bfqd, bfqq); } now_ns = ktime_get_ns(); @@ -4637,15 +4702,39 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) * or if we want to idle in case it has no pending requests. */ if (bfqd->in_service_queue == bfqq) { - if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); + if (bfq_bfqq_must_idle(bfqq)) { + if (bfqq->dispatched == 0) + bfq_arm_slice_timer(bfqd); + /* + * If we get here, we do not expire bfqq, even + * if bfqq was in budget timeout or had no + * more requests (as controlled in the next + * conditional instructions). The reason for + * not expiring bfqq is as follows. + * + * Here bfqq->dispatched > 0 holds, but + * bfq_bfqq_must_idle() returned true. This + * implies that, even if no request arrives + * for bfqq before bfqq->dispatched reaches 0, + * bfqq will, however, not be expired on the + * completion event that causes bfqq->dispatch + * to reach zero. In contrast, on this event, + * bfqq will start enjoying device idling + * (I/O-dispatch plugging). + * + * But, if we expired bfqq here, bfqq would + * not have the chance to enjoy device idling + * when bfqq->dispatched finally reaches + * zero. This would expose bfqq to violation + * of its reserved service guarantees. + */ return; } else if (bfq_may_expire_for_budg_timeout(bfqq)) bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_TIMEOUT); else if (RB_EMPTY_ROOT(&bfqq->sort_list) && (bfqq->dispatched == 0 || - !bfq_bfqq_may_idle(bfqq))) + !bfq_better_to_idle(bfqq))) bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_NO_MORE_REQUESTS); } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 0f712e03b035..a8a2e5aca4d4 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, struct rb_root *root); -void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, - struct rb_root *root); +void __bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); +void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq); void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4498c43245e2..dbc07b456059 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); #ifdef CONFIG_BFQ_GROUP_IOSCHED - else /* bfq_group */ - bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); - if (bfqg != bfqd->root_group) bfqg->active_entities++; #endif @@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, if (bfqq) list_del(&bfqq->bfqq_list); #ifdef CONFIG_BFQ_GROUP_IOSCHED - else /* bfq_group */ - bfq_weights_tree_remove(bfqd, entity, - &bfqd->group_weights_tree); - if (bfqg != bfqd->root_group) bfqg->active_entities--; #endif @@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (prev_weight != new_weight) { root = bfqq ? &bfqd->queue_weights_tree : &bfqd->group_weights_tree; - bfq_weights_tree_remove(bfqd, entity, root); + __bfq_weights_tree_remove(bfqd, entity, root); } entity->weight = new_weight; /* @@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * one of its children receives a new request. * * Basically, this function updates the timestamps of entity and - * inserts entity into its active tree, ater possibly extracting it + * inserts entity into its active tree, after possibly extracting it * from its idle tree. */ static void __bfq_activate_entity(struct bfq_entity *entity, @@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity, entity->on_st = true; } +#ifdef BFQ_GROUP_IOSCHED_ENABLED + if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_weights_tree_add(bfqg->bfqd, entity, + &bfqd->group_weights_tree); + } +#endif + bfq_update_fin_time_enqueue(entity, st, backshifted); } @@ -1542,12 +1545,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) sd->in_service_entity = entity; /* - * Reset the accumulator of the amount of service that - * the entity is about to receive. - */ - entity->service = 0; - - /* * If entity is no longer a candidate for next * service, then it must be extracted from its active * tree, so as to make sure that it won't be @@ -1664,8 +1661,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->busy_queues--; if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); + bfq_weights_tree_remove(bfqd, bfqq); if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index add7c7c85335..67b5fb861a51 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -160,28 +160,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, EXPORT_SYMBOL(bio_integrity_add_page); /** - * bio_integrity_intervals - Return number of integrity intervals for a bio - * @bi: blk_integrity profile for device - * @sectors: Size of the bio in 512-byte sectors - * - * Description: The block layer calculates everything in 512 byte - * sectors but integrity metadata is done in terms of the data integrity - * interval size of the storage device. Convert the block layer sectors - * to the appropriate number of integrity intervals. - */ -static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, - unsigned int sectors) -{ - return sectors >> (bi->interval_exp - 9); -} - -static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, - unsigned int sectors) -{ - return bio_integrity_intervals(bi, sectors) * bi->tuple_size; -} - -/** * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for * @proc_iter: iterator to process diff --git a/block/bio.c b/block/bio.c index 047c5dca6d90..b12966e415d3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -28,9 +28,11 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" +#include "blk-rq-qos.h" /* * Test patch to inline a certain number of bi_io_vec's inside the bio @@ -156,7 +158,7 @@ out: unsigned int bvec_nr_vecs(unsigned short idx) { - return bvec_slabs[idx].nr_vecs; + return bvec_slabs[--idx].nr_vecs; } void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) @@ -645,83 +647,6 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) EXPORT_SYMBOL(bio_clone_fast); /** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) -{ - struct bvec_iter iter; - struct bio_vec bv; - struct bio *bio; - - /* - * Pre immutable biovecs, __bio_clone() used to just do a memcpy from - * bio_src->bi_io_vec to bio->bi_io_vec. - * - * We can't do that anymore, because: - * - * - The point of cloning the biovec is to produce a bio with a biovec - * the caller can modify: bi_idx and bi_bvec_done should be 0. - * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if - * we tried to clone the whole thing bio_alloc_bioset() would fail. - * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. - * - * - Lastly, bi_vcnt should not be looked at or relied upon by code - * that does not own the bio - reason being drivers don't use it for - * iterating over the biovec anymore, so expecting it to be kept up - * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work on - * __bio_clone_fast() anyways. - */ - - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); - if (!bio) - return NULL; - bio->bi_disk = bio_src->bi_disk; - bio->bi_opf = bio_src->bi_opf; - bio->bi_write_hint = bio_src->bi_write_hint; - bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - break; - case REQ_OP_WRITE_SAME: - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - break; - default: - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - break; - } - - if (bio_integrity(bio_src)) { - int ret; - - ret = bio_integrity_clone(bio, bio_src, gfp_mask); - if (ret < 0) { - bio_put(bio); - return NULL; - } - } - - bio_clone_blkcg_association(bio, bio_src); - - return bio; -} -EXPORT_SYMBOL(bio_clone_bioset); - -/** * bio_add_pc_page - attempt to add page to bio * @q: the target queue * @bio: destination bio @@ -1661,10 +1586,8 @@ void bio_set_pages_dirty(struct bio *bio) int i; bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page && !PageCompound(page)) - set_page_dirty_lock(page); + if (!PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); @@ -1674,19 +1597,15 @@ static void bio_release_pages(struct bio *bio) struct bio_vec *bvec; int i; - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (page) - put_page(page); - } + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); } /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on - * the BIO and the offending pages and re-dirty the pages in process context. + * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will run one put_page() against each page and will run one @@ -1704,78 +1623,70 @@ static struct bio *bio_dirty_list; */ static void bio_dirty_fn(struct work_struct *work) { - unsigned long flags; - struct bio *bio; + struct bio *bio, *next; - spin_lock_irqsave(&bio_dirty_lock, flags); - bio = bio_dirty_list; + spin_lock_irq(&bio_dirty_lock); + next = bio_dirty_list; bio_dirty_list = NULL; - spin_unlock_irqrestore(&bio_dirty_lock, flags); + spin_unlock_irq(&bio_dirty_lock); - while (bio) { - struct bio *next = bio->bi_private; + while ((bio = next) != NULL) { + next = bio->bi_private; bio_set_pages_dirty(bio); bio_release_pages(bio); bio_put(bio); - bio = next; } } void bio_check_pages_dirty(struct bio *bio) { struct bio_vec *bvec; - int nr_clean_pages = 0; + unsigned long flags; int i; bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - - if (PageDirty(page) || PageCompound(page)) { - put_page(page); - bvec->bv_page = NULL; - } else { - nr_clean_pages++; - } + if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) + goto defer; } - if (nr_clean_pages) { - unsigned long flags; - - spin_lock_irqsave(&bio_dirty_lock, flags); - bio->bi_private = bio_dirty_list; - bio_dirty_list = bio; - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } else { - bio_put(bio); - } + bio_release_pages(bio); + bio_put(bio); + return; +defer: + spin_lock_irqsave(&bio_dirty_lock, flags); + bio->bi_private = bio_dirty_list; + bio_dirty_list = bio; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); -void generic_start_io_acct(struct request_queue *q, int rw, +void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part) { + const int sgrp = op_stat_group(op); int cpu = part_stat_lock(); part_round_stats(q, cpu, part); - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, sectors[rw], sectors); - part_inc_in_flight(q, part, rw); + part_stat_inc(cpu, part, ios[sgrp]); + part_stat_add(cpu, part, sectors[sgrp], sectors); + part_inc_in_flight(q, part, op_is_write(op)); part_stat_unlock(); } EXPORT_SYMBOL(generic_start_io_acct); -void generic_end_io_acct(struct request_queue *q, int rw, +void generic_end_io_acct(struct request_queue *q, int req_op, struct hd_struct *part, unsigned long start_time) { unsigned long duration = jiffies - start_time; + const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - part_stat_add(cpu, part, ticks[rw], duration); + part_stat_add(cpu, part, ticks[sgrp], duration); part_round_stats(q, cpu, part); - part_dec_in_flight(q, part, rw); + part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); } @@ -1834,6 +1745,9 @@ again: if (!bio_integrity_endio(bio)) return; + if (bio->bi_disk) + rq_qos_done_bio(bio->bi_disk->queue, bio); + /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that @@ -2042,6 +1956,30 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP +#ifdef CONFIG_MEMCG +/** + * bio_associate_blkcg_from_page - associate a bio with the page's blkcg + * @bio: target bio + * @page: the page to lookup the blkcg from + * + * Associate @bio with the blkcg from @page's owning memcg. This works like + * every other associate function wrt references. + */ +int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) +{ + struct cgroup_subsys_state *blkcg_css; + + if (unlikely(bio->bi_css)) + return -EBUSY; + if (!page->mem_cgroup) + return 0; + blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, + &io_cgrp_subsys); + bio->bi_css = blkcg_css; + return 0; +} +#endif /* CONFIG_MEMCG */ + /** * bio_associate_blkcg - associate a bio with the specified blkcg * @bio: target bio @@ -2065,6 +2003,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) EXPORT_SYMBOL_GPL(bio_associate_blkcg); /** + * bio_associate_blkg - associate a bio with the specified blkg + * @bio: target bio + * @blkg: the blkg to associate + * + * Associate @bio with the blkg specified by @blkg. This is the queue specific + * blkcg information associated with the @bio, a reference will be taken on the + * @blkg and will be freed when the bio is freed. + */ +int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) +{ + if (unlikely(bio->bi_blkg)) + return -EBUSY; + blkg_get(blkg); + bio->bi_blkg = blkg; + return 0; +} + +/** * bio_disassociate_task - undo bio_associate_current() * @bio: target bio */ @@ -2078,6 +2034,10 @@ void bio_disassociate_task(struct bio *bio) css_put(bio->bi_css); bio->bi_css = NULL; } + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } } /** diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index eb85cb87c40f..694595b29b8f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -27,6 +27,7 @@ #include <linux/atomic.h> #include <linux/ctype.h> #include <linux/blk-cgroup.h> +#include <linux/tracehook.h> #include "blk.h" #define MAX_KEY_LEN 100 @@ -50,6 +51,8 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ +static bool blkcg_debug_stats = false; + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -564,6 +567,7 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, [BLKG_RWSTAT_WRITE] = "Write", [BLKG_RWSTAT_SYNC] = "Sync", [BLKG_RWSTAT_ASYNC] = "Async", + [BLKG_RWSTAT_DISCARD] = "Discard", }; const char *dname = blkg_dev_name(pd->blkg); u64 v; @@ -577,7 +581,8 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -954,30 +959,77 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { const char *dname; + char *buf; struct blkg_rwstat rwstat; - u64 rbytes, wbytes, rios, wios; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + size_t size = seq_get_buf(sf, &buf), off = 0; + int i; + bool has_stats = false; dname = blkg_dev_name(blkg); if (!dname) continue; + /* + * Hooray string manipulation, count is the size written NOT + * INCLUDING THE \0, so size is now count+1 less than what we + * had before, but we want to start writing the next bit from + * the \0 so we only add count to buf. + */ + off += scnprintf(buf+off, size-off, "%s ", dname); + spin_lock_irq(blkg->q->queue_lock); rwstat = blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_bytes)); rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); rwstat = blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_ios)); rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); spin_unlock_irq(blkg->q->queue_lock); - if (rbytes || wbytes || rios || wios) - seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", - dname, rbytes, wbytes, rios, wios); + if (rbytes || wbytes || rios || wios) { + has_stats = true; + off += scnprintf(buf+off, size-off, + "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (!blkcg_debug_stats) + goto next; + + if (atomic_read(&blkg->use_delay)) { + has_stats = true; + off += scnprintf(buf+off, size-off, + " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + (unsigned long long)atomic64_read(&blkg->delay_nsec)); + } + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + size_t written; + + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + + written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); + if (written) + has_stats = true; + off += written; + } +next: + if (has_stats) { + off += scnprintf(buf+off, size-off, "\n"); + seq_commit(sf, off); + } } rcu_read_unlock(); @@ -1191,6 +1243,14 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); + ret = blk_iolatency_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + return ret; + } + ret = blk_throtl_init(q); if (ret) { spin_lock_irq(q->queue_lock); @@ -1288,6 +1348,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&blkcg_pol_mutex); } +static void blkcg_exit(struct task_struct *tsk) +{ + if (tsk->throttle_queue) + blk_put_queue(tsk->throttle_queue); + tsk->throttle_queue = NULL; +} + struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, @@ -1297,6 +1364,7 @@ struct cgroup_subsys io_cgrp_subsys = { .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", + .exit = blkcg_exit, #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1547,3 +1615,209 @@ out_unlock: mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); + +/* + * Scale the accumulated delay based on how long it has been since we updated + * the delay. We only call this when we are adding delay, in case it's been a + * while since we added delay, and when we are checking to see if we need to + * delay a task, to account for any delays that may have occurred. + */ +static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) +{ + u64 old = atomic64_read(&blkg->delay_start); + + /* + * We only want to scale down every second. The idea here is that we + * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain + * time window. We only want to throttle tasks for recent delay that + * has occurred, in 1 second time windows since that's the maximum + * things can be throttled. We save the current delay window in + * blkg->last_delay so we know what amount is still left to be charged + * to the blkg from this point onward. blkg->last_use keeps track of + * the use_delay counter. The idea is if we're unthrottling the blkg we + * are ok with whatever is happening now, and we can take away more of + * the accumulated delay as we've already throttled enough that + * everybody is happy with their IO latencies. + */ + if (time_before64(old + NSEC_PER_SEC, now) && + atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { + u64 cur = atomic64_read(&blkg->delay_nsec); + u64 sub = min_t(u64, blkg->last_delay, now - old); + int cur_use = atomic_read(&blkg->use_delay); + + /* + * We've been unthrottled, subtract a larger chunk of our + * accumulated delay. + */ + if (cur_use < blkg->last_use) + sub = max_t(u64, sub, blkg->last_delay >> 1); + + /* + * This shouldn't happen, but handle it anyway. Our delay_nsec + * should only ever be growing except here where we subtract out + * min(last_delay, 1 second), but lord knows bugs happen and I'd + * rather not end up with negative numbers. + */ + if (unlikely(cur < sub)) { + atomic64_set(&blkg->delay_nsec, 0); + blkg->last_delay = 0; + } else { + atomic64_sub(sub, &blkg->delay_nsec); + blkg->last_delay = cur - sub; + } + blkg->last_use = cur_use; + } +} + +/* + * This is called when we want to actually walk up the hierarchy and check to + * see if we need to throttle, and then actually throttle if there is some + * accumulated delay. This should only be called upon return to user space so + * we're not holding some lock that would induce a priority inversion. + */ +static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) +{ + u64 now = ktime_to_ns(ktime_get()); + u64 exp; + u64 delay_nsec = 0; + int tok; + + while (blkg->parent) { + if (atomic_read(&blkg->use_delay)) { + blkcg_scale_delay(blkg, now); + delay_nsec = max_t(u64, delay_nsec, + atomic64_read(&blkg->delay_nsec)); + } + blkg = blkg->parent; + } + + if (!delay_nsec) + return; + + /* + * Let's not sleep for all eternity if we've amassed a huge delay. + * Swapping or metadata IO can accumulate 10's of seconds worth of + * delay, and we want userspace to be able to do _something_ so cap the + * delays at 1 second. If there's 10's of seconds worth of delay then + * the tasks will be delayed for 1 second for every syscall. + */ + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); + + /* + * TODO: the use_memdelay flag is going to be for the upcoming psi stuff + * that hasn't landed upstream yet. Once that stuff is in place we need + * to do a psi_memstall_enter/leave if memdelay is set. + */ + + exp = ktime_add_ns(now, delay_nsec); + tok = io_schedule_prepare(); + do { + __set_current_state(TASK_KILLABLE); + if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) + break; + } while (!fatal_signal_pending(current)); + io_schedule_finish(tok); +} + +/** + * blkcg_maybe_throttle_current - throttle the current task if it has been marked + * + * This is only called if we've been marked with set_notify_resume(). Obviously + * we can be set_notify_resume() for reasons other than blkcg throttling, so we + * check to see if current->throttle_queue is set and if not this doesn't do + * anything. This should only ever be called by the resume code, it's not meant + * to be called by people willy-nilly as it will actually do the work to + * throttle the task if it is setup for throttling. + */ +void blkcg_maybe_throttle_current(void) +{ + struct request_queue *q = current->throttle_queue; + struct cgroup_subsys_state *css; + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool use_memdelay = current->use_memdelay; + + if (!q) + return; + + current->throttle_queue = NULL; + current->use_memdelay = false; + + rcu_read_lock(); + css = kthread_blkcg(); + if (css) + blkcg = css_to_blkcg(css); + else + blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); + + if (!blkcg) + goto out; + blkg = blkg_lookup(blkcg, q); + if (!blkg) + goto out; + blkg = blkg_try_get(blkg); + if (!blkg) + goto out; + rcu_read_unlock(); + + blkcg_maybe_throttle_blkg(blkg, use_memdelay); + blkg_put(blkg); + blk_put_queue(q); + return; +out: + rcu_read_unlock(); + blk_put_queue(q); +} +EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current); + +/** + * blkcg_schedule_throttle - this task needs to check for throttling + * @q - the request queue IO was submitted on + * @use_memdelay - do we charge this to memory delay for PSI + * + * This is called by the IO controller when we know there's delay accumulated + * for the blkg for this task. We do not pass the blkg because there are places + * we call this that may not have that information, the swapping code for + * instance will only have a request_queue at that point. This set's the + * notify_resume for the task to check and see if it requires throttling before + * returning to user space. + * + * We will only schedule once per syscall. You can call this over and over + * again and it will only do the check once upon return to user space, and only + * throttle once. If the task needs to be throttled again it'll need to be + * re-set at the next time we see the task. + */ +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +{ + if (unlikely(current->flags & PF_KTHREAD)) + return; + + if (!blk_get_queue(q)) + return; + + if (current->throttle_queue) + blk_put_queue(current->throttle_queue); + current->throttle_queue = q; + if (use_memdelay) + current->use_memdelay = use_memdelay; + set_notify_resume(current); +} +EXPORT_SYMBOL_GPL(blkcg_schedule_throttle); + +/** + * blkcg_add_delay - add delay to this blkg + * @now - the current time in nanoseconds + * @delta - how many nanoseconds of delay to add + * + * Charge @delta to the blkg's current delay accumulation. This is used to + * throttle tasks if an IO controller thinks we need more throttling. + */ +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) +{ + blkcg_scale_delay(blkg, now); + atomic64_add(delta, &blkg->delay_nsec); +} +EXPORT_SYMBOL_GPL(blkcg_add_delay); + +module_param(blkcg_debug_stats, bool, 0644); +MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/block/blk-core.c b/block/blk-core.c index ee33590f54eb..12550340418d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -42,7 +42,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" -#include "blk-wbt.h" +#include "blk-rq-qos.h" #ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; @@ -715,6 +715,35 @@ void blk_set_queue_dying(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_set_queue_dying); +/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ +void blk_exit_queue(struct request_queue *q) +{ + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + q->elevator = NULL; + } + + /* + * Remove all references to @q from the block cgroup controller before + * restoring @q->queue_lock to avoid that restoring this pointer causes + * e.g. blkcg_print_blkgs() to crash. + */ + blkcg_exit_queue(q); + + /* + * Since the cgroup code may dereference the @q->backing_dev_info + * pointer, only decrease its reference count after having removed the + * association with the block cgroup controller. + */ + bdi_put(q->backing_dev_info); +} + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -762,9 +791,13 @@ void blk_cleanup_queue(struct request_queue *q) * make sure all in-progress dispatch are completed because * blk_freeze_queue() can only complete all requests, and * dispatch may still be in-progress since we dispatch requests - * from more than one contexts + * from more than one contexts. + * + * No need to quiesce queue if it isn't initialized yet since + * blk_freeze_queue() should be enough for cases of passthrough + * request. */ - if (q->mq_ops) + if (q->mq_ops && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ @@ -780,30 +813,7 @@ void blk_cleanup_queue(struct request_queue *q) */ WARN_ON_ONCE(q->kobj.state_in_sysfs); - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - elevator_exit(q, q->elevator); - q->elevator = NULL; - } - - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); - - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. - */ - bdi_put(q->backing_dev_info); + blk_exit_queue(q); if (q->mq_ops) blk_mq_free_queue(q); @@ -1180,6 +1190,7 @@ out_exit_flush_rq: q->exit_rq_fn(q, q->fq->flush_rq); out_free_flush_queue: blk_free_flush_queue(q->fq); + q->fq = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1641,7 +1652,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, rq); + rq_qos_requeue(q, rq); if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); @@ -1748,7 +1759,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); - wbt_done(q->rq_wb, req); + rq_qos_done(q, req); /* * Request may not have originated from ll_rw_blk. if not, @@ -1982,7 +1993,6 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int where = ELEVATOR_INSERT_SORT; struct request *req, *free; unsigned int request_count = 0; - unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -2040,7 +2050,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); + rq_qos_throttle(q, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. @@ -2050,7 +2060,7 @@ get_rq: req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); - __wbt_done(q->rq_wb, wb_acct); + rq_qos_cleanup(q, bio); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; else @@ -2059,7 +2069,7 @@ get_rq: goto out_unlock; } - wbt_track(req, wb_acct); + rq_qos_track(q, req, bio); /* * After dropping the lock and possibly sleeping here, our request @@ -2700,13 +2710,13 @@ EXPORT_SYMBOL_GPL(blk_rq_err_bytes); void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { - const int rw = rq_data_dir(req); + const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; cpu = part_stat_lock(); part = req->part; - part_stat_add(cpu, part, sectors[rw], bytes >> 9); + part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -2720,7 +2730,7 @@ void blk_account_io_done(struct request *req, u64 now) */ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { unsigned long duration; - const int rw = rq_data_dir(req); + const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; @@ -2728,10 +2738,10 @@ void blk_account_io_done(struct request *req, u64 now) cpu = part_stat_lock(); part = req->part; - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); + part_stat_inc(cpu, part, ios[sgrp]); + part_stat_add(cpu, part, ticks[sgrp], duration); part_round_stats(req->q, cpu, part); - part_dec_in_flight(req->q, part, rw); + part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); part_stat_unlock(); @@ -2751,9 +2761,9 @@ static bool blk_pm_allow_request(struct request *rq) return rq->rq_flags & RQF_PM; case RPM_SUSPENDED: return false; + default: + return true; } - - return true; } #else static bool blk_pm_allow_request(struct request *rq) @@ -2980,7 +2990,7 @@ void blk_start_request(struct request *req) req->throtl_size = blk_rq_sectors(req); #endif req->rq_flags |= RQF_STATS; - wbt_issue(req->q->rq_wb, req); + rq_qos_issue(req->q, req); } BUG_ON(blk_rq_is_complete(req)); @@ -3053,6 +3063,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios); * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both + * blk_rq_bytes() and in blk_update_request(). + * * Return: * %false - this request doesn't have any more data * %true - this request has more data @@ -3200,7 +3214,7 @@ void blk_finish_request(struct request *req, blk_status_t error) blk_account_io_done(req, now); if (req->end_io) { - wbt_done(req->q->rq_wb, req); + rq_qos_done(q, req); req->end_io(req, error); } else { if (blk_bidi_rq(req)) @@ -3763,9 +3777,11 @@ EXPORT_SYMBOL(blk_finish_plug); */ void blk_pm_runtime_init(struct request_queue *q, struct device *dev) { - /* not support for RQF_PM and ->rpm_status in blk-mq yet */ - if (q->mq_ops) + /* Don't enable runtime PM for blk-mq until it is ready */ + if (q->mq_ops) { + pm_runtime_disable(dev); return; + } q->dev = dev; q->rpm_status = RPM_ACTIVE; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index f23311e4b201..01580f88fcb3 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -278,7 +278,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); - INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); + INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c new file mode 100644 index 000000000000..19923f8a029d --- /dev/null +++ b/block/blk-iolatency.c @@ -0,0 +1,955 @@ +/* + * Block rq-qos base io controller + * + * This works similar to wbt with a few exceptions + * + * - It's bio based, so the latency covers the whole block layer in addition to + * the actual io. + * - We will throttle all IO that comes in here if we need to. + * - We use the mean latency over the 100ms window. This is because writes can + * be particularly fast, which could give us a false sense of the impact of + * other workloads on our protected workload. + * - By default there's no throttling, we set the queue_depth to UINT_MAX so + * that we can have as many outstanding bio's as we're allowed to. Only at + * throttle time do we pay attention to the actual queue depth. + * + * The hierarchy works like the cpu controller does, we track the latency at + * every configured node, and each configured node has it's own independent + * queue depth. This means that we only care about our latency targets at the + * peer level. Some group at the bottom of the hierarchy isn't going to affect + * a group at the end of some other path if we're only configred at leaf level. + * + * Consider the following + * + * root blkg + * / \ + * fast (target=5ms) slow (target=10ms) + * / \ / \ + * a b normal(15ms) unloved + * + * "a" and "b" have no target, but their combined io under "fast" cannot exceed + * an average latency of 5ms. If it does then we will throttle the "slow" + * group. In the case of "normal", if it exceeds its 15ms target, we will + * throttle "unloved", but nobody else. + * + * In this example "fast", "slow", and "normal" will be the only groups actually + * accounting their io latencies. We have to walk up the heirarchy to the root + * on every submit and complete so we can do the appropriate stat recording and + * adjust the queue depth of ourselves if needed. + * + * There are 2 ways we throttle IO. + * + * 1) Queue depth throttling. As we throttle down we will adjust the maximum + * number of IO's we're allowed to have in flight. This starts at (u64)-1 down + * to 1. If the group is only ever submitting IO for itself then this is the + * only way we throttle. + * + * 2) Induced delay throttling. This is for the case that a group is generating + * IO that has to be issued by the root cg to avoid priority inversion. So think + * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot + * of work done for us on behalf of the root cg and are being asked to scale + * down more then we induce a latency at userspace return. We accumulate the + * total amount of time we need to be punished by doing + * + * total_time += min_lat_nsec - actual_io_completion + * + * and then at throttle time will do + * + * throttle_time = min(total_time, NSEC_PER_SEC) + * + * This induced delay will throttle back the activity that is generating the + * root cg issued io's, wethere that's some metadata intensive operation or the + * group is using so much memory that it is pushing us into swap. + * + * Copyright (C) 2018 Josef Bacik + */ +#include <linux/kernel.h> +#include <linux/blk_types.h> +#include <linux/backing-dev.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/memcontrol.h> +#include <linux/sched/loadavg.h> +#include <linux/sched/signal.h> +#include <trace/events/block.h> +#include "blk-rq-qos.h" +#include "blk-stat.h" + +#define DEFAULT_SCALE_COOKIE 1000000U + +static struct blkcg_policy blkcg_policy_iolatency; +struct iolatency_grp; + +struct blk_iolatency { + struct rq_qos rqos; + struct timer_list timer; + atomic_t enabled; +}; + +static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) +{ + return container_of(rqos, struct blk_iolatency, rqos); +} + +static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat) +{ + return atomic_read(&blkiolat->enabled) > 0; +} + +struct child_latency_info { + spinlock_t lock; + + /* Last time we adjusted the scale of everybody. */ + u64 last_scale_event; + + /* The latency that we missed. */ + u64 scale_lat; + + /* Total io's from all of our children for the last summation. */ + u64 nr_samples; + + /* The guy who actually changed the latency numbers. */ + struct iolatency_grp *scale_grp; + + /* Cookie to tell if we need to scale up or down. */ + atomic_t scale_cookie; +}; + +struct iolatency_grp { + struct blkg_policy_data pd; + struct blk_rq_stat __percpu *stats; + struct blk_iolatency *blkiolat; + struct rq_depth rq_depth; + struct rq_wait rq_wait; + atomic64_t window_start; + atomic_t scale_cookie; + u64 min_lat_nsec; + u64 cur_win_nsec; + + /* total running average of our io latency. */ + u64 lat_avg; + + /* Our current number of IO's for the last summation. */ + u64 nr_samples; + + struct child_latency_info child_lat; +}; + +#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC) +#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC +/* + * These are the constants used to fake the fixed-point moving average + * calculation just like load average. The call to CALC_LOAD folds + * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling + * window size is bucketed to try to approximately calculate average + * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows + * elapse immediately. Note, windows only elapse with IO activity. Idle + * periods extend the most recent window. + */ +#define BLKIOLATENCY_NR_EXP_FACTORS 5 +#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \ + (BLKIOLATENCY_NR_EXP_FACTORS - 1)) +static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = { + 2045, // exp(1/600) - 600 samples + 2039, // exp(1/240) - 240 samples + 2031, // exp(1/120) - 120 samples + 2023, // exp(1/80) - 80 samples + 2014, // exp(1/60) - 60 samples +}; + +static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct iolatency_grp, pd) : NULL; +} + +static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg) +{ + return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency)); +} + +static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) +{ + return pd_to_blkg(&iolat->pd); +} + +static inline bool iolatency_may_queue(struct iolatency_grp *iolat, + wait_queue_entry_t *wait, + bool first_block) +{ + struct rq_wait *rqw = &iolat->rq_wait; + + if (first_block && waitqueue_active(&rqw->wait) && + rqw->wait.head.next != &wait->entry) + return false; + return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); +} + +static void __blkcg_iolatency_throttle(struct rq_qos *rqos, + struct iolatency_grp *iolat, + spinlock_t *lock, bool issue_as_root, + bool use_memdelay) + __releases(lock) + __acquires(lock) +{ + struct rq_wait *rqw = &iolat->rq_wait; + unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); + DEFINE_WAIT(wait); + bool first_block = true; + + if (use_delay) + blkcg_schedule_throttle(rqos->q, use_memdelay); + + /* + * To avoid priority inversions we want to just take a slot if we are + * issuing as root. If we're being killed off there's no point in + * delaying things, we may have been killed by OOM so throttling may + * make recovery take even longer, so just let the IO's through so the + * task can go away. + */ + if (issue_as_root || fatal_signal_pending(current)) { + atomic_inc(&rqw->inflight); + return; + } + + if (iolatency_may_queue(iolat, &wait, first_block)) + return; + + do { + prepare_to_wait_exclusive(&rqw->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (iolatency_may_queue(iolat, &wait, first_block)) + break; + first_block = false; + + if (lock) { + spin_unlock_irq(lock); + io_schedule(); + spin_lock_irq(lock); + } else { + io_schedule(); + } + } while (1); + + finish_wait(&rqw->wait, &wait); +} + +#define SCALE_DOWN_FACTOR 2 +#define SCALE_UP_FACTOR 4 + +static inline unsigned long scale_amount(unsigned long qd, bool up) +{ + return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL); +} + +/* + * We scale the qd down faster than we scale up, so we need to use this helper + * to adjust the scale_cookie accordingly so we don't prematurely get + * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much. + * + * Each group has their own local copy of the last scale cookie they saw, so if + * the global scale cookie goes up or down they know which way they need to go + * based on their last knowledge of it. + */ +static void scale_cookie_change(struct blk_iolatency *blkiolat, + struct child_latency_info *lat_info, + bool up) +{ + unsigned long qd = blk_queue_depth(blkiolat->rqos.q); + unsigned long scale = scale_amount(qd, up); + unsigned long old = atomic_read(&lat_info->scale_cookie); + unsigned long max_scale = qd << 1; + unsigned long diff = 0; + + if (old < DEFAULT_SCALE_COOKIE) + diff = DEFAULT_SCALE_COOKIE - old; + + if (up) { + if (scale + old > DEFAULT_SCALE_COOKIE) + atomic_set(&lat_info->scale_cookie, + DEFAULT_SCALE_COOKIE); + else if (diff > qd) + atomic_inc(&lat_info->scale_cookie); + else + atomic_add(scale, &lat_info->scale_cookie); + } else { + /* + * We don't want to dig a hole so deep that it takes us hours to + * dig out of it. Just enough that we don't throttle/unthrottle + * with jagged workloads but can still unthrottle once pressure + * has sufficiently dissipated. + */ + if (diff > qd) { + if (diff < max_scale) + atomic_dec(&lat_info->scale_cookie); + } else { + atomic_sub(scale, &lat_info->scale_cookie); + } + } +} + +/* + * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the + * queue depth at a time so we don't get wild swings and hopefully dial in to + * fairer distribution of the overall queue depth. + */ +static void scale_change(struct iolatency_grp *iolat, bool up) +{ + unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q); + unsigned long scale = scale_amount(qd, up); + unsigned long old = iolat->rq_depth.max_depth; + bool changed = false; + + if (old > qd) + old = qd; + + if (up) { + if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat))) + return; + + if (old < qd) { + changed = true; + old += scale; + old = min(old, qd); + iolat->rq_depth.max_depth = old; + wake_up_all(&iolat->rq_wait.wait); + } + } else if (old > 1) { + old >>= 1; + changed = true; + iolat->rq_depth.max_depth = max(old, 1UL); + } +} + +/* Check our parent and see if the scale cookie has changed. */ +static void check_scale_change(struct iolatency_grp *iolat) +{ + struct iolatency_grp *parent; + struct child_latency_info *lat_info; + unsigned int cur_cookie; + unsigned int our_cookie = atomic_read(&iolat->scale_cookie); + u64 scale_lat; + unsigned int old; + int direction = 0; + + if (lat_to_blkg(iolat)->parent == NULL) + return; + + parent = blkg_to_lat(lat_to_blkg(iolat)->parent); + if (!parent) + return; + + lat_info = &parent->child_lat; + cur_cookie = atomic_read(&lat_info->scale_cookie); + scale_lat = READ_ONCE(lat_info->scale_lat); + + if (cur_cookie < our_cookie) + direction = -1; + else if (cur_cookie > our_cookie) + direction = 1; + else + return; + + old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie); + + /* Somebody beat us to the punch, just bail. */ + if (old != our_cookie) + return; + + if (direction < 0 && iolat->min_lat_nsec) { + u64 samples_thresh; + + if (!scale_lat || iolat->min_lat_nsec <= scale_lat) + return; + + /* + * Sometimes high priority groups are their own worst enemy, so + * instead of taking it out on some poor other group that did 5% + * or less of the IO's for the last summation just skip this + * scale down event. + */ + samples_thresh = lat_info->nr_samples * 5; + samples_thresh = div64_u64(samples_thresh, 100); + if (iolat->nr_samples <= samples_thresh) + return; + } + + /* We're as low as we can go. */ + if (iolat->rq_depth.max_depth == 1 && direction < 0) { + blkcg_use_delay(lat_to_blkg(iolat)); + return; + } + + /* We're back to the default cookie, unthrottle all the things. */ + if (cur_cookie == DEFAULT_SCALE_COOKIE) { + blkcg_clear_delay(lat_to_blkg(iolat)); + iolat->rq_depth.max_depth = UINT_MAX; + wake_up_all(&iolat->rq_wait.wait); + return; + } + + scale_change(iolat, direction > 0); +} + +static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, + spinlock_t *lock) +{ + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + struct blkcg *blkcg; + struct blkcg_gq *blkg; + struct request_queue *q = rqos->q; + bool issue_as_root = bio_issue_as_root_blkg(bio); + + if (!blk_iolatency_enabled(blkiolat)) + return; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + bio_associate_blkcg(bio, &blkcg->css); + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + if (!lock) + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + if (!lock) + spin_unlock_irq(q->queue_lock); + } + if (!blkg) + goto out; + + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio_associate_blkg(bio, blkg); +out: + rcu_read_unlock(); + while (blkg && blkg->parent) { + struct iolatency_grp *iolat = blkg_to_lat(blkg); + if (!iolat) { + blkg = blkg->parent; + continue; + } + + check_scale_change(iolat); + __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); + blkg = blkg->parent; + } + if (!timer_pending(&blkiolat->timer)) + mod_timer(&blkiolat->timer, jiffies + HZ); +} + +static void iolatency_record_time(struct iolatency_grp *iolat, + struct bio_issue *issue, u64 now, + bool issue_as_root) +{ + struct blk_rq_stat *rq_stat; + u64 start = bio_issue_time(issue); + u64 req_time; + + /* + * Have to do this so we are truncated to the correct time that our + * issue is truncated to. + */ + now = __bio_issue_time(now); + + if (now <= start) + return; + + req_time = now - start; + + /* + * We don't want to count issue_as_root bio's in the cgroups latency + * statistics as it could skew the numbers downwards. + */ + if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) { + u64 sub = iolat->min_lat_nsec; + if (req_time < sub) + blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); + return; + } + + rq_stat = get_cpu_ptr(iolat->stats); + blk_rq_stat_add(rq_stat, req_time); + put_cpu_ptr(rq_stat); +} + +#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) +#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5 + +static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) +{ + struct blkcg_gq *blkg = lat_to_blkg(iolat); + struct iolatency_grp *parent; + struct child_latency_info *lat_info; + struct blk_rq_stat stat; + unsigned long flags; + int cpu, exp_idx; + + blk_rq_stat_init(&stat); + preempt_disable(); + for_each_online_cpu(cpu) { + struct blk_rq_stat *s; + s = per_cpu_ptr(iolat->stats, cpu); + blk_rq_stat_sum(&stat, s); + blk_rq_stat_init(s); + } + preempt_enable(); + + parent = blkg_to_lat(blkg->parent); + if (!parent) + return; + + lat_info = &parent->child_lat; + + /* + * CALC_LOAD takes in a number stored in fixed point representation. + * Because we are using this for IO time in ns, the values stored + * are significantly larger than the FIXED_1 denominator (2048). + * Therefore, rounding errors in the calculation are negligible and + * can be ignored. + */ + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, + div64_u64(iolat->cur_win_nsec, + BLKIOLATENCY_EXP_BUCKET_SIZE)); + CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean); + + /* Everything is ok and we don't need to adjust the scale. */ + if (stat.mean <= iolat->min_lat_nsec && + atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) + return; + + /* Somebody beat us to the punch, just bail. */ + spin_lock_irqsave(&lat_info->lock, flags); + lat_info->nr_samples -= iolat->nr_samples; + lat_info->nr_samples += stat.nr_samples; + iolat->nr_samples = stat.nr_samples; + + if ((lat_info->last_scale_event >= now || + now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) && + lat_info->scale_lat <= iolat->min_lat_nsec) + goto out; + + if (stat.mean <= iolat->min_lat_nsec && + stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) { + if (lat_info->scale_grp == iolat) { + lat_info->last_scale_event = now; + scale_cookie_change(iolat->blkiolat, lat_info, true); + } + } else if (stat.mean > iolat->min_lat_nsec) { + lat_info->last_scale_event = now; + if (!lat_info->scale_grp || + lat_info->scale_lat > iolat->min_lat_nsec) { + WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec); + lat_info->scale_grp = iolat; + } + scale_cookie_change(iolat->blkiolat, lat_info, false); + } +out: + spin_unlock_irqrestore(&lat_info->lock, flags); +} + +static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg; + struct rq_wait *rqw; + struct iolatency_grp *iolat; + u64 window_start; + u64 now = ktime_to_ns(ktime_get()); + bool issue_as_root = bio_issue_as_root_blkg(bio); + bool enabled = false; + + blkg = bio->bi_blkg; + if (!blkg) + return; + + iolat = blkg_to_lat(bio->bi_blkg); + if (!iolat) + return; + + enabled = blk_iolatency_enabled(iolat->blkiolat); + while (blkg && blkg->parent) { + iolat = blkg_to_lat(blkg); + if (!iolat) { + blkg = blkg->parent; + continue; + } + rqw = &iolat->rq_wait; + + atomic_dec(&rqw->inflight); + if (!enabled || iolat->min_lat_nsec == 0) + goto next; + iolatency_record_time(iolat, &bio->bi_issue, now, + issue_as_root); + window_start = atomic64_read(&iolat->window_start); + if (now > window_start && + (now - window_start) >= iolat->cur_win_nsec) { + if (atomic64_cmpxchg(&iolat->window_start, + window_start, now) == window_start) + iolatency_check_latencies(iolat, now); + } +next: + wake_up(&rqw->wait); + blkg = blkg->parent; + } +} + +static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg; + + blkg = bio->bi_blkg; + while (blkg && blkg->parent) { + struct rq_wait *rqw; + struct iolatency_grp *iolat; + + iolat = blkg_to_lat(blkg); + if (!iolat) + goto next; + + rqw = &iolat->rq_wait; + atomic_dec(&rqw->inflight); + wake_up(&rqw->wait); +next: + blkg = blkg->parent; + } +} + +static void blkcg_iolatency_exit(struct rq_qos *rqos) +{ + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + + del_timer_sync(&blkiolat->timer); + blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); + kfree(blkiolat); +} + +static struct rq_qos_ops blkcg_iolatency_ops = { + .throttle = blkcg_iolatency_throttle, + .cleanup = blkcg_iolatency_cleanup, + .done_bio = blkcg_iolatency_done_bio, + .exit = blkcg_iolatency_exit, +}; + +static void blkiolatency_timer_fn(struct timer_list *t) +{ + struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); + struct blkcg_gq *blkg; + struct cgroup_subsys_state *pos_css; + u64 now = ktime_to_ns(ktime_get()); + + rcu_read_lock(); + blkg_for_each_descendant_pre(blkg, pos_css, + blkiolat->rqos.q->root_blkg) { + struct iolatency_grp *iolat; + struct child_latency_info *lat_info; + unsigned long flags; + u64 cookie; + + /* + * We could be exiting, don't access the pd unless we have a + * ref on the blkg. + */ + if (!blkg_try_get(blkg)) + continue; + + iolat = blkg_to_lat(blkg); + if (!iolat) + goto next; + + lat_info = &iolat->child_lat; + cookie = atomic_read(&lat_info->scale_cookie); + + if (cookie >= DEFAULT_SCALE_COOKIE) + goto next; + + spin_lock_irqsave(&lat_info->lock, flags); + if (lat_info->last_scale_event >= now) + goto next_lock; + + /* + * We scaled down but don't have a scale_grp, scale up and carry + * on. + */ + if (lat_info->scale_grp == NULL) { + scale_cookie_change(iolat->blkiolat, lat_info, true); + goto next_lock; + } + + /* + * It's been 5 seconds since our last scale event, clear the + * scale grp in case the group that needed the scale down isn't + * doing any IO currently. + */ + if (now - lat_info->last_scale_event >= + ((u64)NSEC_PER_SEC * 5)) + lat_info->scale_grp = NULL; +next_lock: + spin_unlock_irqrestore(&lat_info->lock, flags); +next: + blkg_put(blkg); + } + rcu_read_unlock(); +} + +int blk_iolatency_init(struct request_queue *q) +{ + struct blk_iolatency *blkiolat; + struct rq_qos *rqos; + int ret; + + blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL); + if (!blkiolat) + return -ENOMEM; + + rqos = &blkiolat->rqos; + rqos->id = RQ_QOS_CGROUP; + rqos->ops = &blkcg_iolatency_ops; + rqos->q = q; + + rq_qos_add(q, rqos); + + ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); + if (ret) { + rq_qos_del(q, rqos); + kfree(blkiolat); + return ret; + } + + timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); + + return 0; +} + +static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) +{ + struct iolatency_grp *iolat = blkg_to_lat(blkg); + struct blk_iolatency *blkiolat = iolat->blkiolat; + u64 oldval = iolat->min_lat_nsec; + + iolat->min_lat_nsec = val; + iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE); + iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, + BLKIOLATENCY_MAX_WIN_SIZE); + + if (!oldval && val) + atomic_inc(&blkiolat->enabled); + if (oldval && !val) + atomic_dec(&blkiolat->enabled); +} + +static void iolatency_clear_scaling(struct blkcg_gq *blkg) +{ + if (blkg->parent) { + struct iolatency_grp *iolat = blkg_to_lat(blkg->parent); + struct child_latency_info *lat_info; + if (!iolat) + return; + + lat_info = &iolat->child_lat; + spin_lock(&lat_info->lock); + atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE); + lat_info->last_scale_event = 0; + lat_info->scale_grp = NULL; + lat_info->scale_lat = 0; + spin_unlock(&lat_info->lock); + } +} + +static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkcg_gq *blkg; + struct blk_iolatency *blkiolat; + struct blkg_conf_ctx ctx; + struct iolatency_grp *iolat; + char *p, *tok; + u64 lat_val = 0; + u64 oldval; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); + if (ret) + return ret; + + iolat = blkg_to_lat(ctx.blkg); + blkiolat = iolat->blkiolat; + p = ctx.body; + + ret = -EINVAL; + while ((tok = strsep(&p, " "))) { + char key[16]; + char val[21]; /* 18446744073709551616 */ + + if (sscanf(tok, "%15[^=]=%20s", key, val) != 2) + goto out; + + if (!strcmp(key, "target")) { + u64 v; + + if (!strcmp(val, "max")) + lat_val = 0; + else if (sscanf(val, "%llu", &v) == 1) + lat_val = v * NSEC_PER_USEC; + else + goto out; + } else { + goto out; + } + } + + /* Walk up the tree to see if our new val is lower than it should be. */ + blkg = ctx.blkg; + oldval = iolat->min_lat_nsec; + + iolatency_set_min_lat_nsec(blkg, lat_val); + if (oldval != iolat->min_lat_nsec) { + iolatency_clear_scaling(blkg); + } + + ret = 0; +out: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static u64 iolatency_prfill_limit(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + const char *dname = blkg_dev_name(pd->blkg); + + if (!dname || !iolat->min_lat_nsec) + return 0; + seq_printf(sf, "%s target=%llu\n", + dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC)); + return 0; +} + +static int iolatency_print_limit(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + iolatency_prfill_limit, + &blkcg_policy_iolatency, seq_cft(sf)->private, false); + return 0; +} + +static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, + size_t size) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); + unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); + + if (iolat->rq_depth.max_depth == UINT_MAX) + return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); + + return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); +} + + +static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node) +{ + struct iolatency_grp *iolat; + + iolat = kzalloc_node(sizeof(*iolat), gfp, node); + if (!iolat) + return NULL; + iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat), + __alignof__(struct blk_rq_stat), gfp); + if (!iolat->stats) { + kfree(iolat); + return NULL; + } + return &iolat->pd; +} + +static void iolatency_pd_init(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + struct blkcg_gq *blkg = lat_to_blkg(iolat); + struct rq_qos *rqos = blkcg_rq_qos(blkg->q); + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + for_each_possible_cpu(cpu) { + struct blk_rq_stat *stat; + stat = per_cpu_ptr(iolat->stats, cpu); + blk_rq_stat_init(stat); + } + + rq_wait_init(&iolat->rq_wait); + spin_lock_init(&iolat->child_lat.lock); + iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q); + iolat->rq_depth.max_depth = UINT_MAX; + iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; + iolat->blkiolat = blkiolat; + iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; + atomic64_set(&iolat->window_start, now); + + /* + * We init things in list order, so the pd for the parent may not be + * init'ed yet for whatever reason. + */ + if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) { + struct iolatency_grp *parent = blkg_to_lat(blkg->parent); + atomic_set(&iolat->scale_cookie, + atomic_read(&parent->child_lat.scale_cookie)); + } else { + atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE); + } + + atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE); +} + +static void iolatency_pd_offline(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + struct blkcg_gq *blkg = lat_to_blkg(iolat); + + iolatency_set_min_lat_nsec(blkg, 0); + iolatency_clear_scaling(blkg); +} + +static void iolatency_pd_free(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + free_percpu(iolat->stats); + kfree(iolat); +} + +static struct cftype iolatency_files[] = { + { + .name = "latency", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = iolatency_print_limit, + .write = iolatency_set_limit, + }, + {} +}; + +static struct blkcg_policy blkcg_policy_iolatency = { + .dfl_cftypes = iolatency_files, + .pd_alloc_fn = iolatency_pd_alloc, + .pd_init_fn = iolatency_pd_init, + .pd_offline_fn = iolatency_pd_offline, + .pd_free_fn = iolatency_pd_free, + .pd_stat_fn = iolatency_pd_stat, +}; + +static int __init iolatency_init(void) +{ + return blkcg_policy_register(&blkcg_policy_iolatency); +} + +static void __exit iolatency_exit(void) +{ + return blkcg_policy_unregister(&blkcg_policy_iolatency); +} + +module_init(iolatency_init); +module_exit(iolatency_exit); diff --git a/block/blk-lib.c b/block/blk-lib.c index 8faa70f26fcd..d1b9dd03da25 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -68,6 +68,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, */ req_sects = min_t(sector_t, nr_sects, q->limits.max_discard_sectors); + if (!req_sects) + goto fail; if (req_sects > UINT_MAX >> 9) req_sects = UINT_MAX >> 9; @@ -105,6 +107,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, *biop = bio; return 0; + +fail: + if (bio) { + submit_bio_wait(bio); + bio_put(bio); + } + *biop = NULL; + return -EOPNOTSUPP; } EXPORT_SYMBOL(__blkdev_issue_discard); diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c new file mode 100644 index 000000000000..fb2c82c351e4 --- /dev/null +++ b/block/blk-mq-debugfs-zoned.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. + */ + +#include <linux/blkdev.h> +#include "blk-mq-debugfs.h" + +int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + unsigned int i; + + if (!q->seq_zones_wlock) + return 0; + + for (i = 0; i < q->nr_zones; i++) + if (test_bit(i, q->seq_zones_wlock)) + seq_printf(m, "%u\n", i); + + return 0; +} diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1c4532e92938..cb1e6cf7ac48 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, return count; } -static int queue_zone_wlock_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - unsigned int i; - - if (!q->seq_zones_wlock) - return 0; - - for (i = 0; i < blk_queue_nr_zones(q); i++) - if (test_bit(i, q->seq_zones_wlock)) - seq_printf(m, "%u\n", i); - - return 0; -} - static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, @@ -637,6 +622,14 @@ static int hctx_active_show(void *data, struct seq_file *m) return 0; } +static int hctx_dispatch_busy_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + seq_printf(m, "%u\n", hctx->dispatch_busy); + return 0; +} + static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) __acquires(&ctx->lock) { @@ -798,6 +791,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"queued", 0600, hctx_queued_show, hctx_queued_write}, {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, + {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {}, }; diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index b9d366e57097..a9160be12be0 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc } #endif +#ifdef CONFIG_BLK_DEBUG_FS_ZONED +int queue_zone_wlock_show(void *data, struct seq_file *m); +#else +static inline int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + return 0; +} +#endif + #endif diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index e233996bb76f..db644ec624f5 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -17,6 +17,8 @@ #include <linux/pci.h> #include <linux/module.h> +#include "blk-mq.h" + /** * blk_mq_pci_map_queues - provide a default queue mapping for PCI device * @set: tagset to provide the mapping for @@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, fallback: WARN_ON_ONCE(set->nr_hw_queues > 1); - for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + blk_mq_clear_mq_map(set); return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 56c493c6cd90..cf9c66c6d35a 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; - - if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_inc(&q->shared_hctx_restart); - } else - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; - - if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); - } else - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + return; + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - return blk_mq_run_hw_queue(hctx, true); + blk_mq_run_hw_queue(hctx, true); } /* @@ -219,15 +206,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) } } else if (has_sched_dispatch) { blk_mq_do_dispatch_sched(hctx); - } else if (q->mq_ops->get_budget) { - /* - * If we need to get budget before queuing request, we - * dequeue request one by one from sw queue for avoiding - * to mess up I/O merge when dispatch runs out of resource. - * - * TODO: get more budgets, and dequeue more requests in - * one time. - */ + } else if (hctx->dispatch_busy) { + /* dequeue request one by one from sw queue if queue is busy */ blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); @@ -339,7 +319,8 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) return e->type->ops.mq.bio_merge(hctx, bio); } - if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) { + if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && + !list_empty_careful(&ctx->rq_list)) { /* default per sw-queue merge */ spin_lock(&ctx->lock); ret = blk_mq_attempt_merge(q, ctx, bio); @@ -380,68 +361,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return false; } -/** - * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list - * @pos: loop cursor. - * @skip: the list element that will not be examined. Iteration starts at - * @skip->next. - * @head: head of the list to examine. This list must have at least one - * element, namely @skip. - * @member: name of the list_head structure within typeof(*pos). - */ -#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ - for ((pos) = (skip); \ - (pos = (pos)->member.next != (head) ? list_entry_rcu( \ - (pos)->member.next, typeof(*pos), member) : \ - list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ - (pos) != (skip); ) - -/* - * Called after a driver tag has been freed to check whether a hctx needs to - * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware - * queues in a round-robin fashion if the tag set of @hctx is shared with other - * hardware queues. - */ -void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) -{ - struct blk_mq_tags *const tags = hctx->tags; - struct blk_mq_tag_set *const set = hctx->queue->tag_set; - struct request_queue *const queue = hctx->queue, *q; - struct blk_mq_hw_ctx *hctx2; - unsigned int i, j; - - if (set->flags & BLK_MQ_F_TAG_SHARED) { - /* - * If this is 0, then we know that no hardware queues - * have RESTART marked. We're done. - */ - if (!atomic_read(&queue->shared_hctx_restart)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu_rr(q, queue, &set->tag_list, - tag_set_list) { - queue_for_each_hw_ctx(q, hctx2, i) - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - goto done; - } - j = hctx->queue_num + 1; - for (i = 0; i < queue->nr_hw_queues; i++, j++) { - if (j == queue->nr_hw_queues) - j = 0; - hctx2 = queue->queue_hw_ctx[j]; - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - break; - } -done: - rcu_read_unlock(); - } else { - blk_mq_sched_restart_hctx(hctx); - } -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async) { @@ -486,8 +405,19 @@ void blk_mq_sched_insert_requests(struct request_queue *q, if (e && e->type->ops.mq.insert_requests) e->type->ops.mq.insert_requests(hctx, list, false); - else + else { + /* + * try to issue requests directly if the hw queue isn't + * busy in case of 'none' scheduler, and this way may save + * us one extra enqueue & dequeue to sw queue. + */ + if (!hctx->dispatch_busy && !e && !run_queue_async) { + blk_mq_try_issue_list_directly(hctx, list); + if (list_empty(list)) + return; + } blk_mq_insert_requests(hctx, ctx, list); + } blk_mq_run_hw_queue(hctx, run_queue_async); } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 3de0836163c2..816923bf874d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -23,6 +23,9 @@ bool blk_mq_has_free_tags(struct blk_mq_tags *tags) /* * If a previously inactive queue goes active, bump the active user count. + * We need to do this before try to allocate driver tag, then even if fail + * to get tag when first time, the other shared-tag users could reserve + * budget for it. */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { @@ -399,8 +402,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth <= tags->nr_reserved_tags) return -EINVAL; - tdepth -= tags->nr_reserved_tags; - /* * If we are allowed to grow beyond the original size, allocate * a new set of tags before freeing the old one. @@ -420,7 +421,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > 16 * BLKDEV_MAX_RQ) return -EINVAL; - new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0); + new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, + tags->nr_reserved_tags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); @@ -437,7 +439,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ - sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + sbitmap_queue_resize(&tags->bitmap_tags, + tdepth - tags->nr_reserved_tags); } return 0; diff --git a/block/blk-mq.c b/block/blk-mq.c index 654b0dc7e001..72a0033ccee9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -34,8 +34,8 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-stat.h" -#include "blk-wbt.h" #include "blk-mq-sched.h" +#include "blk-rq-qos.h" static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); @@ -285,7 +285,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->tag = -1; rq->internal_tag = tag; } else { - if (blk_mq_tag_busy(data->hctx)) { + if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } @@ -367,6 +367,8 @@ static struct request *blk_mq_get_request(struct request_queue *q, if (!op_is_flush(op) && e->type->ops.mq.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) e->type->ops.mq.limit_depth(op, data); + } else { + blk_mq_tag_busy(data->hctx); } tag = blk_mq_get_tag(data); @@ -504,7 +506,7 @@ void blk_mq_free_request(struct request *rq) if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->backing_dev_info); - wbt_done(q->rq_wb, rq); + rq_qos_done(q, rq); if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); @@ -527,7 +529,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_account_io_done(rq, now); if (rq->end_io) { - wbt_done(rq->q->rq_wb, rq); + rq_qos_done(rq->q, rq); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -639,7 +641,7 @@ void blk_mq_start_request(struct request *rq) rq->throtl_size = blk_rq_sectors(rq); #endif rq->rq_flags |= RQF_STATS; - wbt_issue(q->rq_wb, rq); + rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); @@ -665,7 +667,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); trace_block_rq_requeue(q, rq); - wbt_requeue(q->rq_wb, rq); + rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); @@ -962,16 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } -bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, - bool wait) +bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_alloc_data data = { .q = rq->q, .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), - .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + .flags = BLK_MQ_REQ_NOWAIT, }; - - might_sleep_if(wait); + bool shared; if (rq->tag != -1) goto done; @@ -979,9 +979,10 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) data.flags |= BLK_MQ_REQ_RESERVED; + shared = blk_mq_tag_busy(data.hctx); rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { - if (blk_mq_tag_busy(data.hctx)) { + if (shared) { rq->rq_flags |= RQF_MQ_INFLIGHT; atomic_inc(&data.hctx->nr_active); } @@ -989,8 +990,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, } done: - if (hctx) - *hctx = data.hctx; return rq->tag != -1; } @@ -1001,7 +1000,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); + spin_lock(&hctx->dispatch_wait_lock); list_del_init(&wait->entry); + spin_unlock(&hctx->dispatch_wait_lock); + blk_mq_run_hw_queue(hctx, true); return 1; } @@ -1012,17 +1014,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, * restart. For both cases, take care to check the condition again after * marking us as waiting. */ -static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, +static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *this_hctx = *hctx; - struct sbq_wait_state *ws; + struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; - if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* * It's possible that a tag was freed in the window between the @@ -1032,30 +1033,35 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ - return blk_mq_get_driver_tag(rq, hctx, false); + return blk_mq_get_driver_tag(rq); } - wait = &this_hctx->dispatch_wait; + wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; - spin_lock(&this_hctx->lock); + wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; + + spin_lock_irq(&wq->lock); + spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return false; } - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); - add_wait_queue(&ws->wait, wait); + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + __add_wait_queue(wq, wait); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ - ret = blk_mq_get_driver_tag(rq, hctx, false); + ret = blk_mq_get_driver_tag(rq); if (!ret) { - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return false; } @@ -1063,14 +1069,42 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ - spin_lock_irq(&ws->wait.lock); list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return true; } +#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 +#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 +/* + * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): + * - EWMA is one simple way to compute running average value + * - weight(7/8 and 1/8) is applied so that it can decrease exponentially + * - take 4 as factor for avoiding to get too small(0) result, and this + * factor doesn't matter because EWMA decreases exponentially + */ +static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) +{ + unsigned int ewma; + + if (hctx->queue->elevator) + return; + + ewma = hctx->dispatch_busy; + + if (!ewma && !busy) + return; + + ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; + if (busy) + ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; + ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; + + hctx->dispatch_busy = ewma; +} + #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ /* @@ -1103,7 +1137,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) break; - if (!blk_mq_get_driver_tag(rq, NULL, false)) { + if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The @@ -1111,7 +1145,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * before we add this entry back on the dispatch list, * we'll re-run it below. */ - if (!blk_mq_mark_tag_wait(&hctx, rq)) { + if (!blk_mq_mark_tag_wait(hctx, rq)) { blk_mq_put_dispatch_budget(hctx); /* * For non-shared tags, the RESTART check @@ -1135,7 +1169,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bd.last = true; else { nxt = list_first_entry(list, struct request, queuelist); - bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); + bd.last = !blk_mq_get_driver_tag(nxt); } ret = q->mq_ops->queue_rq(hctx, &bd); @@ -1207,8 +1241,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, else if (needs_restart && (ret == BLK_STS_RESOURCE)) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); + blk_mq_update_dispatch_busy(hctx, true); return false; - } + } else + blk_mq_update_dispatch_busy(hctx, false); /* * If the host/device is unable to accept more work, inform the @@ -1542,19 +1578,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list) { + struct request *rq; + /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ - spin_lock(&ctx->lock); - while (!list_empty(list)) { - struct request *rq; - - rq = list_first_entry(list, struct request, queuelist); + list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - list_del_init(&rq->queuelist); - __blk_mq_insert_req_list(hctx, rq, false); + trace_block_rq_insert(hctx->queue, rq); } + + spin_lock(&ctx->lock); + list_splice_tail_init(list, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } @@ -1657,13 +1693,16 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: + blk_mq_update_dispatch_busy(hctx, false); *cookie = new_cookie; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: + blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: + blk_mq_update_dispatch_busy(hctx, false); *cookie = BLK_QC_T_NONE; break; } @@ -1698,7 +1737,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_dispatch_budget(hctx)) goto insert; - if (!blk_mq_get_driver_tag(rq, NULL, false)) { + if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); goto insert; } @@ -1746,6 +1785,27 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq) return ret; } +void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list) +{ + while (!list_empty(list)) { + blk_status_t ret; + struct request *rq = list_first_entry(list, struct request, + queuelist); + + list_del_init(&rq->queuelist); + ret = blk_mq_request_issue_directly(rq); + if (ret != BLK_STS_OK) { + if (ret == BLK_STS_RESOURCE || + ret == BLK_STS_DEV_RESOURCE) { + list_add(&rq->queuelist, list); + break; + } + blk_mq_end_request(rq, ret); + } + } +} + static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); @@ -1756,7 +1816,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; - unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1772,19 +1831,19 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - wb_acct = wbt_wait(q->rq_wb, bio, NULL); + rq_qos_throttle(q, bio, NULL); trace_block_getrq(q, bio, bio->bi_opf); rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { - __wbt_done(q->rq_wb, wb_acct); + rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); return BLK_QC_T_NONE; } - wbt_track(rq, wb_acct); + rq_qos_track(q, rq, bio); cookie = request_to_qc_t(data.hctx, rq); @@ -1847,7 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie); } - } else if (q->nr_hw_queues > 1 && is_sync) { + } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && + !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); blk_mq_try_issue_directly(data.hctx, rq, &cookie); @@ -2146,6 +2206,7 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); @@ -2331,15 +2392,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_inc(&q->shared_hctx_restart); + if (shared) hctx->flags |= BLK_MQ_F_TAG_SHARED; - } else { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); + else hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } } } @@ -2370,7 +2426,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) blk_mq_update_tag_set_depth(set, false); } mutex_unlock(&set->tag_list_lock); - synchronize_rcu(); INIT_LIST_HEAD(&q->tag_set_list); } @@ -2685,7 +2740,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { if (set->ops->map_queues) { - int cpu; /* * transport .map_queues is usually done in the following * way: @@ -2700,8 +2754,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) * killing stale mapping since one CPU may not be mapped * to any hw queue. */ - for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + blk_mq_clear_mq_map(set); return set->ops->map_queues(set); } else @@ -2711,7 +2764,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the - * requested depth down, if if it too large. In that case, the set + * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) diff --git a/block/blk-mq.h b/block/blk-mq.h index 89231e439b2f..9497b47e2526 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, - bool wait); +bool blk_mq_get_driver_tag(struct request *rq); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); @@ -65,6 +64,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, /* Used by blk_insert_cloned_request() to issue request directly */ blk_status_t blk_mq_request_issue_directly(struct request *rq); +void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list); /* * CPU -> queue mappings @@ -203,4 +204,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(hctx, rq); } +static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) +{ + int cpu; + + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; +} + #endif diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c new file mode 100644 index 000000000000..0005dfd568dd --- /dev/null +++ b/block/blk-rq-qos.c @@ -0,0 +1,194 @@ +#include "blk-rq-qos.h" + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. + */ +static bool atomic_inc_below(atomic_t *v, unsigned int below) +{ + unsigned int cur = atomic_read(v); + + for (;;) { + unsigned int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) +{ + return atomic_inc_below(&rq_wait->inflight, limit); +} + +void rq_qos_cleanup(struct request_queue *q, struct bio *bio) +{ + struct rq_qos *rqos; + + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->cleanup) + rqos->ops->cleanup(rqos, bio); + } +} + +void rq_qos_done(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->done) + rqos->ops->done(rqos, rq); + } +} + +void rq_qos_issue(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->issue) + rqos->ops->issue(rqos, rq); + } +} + +void rq_qos_requeue(struct request_queue *q, struct request *rq) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->requeue) + rqos->ops->requeue(rqos, rq); + } +} + +void rq_qos_throttle(struct request_queue *q, struct bio *bio, + spinlock_t *lock) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->throttle) + rqos->ops->throttle(rqos, bio, lock); + } +} + +void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->track) + rqos->ops->track(rqos, rq, bio); + } +} + +void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +{ + struct rq_qos *rqos; + + for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->ops->done_bio) + rqos->ops->done_bio(rqos, bio); + } +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +bool rq_depth_calc_max_depth(struct rq_depth *rqd) +{ + unsigned int depth; + bool ret = false; + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rqd->queue_depth == 1) { + if (rqd->scale_step > 0) + rqd->max_depth = 1; + else { + rqd->max_depth = 2; + ret = true; + } + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, rqd->default_depth, + rqd->queue_depth); + if (rqd->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rqd->scale_step)); + else if (rqd->scale_step < 0) { + unsigned int maxd = 3 * rqd->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rqd->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + rqd->max_depth = depth; + } + + return ret; +} + +void rq_depth_scale_up(struct rq_depth *rqd) +{ + /* + * Hit max in previous round, stop here + */ + if (rqd->scaled_max) + return; + + rqd->scale_step--; + + rqd->scaled_max = rq_depth_calc_max_depth(rqd); +} + +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. + */ +void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rqd->max_depth == 1) + return; + + if (rqd->scale_step < 0 && hard_throttle) + rqd->scale_step = 0; + else + rqd->scale_step++; + + rqd->scaled_max = false; + rq_depth_calc_max_depth(rqd); +} + +void rq_qos_exit(struct request_queue *q) +{ + while (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + q->rq_qos = rqos->next; + rqos->ops->exit(rqos); + } +} diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h new file mode 100644 index 000000000000..32b02efbfa66 --- /dev/null +++ b/block/blk-rq-qos.h @@ -0,0 +1,109 @@ +#ifndef RQ_QOS_H +#define RQ_QOS_H + +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/blk_types.h> +#include <linux/atomic.h> +#include <linux/wait.h> + +enum rq_qos_id { + RQ_QOS_WBT, + RQ_QOS_CGROUP, +}; + +struct rq_wait { + wait_queue_head_t wait; + atomic_t inflight; +}; + +struct rq_qos { + struct rq_qos_ops *ops; + struct request_queue *q; + enum rq_qos_id id; + struct rq_qos *next; +}; + +struct rq_qos_ops { + void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); + void (*track)(struct rq_qos *, struct request *, struct bio *); + void (*issue)(struct rq_qos *, struct request *); + void (*requeue)(struct rq_qos *, struct request *); + void (*done)(struct rq_qos *, struct request *); + void (*done_bio)(struct rq_qos *, struct bio *); + void (*cleanup)(struct rq_qos *, struct bio *); + void (*exit)(struct rq_qos *); +}; + +struct rq_depth { + unsigned int max_depth; + + int scale_step; + bool scaled_max; + + unsigned int queue_depth; + unsigned int default_depth; +}; + +static inline struct rq_qos *rq_qos_id(struct request_queue *q, + enum rq_qos_id id) +{ + struct rq_qos *rqos; + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->id == id) + break; + } + return rqos; +} + +static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) +{ + return rq_qos_id(q, RQ_QOS_WBT); +} + +static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) +{ + return rq_qos_id(q, RQ_QOS_CGROUP); +} + +static inline void rq_wait_init(struct rq_wait *rq_wait) +{ + atomic_set(&rq_wait->inflight, 0); + init_waitqueue_head(&rq_wait->wait); +} + +static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) +{ + rqos->next = q->rq_qos; + q->rq_qos = rqos; +} + +static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) +{ + struct rq_qos *cur, *prev = NULL; + for (cur = q->rq_qos; cur; cur = cur->next) { + if (cur == rqos) { + if (prev) + prev->next = rqos->next; + else + q->rq_qos = cur; + break; + } + prev = cur; + } +} + +bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); +void rq_depth_scale_up(struct rq_depth *rqd); +void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); +bool rq_depth_calc_max_depth(struct rq_depth *rqd); + +void rq_qos_cleanup(struct request_queue *, struct bio *); +void rq_qos_done(struct request_queue *, struct request *); +void rq_qos_issue(struct request_queue *, struct request *); +void rq_qos_requeue(struct request_queue *, struct request *); +void rq_qos_done_bio(struct request_queue *q, struct bio *bio); +void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); +void rq_qos_track(struct request_queue *q, struct request *, struct bio *); +void rq_qos_exit(struct request_queue *); +#endif diff --git a/block/blk-settings.c b/block/blk-settings.c index d1de71124656..ffd459969689 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -128,7 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) /* Inherit limits from component devices */ lim->max_segments = USHRT_MAX; - lim->max_discard_segments = 1; + lim->max_discard_segments = USHRT_MAX; lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; @@ -875,7 +875,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; - wbt_set_queue_depth(q->rq_wb, depth); + wbt_set_queue_depth(q, depth); } EXPORT_SYMBOL(blk_set_queue_depth); @@ -900,7 +900,7 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) queue_flag_clear(QUEUE_FLAG_FUA, q); spin_unlock_irq(q->queue_lock); - wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); diff --git a/block/blk-stat.c b/block/blk-stat.c index 175c143ac5b9..7587b1c3caaf 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -17,7 +17,7 @@ struct blk_queue_stats { bool enable_accounting; }; -static void blk_stat_init(struct blk_rq_stat *stat) +void blk_rq_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; @@ -25,7 +25,7 @@ static void blk_stat_init(struct blk_rq_stat *stat) } /* src is a per-cpu stat, mean isn't initialized */ -static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { if (!src->nr_samples) return; @@ -39,7 +39,7 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) dst->nr_samples += src->nr_samples; } -static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) +void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); @@ -69,7 +69,7 @@ void blk_stat_add(struct request *rq, u64 now) continue; stat = &get_cpu_ptr(cb->cpu_stat)[bucket]; - __blk_stat_add(stat, value); + blk_rq_stat_add(stat, value); put_cpu_ptr(cb->cpu_stat); } rcu_read_unlock(); @@ -82,15 +82,15 @@ static void blk_stat_timer_fn(struct timer_list *t) int cpu; for (bucket = 0; bucket < cb->buckets; bucket++) - blk_stat_init(&cb->stat[bucket]); + blk_rq_stat_init(&cb->stat[bucket]); for_each_online_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) { - blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); - blk_stat_init(&cpu_stat[bucket]); + blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); + blk_rq_stat_init(&cpu_stat[bucket]); } } @@ -143,7 +143,7 @@ void blk_stat_add_callback(struct request_queue *q, cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) - blk_stat_init(&cpu_stat[bucket]); + blk_rq_stat_init(&cpu_stat[bucket]); } spin_lock(&q->stats->lock); diff --git a/block/blk-stat.h b/block/blk-stat.h index 78399cdde9c9..f4a1568e81a4 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -159,4 +159,8 @@ static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb, mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); } +void blk_rq_stat_add(struct blk_rq_stat *, u64); +void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); +void blk_rq_stat_init(struct blk_rq_stat *); + #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 94987b1f69e1..bb109bb0a055 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -422,16 +422,16 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) { - if (!q->rq_wb) + if (!wbt_rq_qos(q)) return -EINVAL; - return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); } static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, size_t count) { - struct rq_wb *rwb; + struct rq_qos *rqos; ssize_t ret; s64 val; @@ -441,23 +441,21 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, if (val < -1) return -EINVAL; - rwb = q->rq_wb; - if (!rwb) { + rqos = wbt_rq_qos(q); + if (!rqos) { ret = wbt_init(q); if (ret) return ret; } - rwb = q->rq_wb; if (val == -1) - rwb->min_lat_nsec = wbt_default_latency_nsec(q); + val = wbt_default_latency_nsec(q); else if (val >= 0) - rwb->min_lat_nsec = val * 1000ULL; + val *= 1000ULL; - if (rwb->enable_state == WBT_STATE_ON_DEFAULT) - rwb->enable_state = WBT_STATE_ON_MANUAL; + wbt_set_min_lat(q, val); - wbt_update_limits(rwb); + wbt_update_limits(q); return count; } @@ -804,6 +802,21 @@ static void __blk_release_queue(struct work_struct *work) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); + if (!blk_queue_dead(q)) { + /* + * Last reference was dropped without having called + * blk_cleanup_queue(). + */ + WARN_ONCE(blk_queue_init_done(q), + "request queue %p has been registered but blk_cleanup_queue() has not been called for that queue\n", + q); + blk_exit_queue(q); + } + + WARN(blk_queue_root_blkg(q), + "request queue %p is being released but it has not yet been removed from the blkcg controller\n", + q); + blk_free_queue_stats(q->stats); blk_exit_rl(q, &q->root_rl); @@ -964,7 +977,7 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); - wbt_exit(q); + rq_qos_exit(q); mutex_lock(&q->sysfs_lock); if (q->request_fn || (q->mq_ops && q->elevator)) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 82282e6fdcf8..a3eede00d302 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) struct throtl_grp *tg = blkg_to_tg(blkg); if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || - tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) + tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { low_valid = true; + break; + } } rcu_read_unlock(); @@ -920,12 +922,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, } /* Calc approx time to dispatch */ - jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1; - - if (jiffy_wait > jiffy_elapsed) - jiffy_wait = jiffy_wait - jiffy_elapsed; - else - jiffy_wait = 1; + jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; if (wait) *wait = jiffy_wait; @@ -2132,12 +2129,8 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) { #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (bio->bi_css) { - if (bio->bi_cg_private) - blkg_put(tg_to_blkg(bio->bi_cg_private)); - bio->bi_cg_private = tg; - blkg_get(tg_to_blkg(tg)); - } + if (bio->bi_css) + bio_associate_blkg(bio, tg_to_blkg(tg)); bio_issue_init(&bio->bi_issue, bio_sectors(bio)); #endif } @@ -2285,6 +2278,7 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns) void blk_throtl_bio_endio(struct bio *bio) { + struct blkcg_gq *blkg; struct throtl_grp *tg; u64 finish_time_ns; unsigned long finish_time; @@ -2292,20 +2286,18 @@ void blk_throtl_bio_endio(struct bio *bio) unsigned long lat; int rw = bio_data_dir(bio); - tg = bio->bi_cg_private; - if (!tg) + blkg = bio->bi_blkg; + if (!blkg) return; - bio->bi_cg_private = NULL; + tg = blkg_to_tg(blkg); finish_time_ns = ktime_get_ns(); tg->last_finish_time = finish_time_ns >> 10; start_time = bio_issue_time(&bio->bi_issue) >> 10; finish_time = __bio_issue_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) { - blkg_put(tg_to_blkg(tg)); + if (!start_time || finish_time <= start_time) return; - } lat = finish_time - start_time; /* this is only for bio based driver */ @@ -2334,8 +2326,6 @@ void blk_throtl_bio_endio(struct bio *bio) tg->bio_cnt /= 2; tg->bad_bio_cnt /= 2; } - - blkg_put(tg_to_blkg(tg)); } #endif diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 4f89b28fa652..1d94a20374fc 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,6 +25,7 @@ #include <linux/swap.h> #include "blk-wbt.h" +#include "blk-rq-qos.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> @@ -78,28 +79,6 @@ static inline bool rwb_enabled(struct rq_wb *rwb) return rwb && rwb->wb_normal != 0; } -/* - * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, - * false if 'v' + 1 would be bigger than 'below'. - */ -static bool atomic_inc_below(atomic_t *v, int below) -{ - int cur = atomic_read(v); - - for (;;) { - int old; - - if (cur >= below) - return false; - old = atomic_cmpxchg(v, cur, cur + 1); - if (old == cur) - break; - cur = old; - } - - return true; -} - static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) { if (rwb_enabled(rwb)) { @@ -116,7 +95,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; + struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -144,8 +123,9 @@ static void rwb_wake_all(struct rq_wb *rwb) } } -void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) +static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) { + struct rq_wb *rwb = RQWB(rqos); struct rq_wait *rqw; int inflight, limit; @@ -186,7 +166,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) int diff = limit - inflight; if (!inflight || diff >= rwb->wb_background / 2) - wake_up_all(&rqw->wait); + wake_up(&rqw->wait); } } @@ -194,10 +174,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed. */ -void wbt_done(struct rq_wb *rwb, struct request *rq) +static void wbt_done(struct rq_qos *rqos, struct request *rq) { - if (!rwb) - return; + struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { if (rwb->sync_cookie == rq) { @@ -209,72 +188,11 @@ void wbt_done(struct rq_wb *rwb, struct request *rq) wb_timestamp(rwb, &rwb->last_comp); } else { WARN_ON_ONCE(rq == rwb->sync_cookie); - __wbt_done(rwb, wbt_flags(rq)); + __wbt_done(rqos, wbt_flags(rq)); } wbt_clear_state(rq); } -/* - * Return true, if we can't increase the depth further by scaling - */ -static bool calc_wb_limits(struct rq_wb *rwb) -{ - unsigned int depth; - bool ret = false; - - if (!rwb->min_lat_nsec) { - rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; - return false; - } - - /* - * For QD=1 devices, this is a special case. It's important for those - * to have one request ready when one completes, so force a depth of - * 2 for those devices. On the backend, it'll be a depth of 1 anyway, - * since the device can't have more than that in flight. If we're - * scaling down, then keep a setting of 1/1/1. - */ - if (rwb->queue_depth == 1) { - if (rwb->scale_step > 0) - rwb->wb_max = rwb->wb_normal = 1; - else { - rwb->wb_max = rwb->wb_normal = 2; - ret = true; - } - rwb->wb_background = 1; - } else { - /* - * scale_step == 0 is our default state. If we have suffered - * latency spikes, step will be > 0, and we shrink the - * allowed write depths. If step is < 0, we're only doing - * writes, and we allow a temporarily higher depth to - * increase performance. - */ - depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); - if (rwb->scale_step > 0) - depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); - else if (rwb->scale_step < 0) { - unsigned int maxd = 3 * rwb->queue_depth / 4; - - depth = 1 + ((depth - 1) << -rwb->scale_step); - if (depth > maxd) { - depth = maxd; - ret = true; - } - } - - /* - * Set our max/normal/bg queue depths based on how far - * we have scaled down (->scale_step). - */ - rwb->wb_max = depth; - rwb->wb_normal = (rwb->wb_max + 1) / 2; - rwb->wb_background = (rwb->wb_max + 3) / 4; - } - - return ret; -} - static inline bool stat_sample_valid(struct blk_rq_stat *stat) { /* @@ -307,7 +225,8 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; /* @@ -351,7 +270,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) return LAT_EXCEEDED; } - if (rwb->scale_step) + if (rqd->scale_step) trace_wbt_stat(bdi, stat); return LAT_OK; @@ -359,58 +278,48 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct rq_depth *rqd = &rwb->rq_depth; - trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, - rwb->wb_background, rwb->wb_normal, rwb->wb_max); + trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rqd->max_depth); } -static void scale_up(struct rq_wb *rwb) +static void calc_wb_limits(struct rq_wb *rwb) { - /* - * Hit max in previous round, stop here - */ - if (rwb->scaled_max) - return; + if (rwb->min_lat_nsec == 0) { + rwb->wb_normal = rwb->wb_background = 0; + } else if (rwb->rq_depth.max_depth <= 2) { + rwb->wb_normal = rwb->rq_depth.max_depth; + rwb->wb_background = 1; + } else { + rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; + rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; + } +} - rwb->scale_step--; +static void scale_up(struct rq_wb *rwb) +{ + rq_depth_scale_up(&rwb->rq_depth); + calc_wb_limits(rwb); rwb->unknown_cnt = 0; - - rwb->scaled_max = calc_wb_limits(rwb); - - rwb_wake_all(rwb); - - rwb_trace_step(rwb, "step up"); + rwb_trace_step(rwb, "scale up"); } -/* - * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we - * had a latency violation. - */ static void scale_down(struct rq_wb *rwb, bool hard_throttle) { - /* - * Stop scaling down when we've hit the limit. This also prevents - * ->scale_step from going to crazy values, if the device can't - * keep up. - */ - if (rwb->wb_max == 1) - return; - - if (rwb->scale_step < 0 && hard_throttle) - rwb->scale_step = 0; - else - rwb->scale_step++; - - rwb->scaled_max = false; - rwb->unknown_cnt = 0; + rq_depth_scale_down(&rwb->rq_depth, hard_throttle); calc_wb_limits(rwb); - rwb_trace_step(rwb, "step down"); + rwb->unknown_cnt = 0; + rwb_wake_all(rwb); + rwb_trace_step(rwb, "scale down"); } static void rwb_arm_timer(struct rq_wb *rwb) { - if (rwb->scale_step > 0) { + struct rq_depth *rqd = &rwb->rq_depth; + + if (rqd->scale_step > 0) { /* * We should speed this up, using some variant of a fast * integer inverse square root calculation. Since we only do @@ -418,7 +327,7 @@ static void rwb_arm_timer(struct rq_wb *rwb) * though. */ rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, - int_sqrt((rwb->scale_step + 1) << 8)); + int_sqrt((rqd->scale_step + 1) << 8)); } else { /* * For step < 0, we don't want to increase/decrease the @@ -433,12 +342,13 @@ static void rwb_arm_timer(struct rq_wb *rwb) static void wb_timer_fn(struct blk_stat_callback *cb) { struct rq_wb *rwb = cb->data; + struct rq_depth *rqd = &rwb->rq_depth; unsigned int inflight = wbt_inflight(rwb); int status; status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, + trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, inflight); /* @@ -469,9 +379,9 @@ static void wb_timer_fn(struct blk_stat_callback *cb) * currently don't have a valid read/write sample. For that * case, slowly return to center state (step == 0). */ - if (rwb->scale_step > 0) + if (rqd->scale_step > 0) scale_up(rwb); - else if (rwb->scale_step < 0) + else if (rqd->scale_step < 0) scale_down(rwb, false); break; default: @@ -481,19 +391,50 @@ static void wb_timer_fn(struct blk_stat_callback *cb) /* * Re-arm timer, if we have IO in flight */ - if (rwb->scale_step || inflight) + if (rqd->scale_step || inflight) rwb_arm_timer(rwb); } -void wbt_update_limits(struct rq_wb *rwb) +static void __wbt_update_limits(struct rq_wb *rwb) { - rwb->scale_step = 0; - rwb->scaled_max = false; + struct rq_depth *rqd = &rwb->rq_depth; + + rqd->scale_step = 0; + rqd->scaled_max = false; + + rq_depth_calc_max_depth(rqd); calc_wb_limits(rwb); rwb_wake_all(rwb); } +void wbt_update_limits(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return; + __wbt_update_limits(RQWB(rqos)); +} + +u64 wbt_get_min_lat(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return 0; + return RQWB(rqos)->min_lat_nsec; +} + +void wbt_set_min_lat(struct request_queue *q, u64 val) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return; + RQWB(rqos)->min_lat_nsec = val; + RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + __wbt_update_limits(RQWB(rqos)); +} + + static bool close_io(struct rq_wb *rwb) { const unsigned long now = jiffies; @@ -520,7 +461,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) * IO for a bit. */ if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) - limit = rwb->wb_max; + limit = rwb->rq_depth.max_depth; else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, @@ -533,30 +474,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) return limit; } -static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, - wait_queue_entry_t *wait, unsigned long rw) -{ - /* - * inc it here even if disabled, since we'll dec it at completion. - * this only happens if the task was sleeping in __wbt_wait(), - * and someone turned it off at the same time. - */ - if (!rwb_enabled(rwb)) { - atomic_inc(&rqw->inflight); - return true; - } - - /* - * If the waitqueue is already active and we are not the next - * in line to be woken up, wait for our turn. - */ - if (waitqueue_active(&rqw->wait) && - rqw->wait.head.next != &wait->entry) - return false; - - return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); -} - /* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again. @@ -567,16 +484,32 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, __acquires(lock) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); - DEFINE_WAIT(wait); + DECLARE_WAITQUEUE(wait, current); - if (may_queue(rwb, rqw, &wait, rw)) + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __wbt_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rqw->inflight); return; + } + if (!waitqueue_active(&rqw->wait) + && rq_wait_inc_below(rqw, get_limit(rwb, rw))) + return; + + add_wait_queue_exclusive(&rqw->wait, &wait); do { - prepare_to_wait_exclusive(&rqw->wait, &wait, - TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); - if (may_queue(rwb, rqw, &wait, rw)) + if (!rwb_enabled(rwb)) { + atomic_inc(&rqw->inflight); + break; + } + + if (rq_wait_inc_below(rqw, get_limit(rwb, rw))) break; if (lock) { @@ -587,7 +520,8 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, io_schedule(); } while (1); - finish_wait(&rqw->wait, &wait); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&rqw->wait, &wait); } static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) @@ -608,43 +542,72 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) } } +static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) +{ + enum wbt_flags flags = 0; + + if (bio_op(bio) == REQ_OP_READ) { + flags = WBT_READ; + } else if (wbt_should_throttle(rwb, bio)) { + if (current_is_kswapd()) + flags |= WBT_KSWAPD; + if (bio_op(bio) == REQ_OP_DISCARD) + flags |= WBT_DISCARD; + flags |= WBT_TRACKED; + } + return flags; +} + +static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); + __wbt_done(rqos, flags); +} + /* * Returns true if the IO request should be accounted, false if not. * May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. */ -enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) { - enum wbt_flags ret = 0; + struct rq_wb *rwb = RQWB(rqos); + enum wbt_flags flags; if (!rwb_enabled(rwb)) - return 0; + return; - if (bio_op(bio) == REQ_OP_READ) - ret = WBT_READ; + flags = bio_to_wbt_flags(rwb, bio); if (!wbt_should_throttle(rwb, bio)) { - if (ret & WBT_READ) + if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); - return ret; + return; } if (current_is_kswapd()) - ret |= WBT_KSWAPD; + flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) - ret |= WBT_DISCARD; + flags |= WBT_DISCARD; - __wbt_wait(rwb, ret, bio->bi_opf, lock); + __wbt_wait(rwb, flags, bio->bi_opf, lock); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); +} - return ret | WBT_TRACKED; +static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); } -void wbt_issue(struct rq_wb *rwb, struct request *rq) +void wbt_issue(struct rq_qos *rqos, struct request *rq) { + struct rq_wb *rwb = RQWB(rqos); + if (!rwb_enabled(rwb)) return; @@ -661,8 +624,9 @@ void wbt_issue(struct rq_wb *rwb, struct request *rq) } } -void wbt_requeue(struct rq_wb *rwb, struct request *rq) +void wbt_requeue(struct rq_qos *rqos, struct request *rq) { + struct rq_wb *rwb = RQWB(rqos); if (!rwb_enabled(rwb)) return; if (rq == rwb->sync_cookie) { @@ -671,39 +635,30 @@ void wbt_requeue(struct rq_wb *rwb, struct request *rq) } } -void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) { - if (rwb) { - rwb->queue_depth = depth; - wbt_update_limits(rwb); + struct rq_qos *rqos = wbt_rq_qos(q); + if (rqos) { + RQWB(rqos)->rq_depth.queue_depth = depth; + __wbt_update_limits(RQWB(rqos)); } } -void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) +void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) { - if (rwb) - rwb->wc = write_cache_on; + struct rq_qos *rqos = wbt_rq_qos(q); + if (rqos) + RQWB(rqos)->wc = write_cache_on; } /* - * Disable wbt, if enabled by default. - */ -void wbt_disable_default(struct request_queue *q) -{ - struct rq_wb *rwb = q->rq_wb; - - if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) - wbt_exit(q); -} -EXPORT_SYMBOL_GPL(wbt_disable_default); - -/* * Enable wbt if defaults are configured that way */ void wbt_enable_default(struct request_queue *q) { + struct rq_qos *rqos = wbt_rq_qos(q); /* Throttling already enabled? */ - if (q->rq_wb) + if (rqos) return; /* Queue not registered? Maybe shutting down... */ @@ -741,6 +696,42 @@ static int wbt_data_dir(const struct request *rq) return -1; } +static void wbt_exit(struct rq_qos *rqos) +{ + struct rq_wb *rwb = RQWB(rqos); + struct request_queue *q = rqos->q; + + blk_stat_remove_callback(q, rwb->cb); + blk_stat_free_callback(rwb->cb); + kfree(rwb); +} + +/* + * Disable wbt, if enabled by default. + */ +void wbt_disable_default(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_wb *rwb; + if (!rqos) + return; + rwb = RQWB(rqos); + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) + rwb->wb_normal = 0; +} +EXPORT_SYMBOL_GPL(wbt_disable_default); + + +static struct rq_qos_ops wbt_rqos_ops = { + .throttle = wbt_wait, + .issue = wbt_issue, + .track = wbt_track, + .requeue = wbt_requeue, + .done = wbt_done, + .cleanup = wbt_cleanup, + .exit = wbt_exit, +}; + int wbt_init(struct request_queue *q) { struct rq_wb *rwb; @@ -756,39 +747,29 @@ int wbt_init(struct request_queue *q) return -ENOMEM; } - for (i = 0; i < WBT_NUM_RWQ; i++) { - atomic_set(&rwb->rq_wait[i].inflight, 0); - init_waitqueue_head(&rwb->rq_wait[i].wait); - } + for (i = 0; i < WBT_NUM_RWQ; i++) + rq_wait_init(&rwb->rq_wait[i]); + rwb->rqos.id = RQ_QOS_WBT; + rwb->rqos.ops = &wbt_rqos_ops; + rwb->rqos.q = q; rwb->last_comp = rwb->last_issue = jiffies; - rwb->queue = q; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - wbt_update_limits(rwb); + rwb->wc = 1; + rwb->rq_depth.default_depth = RWB_DEF_DEPTH; + __wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. */ - q->rq_wb = rwb; + rq_qos_add(q, &rwb->rqos); blk_stat_add_callback(q, rwb->cb); rwb->min_lat_nsec = wbt_default_latency_nsec(q); - wbt_set_queue_depth(rwb, blk_queue_depth(q)); - wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + wbt_set_queue_depth(q, blk_queue_depth(q)); + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); return 0; } - -void wbt_exit(struct request_queue *q) -{ - struct rq_wb *rwb = q->rq_wb; - - if (rwb) { - blk_stat_remove_callback(q, rwb->cb); - blk_stat_free_callback(rwb->cb); - q->rq_wb = NULL; - kfree(rwb); - } -} diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 300df531d0a6..f47218d5b3b2 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -9,6 +9,7 @@ #include <linux/ktime.h> #include "blk-stat.h" +#include "blk-rq-qos.h" enum wbt_flags { WBT_TRACKED = 1, /* write, tracked for throttling */ @@ -35,20 +36,12 @@ enum { WBT_STATE_ON_MANUAL = 2, }; -struct rq_wait { - wait_queue_head_t wait; - atomic_t inflight; -}; - struct rq_wb { /* * Settings that govern how we throttle */ unsigned int wb_background; /* background writeback */ unsigned int wb_normal; /* normal writeback */ - unsigned int wb_max; /* max throughput writeback */ - int scale_step; - bool scaled_max; short enable_state; /* WBT_STATE_* */ @@ -67,15 +60,20 @@ struct rq_wb { void *sync_cookie; unsigned int wc; - unsigned int queue_depth; unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; - struct request_queue *queue; + struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; }; +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + static inline unsigned int wbt_inflight(struct rq_wb *rwb) { unsigned int i, ret = 0; @@ -86,26 +84,19 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) return ret; } -#ifdef CONFIG_BLK_WBT -static inline void wbt_track(struct request *rq, enum wbt_flags flags) -{ - rq->wbt_flags |= flags; -} +#ifdef CONFIG_BLK_WBT -void __wbt_done(struct rq_wb *, enum wbt_flags); -void wbt_done(struct rq_wb *, struct request *); -enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); int wbt_init(struct request_queue *); -void wbt_exit(struct request_queue *); -void wbt_update_limits(struct rq_wb *); -void wbt_requeue(struct rq_wb *, struct request *); -void wbt_issue(struct rq_wb *, struct request *); +void wbt_update_limits(struct request_queue *); void wbt_disable_default(struct request_queue *); void wbt_enable_default(struct request_queue *); -void wbt_set_queue_depth(struct rq_wb *, unsigned int); -void wbt_set_write_cache(struct rq_wb *, bool); +u64 wbt_get_min_lat(struct request_queue *q); +void wbt_set_min_lat(struct request_queue *q, u64 val); + +void wbt_set_queue_depth(struct request_queue *, unsigned int); +void wbt_set_write_cache(struct request_queue *, bool); u64 wbt_default_latency_nsec(struct request_queue *); @@ -114,43 +105,30 @@ u64 wbt_default_latency_nsec(struct request_queue *); static inline void wbt_track(struct request *rq, enum wbt_flags flags) { } -static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) -{ -} -static inline void wbt_done(struct rq_wb *rwb, struct request *rq) -{ -} -static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, - spinlock_t *lock) -{ - return 0; -} static inline int wbt_init(struct request_queue *q) { return -EINVAL; } -static inline void wbt_exit(struct request_queue *q) -{ -} -static inline void wbt_update_limits(struct rq_wb *rwb) +static inline void wbt_update_limits(struct request_queue *q) { } -static inline void wbt_requeue(struct rq_wb *rwb, struct request *rq) +static inline void wbt_disable_default(struct request_queue *q) { } -static inline void wbt_issue(struct rq_wb *rwb, struct request *rq) +static inline void wbt_enable_default(struct request_queue *q) { } -static inline void wbt_disable_default(struct request_queue *q) +static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth) { } -static inline void wbt_enable_default(struct request_queue *q) +static inline void wbt_set_write_cache(struct request_queue *q, bool wc) { } -static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +static inline u64 wbt_get_min_lat(struct request_queue *q) { + return 0; } -static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) +static inline void wbt_set_min_lat(struct request_queue *q, u64 val) { } static inline u64 wbt_default_latency_nsec(struct request_queue *q) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 51000914e23f..c461cf63f1f4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev, /* Get header in the first page */ ofst = 0; if (!nr_rep) { - hdr = (struct blk_zone_report_hdr *) addr; + hdr = addr; nr_rep = hdr->nr_zones; ofst = sizeof(struct blk_zone_report_hdr); } diff --git a/block/blk.h b/block/blk.h index 8d23aea96ce9..d4d67e948920 100644 --- a/block/blk.h +++ b/block/blk.h @@ -130,6 +130,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); void blk_exit_rl(struct request_queue *q, struct request_list *rl); +void blk_exit_queue(struct request_queue *q); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); @@ -412,4 +413,10 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) extern void blk_drain_queue(struct request_queue *q); +#ifdef CONFIG_BLK_CGROUP_IOLATENCY +extern int blk_iolatency_init(struct request_queue *q); +#else +static inline int blk_iolatency_init(struct request_queue *q) { return 0; } +#endif + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index fd31347b7836..bc63b3a2d18c 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -195,6 +195,73 @@ static void bounce_end_io_read_isa(struct bio *bio) __bounce_end_io_read(bio, &isa_page_pool); } +static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + /* + * Pre immutable biovecs, __bio_clone() used to just do a memcpy from + * bio_src->bi_io_vec to bio->bi_io_vec. + * + * We can't do that anymore, because: + * + * - The point of cloning the biovec is to produce a bio with a biovec + * the caller can modify: bi_idx and bi_bvec_done should be 0. + * + * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * we tried to clone the whole thing bio_alloc_bioset() would fail. + * But the clone should succeed as long as the number of biovecs we + * actually need to allocate is fewer than BIO_MAX_PAGES. + * + * - Lastly, bi_vcnt should not be looked at or relied upon by code + * that does not own the bio - reason being drivers don't use it for + * iterating over the biovec anymore, so expecting it to be kept up + * to date (i.e. for clones that share the parent biovec) is just + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. + */ + + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); + if (!bio) + return NULL; + bio->bi_disk = bio_src->bi_disk; + bio->bi_opf = bio_src->bi_opf; + bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + break; + case REQ_OP_WRITE_SAME: + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + break; + default: + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + break; + } + + if (bio_integrity(bio_src)) { + int ret; + + ret = bio_integrity_clone(bio, bio_src, gfp_mask); + if (ret < 0) { + bio_put(bio); + return NULL; + } + } + + bio_clone_blkcg_association(bio, bio_src); + + return bio; +} + static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, mempool_t *pool) { @@ -222,7 +289,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, generic_make_request(*bio_orig); *bio_orig = bio; } - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL : + bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : &bounce_bio_set); bio_for_each_segment_all(to, bio, i) { diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 9419def8c017..f3501cdaf1a6 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -48,9 +48,8 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, job->request_len = hdr->request_len; job->request = memdup_user(uptr64(hdr->request), hdr->request_len); - if (IS_ERR(job->request)) - return PTR_ERR(job->request); - return 0; + + return PTR_ERR_OR_ZERO(job->request); } static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) diff --git a/block/bsg.c b/block/bsg.c index 3da540faf673..db588add6ba6 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -13,11 +13,9 @@ #include <linux/init.h> #include <linux/file.h> #include <linux/blkdev.h> -#include <linux/poll.h> #include <linux/cdev.h> #include <linux/jiffies.h> #include <linux/percpu.h> -#include <linux/uio.h> #include <linux/idr.h> #include <linux/bsg.h> #include <linux/slab.h> @@ -38,21 +36,10 @@ struct bsg_device { struct request_queue *queue; spinlock_t lock; - struct list_head busy_list; - struct list_head done_list; struct hlist_node dev_list; atomic_t ref_count; - int queued_cmds; - int done_cmds; - wait_queue_head_t wq_done; - wait_queue_head_t wq_free; char name[20]; int max_queue; - unsigned long flags; -}; - -enum { - BSG_F_BLOCK = 1, }; #define BSG_DEFAULT_CMDS 64 @@ -67,64 +54,6 @@ static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE]; static struct class *bsg_class; static int bsg_major; -static struct kmem_cache *bsg_cmd_cachep; - -/* - * our internal command type - */ -struct bsg_command { - struct bsg_device *bd; - struct list_head list; - struct request *rq; - struct bio *bio; - struct bio *bidi_bio; - int err; - struct sg_io_v4 hdr; -}; - -static void bsg_free_command(struct bsg_command *bc) -{ - struct bsg_device *bd = bc->bd; - unsigned long flags; - - kmem_cache_free(bsg_cmd_cachep, bc); - - spin_lock_irqsave(&bd->lock, flags); - bd->queued_cmds--; - spin_unlock_irqrestore(&bd->lock, flags); - - wake_up(&bd->wq_free); -} - -static struct bsg_command *bsg_alloc_command(struct bsg_device *bd) -{ - struct bsg_command *bc = ERR_PTR(-EINVAL); - - spin_lock_irq(&bd->lock); - - if (bd->queued_cmds >= bd->max_queue) - goto out; - - bd->queued_cmds++; - spin_unlock_irq(&bd->lock); - - bc = kmem_cache_zalloc(bsg_cmd_cachep, GFP_KERNEL); - if (unlikely(!bc)) { - spin_lock_irq(&bd->lock); - bd->queued_cmds--; - bc = ERR_PTR(-ENOMEM); - goto out; - } - - bc->bd = bd; - INIT_LIST_HEAD(&bc->list); - bsg_dbg(bd, "returning free cmd %p\n", bc); - return bc; -out: - spin_unlock_irq(&bd->lock); - return bc; -} - static inline struct hlist_head *bsg_dev_idx_hash(int index) { return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; @@ -285,101 +214,6 @@ out: return ERR_PTR(ret); } -/* - * async completion call-back from the block layer, when scsi/ide/whatever - * calls end_that_request_last() on a request - */ -static void bsg_rq_end_io(struct request *rq, blk_status_t status) -{ - struct bsg_command *bc = rq->end_io_data; - struct bsg_device *bd = bc->bd; - unsigned long flags; - - bsg_dbg(bd, "finished rq %p bc %p, bio %p\n", - rq, bc, bc->bio); - - bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); - - spin_lock_irqsave(&bd->lock, flags); - list_move_tail(&bc->list, &bd->done_list); - bd->done_cmds++; - spin_unlock_irqrestore(&bd->lock, flags); - - wake_up(&bd->wq_done); -} - -/* - * do final setup of a 'bc' and submit the matching 'rq' to the block - * layer for io - */ -static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, - struct bsg_command *bc, struct request *rq) -{ - int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL)); - - /* - * add bc command to busy queue and submit rq for io - */ - bc->rq = rq; - bc->bio = rq->bio; - if (rq->next_rq) - bc->bidi_bio = rq->next_rq->bio; - bc->hdr.duration = jiffies; - spin_lock_irq(&bd->lock); - list_add_tail(&bc->list, &bd->busy_list); - spin_unlock_irq(&bd->lock); - - bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc); - - rq->end_io_data = bc; - blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); -} - -static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd) -{ - struct bsg_command *bc = NULL; - - spin_lock_irq(&bd->lock); - if (bd->done_cmds) { - bc = list_first_entry(&bd->done_list, struct bsg_command, list); - list_del(&bc->list); - bd->done_cmds--; - } - spin_unlock_irq(&bd->lock); - - return bc; -} - -/* - * Get a finished command from the done list - */ -static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) -{ - struct bsg_command *bc; - int ret; - - do { - bc = bsg_next_done_cmd(bd); - if (bc) - break; - - if (!test_bit(BSG_F_BLOCK, &bd->flags)) { - bc = ERR_PTR(-EAGAIN); - break; - } - - ret = wait_event_interruptible(bd->wq_done, bd->done_cmds); - if (ret) { - bc = ERR_PTR(-ERESTARTSYS); - break; - } - } while (1); - - bsg_dbg(bd, "returning done %p\n", bc); - - return bc; -} - static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct bio *bio, struct bio *bidi_bio) { @@ -398,234 +232,6 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, return ret; } -static bool bsg_complete(struct bsg_device *bd) -{ - bool ret = false; - bool spin; - - do { - spin_lock_irq(&bd->lock); - - BUG_ON(bd->done_cmds > bd->queued_cmds); - - /* - * All commands consumed. - */ - if (bd->done_cmds == bd->queued_cmds) - ret = true; - - spin = !test_bit(BSG_F_BLOCK, &bd->flags); - - spin_unlock_irq(&bd->lock); - } while (!ret && spin); - - return ret; -} - -static int bsg_complete_all_commands(struct bsg_device *bd) -{ - struct bsg_command *bc; - int ret, tret; - - bsg_dbg(bd, "entered\n"); - - /* - * wait for all commands to complete - */ - io_wait_event(bd->wq_done, bsg_complete(bd)); - - /* - * discard done commands - */ - ret = 0; - do { - spin_lock_irq(&bd->lock); - if (!bd->queued_cmds) { - spin_unlock_irq(&bd->lock); - break; - } - spin_unlock_irq(&bd->lock); - - bc = bsg_get_done_cmd(bd); - if (IS_ERR(bc)) - break; - - tret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio, - bc->bidi_bio); - if (!ret) - ret = tret; - - bsg_free_command(bc); - } while (1); - - return ret; -} - -static int -__bsg_read(char __user *buf, size_t count, struct bsg_device *bd, - const struct iovec *iov, ssize_t *bytes_read) -{ - struct bsg_command *bc; - int nr_commands, ret; - - if (count % sizeof(struct sg_io_v4)) - return -EINVAL; - - ret = 0; - nr_commands = count / sizeof(struct sg_io_v4); - while (nr_commands) { - bc = bsg_get_done_cmd(bd); - if (IS_ERR(bc)) { - ret = PTR_ERR(bc); - break; - } - - /* - * this is the only case where we need to copy data back - * after completing the request. so do that here, - * bsg_complete_work() cannot do that for us - */ - ret = blk_complete_sgv4_hdr_rq(bc->rq, &bc->hdr, bc->bio, - bc->bidi_bio); - - if (copy_to_user(buf, &bc->hdr, sizeof(bc->hdr))) - ret = -EFAULT; - - bsg_free_command(bc); - - if (ret) - break; - - buf += sizeof(struct sg_io_v4); - *bytes_read += sizeof(struct sg_io_v4); - nr_commands--; - } - - return ret; -} - -static inline void bsg_set_block(struct bsg_device *bd, struct file *file) -{ - if (file->f_flags & O_NONBLOCK) - clear_bit(BSG_F_BLOCK, &bd->flags); - else - set_bit(BSG_F_BLOCK, &bd->flags); -} - -/* - * Check if the error is a "real" error that we should return. - */ -static inline int err_block_err(int ret) -{ - if (ret && ret != -ENOSPC && ret != -ENODATA && ret != -EAGAIN) - return 1; - - return 0; -} - -static ssize_t -bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct bsg_device *bd = file->private_data; - int ret; - ssize_t bytes_read; - - bsg_dbg(bd, "read %zd bytes\n", count); - - bsg_set_block(bd, file); - - bytes_read = 0; - ret = __bsg_read(buf, count, bd, NULL, &bytes_read); - *ppos = bytes_read; - - if (!bytes_read || err_block_err(ret)) - bytes_read = ret; - - return bytes_read; -} - -static int __bsg_write(struct bsg_device *bd, const char __user *buf, - size_t count, ssize_t *bytes_written, fmode_t mode) -{ - struct bsg_command *bc; - struct request *rq; - int ret, nr_commands; - - if (count % sizeof(struct sg_io_v4)) - return -EINVAL; - - nr_commands = count / sizeof(struct sg_io_v4); - rq = NULL; - bc = NULL; - ret = 0; - while (nr_commands) { - struct request_queue *q = bd->queue; - - bc = bsg_alloc_command(bd); - if (IS_ERR(bc)) { - ret = PTR_ERR(bc); - bc = NULL; - break; - } - - if (copy_from_user(&bc->hdr, buf, sizeof(bc->hdr))) { - ret = -EFAULT; - break; - } - - /* - * get a request, fill in the blanks, and add to request queue - */ - rq = bsg_map_hdr(bd->queue, &bc->hdr, mode); - if (IS_ERR(rq)) { - ret = PTR_ERR(rq); - rq = NULL; - break; - } - - bsg_add_command(bd, q, bc, rq); - bc = NULL; - rq = NULL; - nr_commands--; - buf += sizeof(struct sg_io_v4); - *bytes_written += sizeof(struct sg_io_v4); - } - - if (bc) - bsg_free_command(bc); - - return ret; -} - -static ssize_t -bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) -{ - struct bsg_device *bd = file->private_data; - ssize_t bytes_written; - int ret; - - bsg_dbg(bd, "write %zd bytes\n", count); - - if (unlikely(uaccess_kernel())) - return -EINVAL; - - bsg_set_block(bd, file); - - bytes_written = 0; - ret = __bsg_write(bd, buf, count, &bytes_written, file->f_mode); - - *ppos = bytes_written; - - /* - * return bytes written on non-fatal errors - */ - if (!bytes_written || err_block_err(ret)) - bytes_written = ret; - - bsg_dbg(bd, "returning %zd\n", bytes_written); - return bytes_written; -} - static struct bsg_device *bsg_alloc_device(void) { struct bsg_device *bd; @@ -635,29 +241,20 @@ static struct bsg_device *bsg_alloc_device(void) return NULL; spin_lock_init(&bd->lock); - bd->max_queue = BSG_DEFAULT_CMDS; - - INIT_LIST_HEAD(&bd->busy_list); - INIT_LIST_HEAD(&bd->done_list); INIT_HLIST_NODE(&bd->dev_list); - - init_waitqueue_head(&bd->wq_free); - init_waitqueue_head(&bd->wq_done); return bd; } static int bsg_put_device(struct bsg_device *bd) { - int ret = 0, do_free; struct request_queue *q = bd->queue; mutex_lock(&bsg_mutex); - do_free = atomic_dec_and_test(&bd->ref_count); - if (!do_free) { + if (!atomic_dec_and_test(&bd->ref_count)) { mutex_unlock(&bsg_mutex); - goto out; + return 0; } hlist_del(&bd->dev_list); @@ -668,20 +265,9 @@ static int bsg_put_device(struct bsg_device *bd) /* * close can always block */ - set_bit(BSG_F_BLOCK, &bd->flags); - - /* - * correct error detection baddies here again. it's the responsibility - * of the app to properly reap commands before close() if it wants - * fool-proof error detection - */ - ret = bsg_complete_all_commands(bd); - kfree(bd); -out: - if (do_free) - blk_put_queue(q); - return ret; + blk_put_queue(q); + return 0; } static struct bsg_device *bsg_add_device(struct inode *inode, @@ -704,8 +290,6 @@ static struct bsg_device *bsg_add_device(struct inode *inode, bd->queue = rq; - bsg_set_block(bd, file); - atomic_set(&bd->ref_count, 1); hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); @@ -779,24 +363,6 @@ static int bsg_release(struct inode *inode, struct file *file) return bsg_put_device(bd); } -static __poll_t bsg_poll(struct file *file, poll_table *wait) -{ - struct bsg_device *bd = file->private_data; - __poll_t mask = 0; - - poll_wait(file, &bd->wq_done, wait); - poll_wait(file, &bd->wq_free, wait); - - spin_lock_irq(&bd->lock); - if (!list_empty(&bd->done_list)) - mask |= EPOLLIN | EPOLLRDNORM; - if (bd->queued_cmds < bd->max_queue) - mask |= EPOLLOUT; - spin_unlock_irq(&bd->lock); - - return mask; -} - static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct bsg_device *bd = file->private_data; @@ -870,9 +436,6 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } static const struct file_operations bsg_fops = { - .read = bsg_read, - .write = bsg_write, - .poll = bsg_poll, .open = bsg_open, .release = bsg_release, .unlocked_ioctl = bsg_ioctl, @@ -977,21 +540,12 @@ static int __init bsg_init(void) int ret, i; dev_t devid; - bsg_cmd_cachep = kmem_cache_create("bsg_cmd", - sizeof(struct bsg_command), 0, 0, NULL); - if (!bsg_cmd_cachep) { - printk(KERN_ERR "bsg: failed creating slab cache\n"); - return -ENOMEM; - } - for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++) INIT_HLIST_HEAD(&bsg_device_list[i]); bsg_class = class_create(THIS_MODULE, "bsg"); - if (IS_ERR(bsg_class)) { - ret = PTR_ERR(bsg_class); - goto destroy_kmemcache; - } + if (IS_ERR(bsg_class)) + return PTR_ERR(bsg_class); bsg_class->devnode = bsg_devnode; ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); @@ -1012,8 +566,6 @@ unregister_chrdev: unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS); destroy_bsg_class: class_destroy(bsg_class); -destroy_kmemcache: - kmem_cache_destroy(bsg_cmd_cachep); return ret; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 82b6c27b3245..2eb87444b157 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3666,6 +3666,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) switch (ioprio_class) { default: printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + /* fall through */ case IOPRIO_CLASS_NONE: /* * no prio set, inherit CPU scheduling settings @@ -4735,12 +4736,13 @@ USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency); static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct cfq_data *cfqd = e->elevator_data; \ - unsigned int __data; \ + unsigned int __data, __min = (MIN), __max = (MAX); \ + \ cfq_var_store(&__data, (page)); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ + if (__data < __min) \ + __data = __min; \ + else if (__data > __max) \ + __data = __max; \ if (__CONV) \ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ else \ @@ -4769,12 +4771,13 @@ STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct cfq_data *cfqd = e->elevator_data; \ - unsigned int __data; \ + unsigned int __data, __min = (MIN), __max = (MAX); \ + \ cfq_var_store(&__data, (page)); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ + if (__data < __min) \ + __data = __min; \ + else if (__data > __max) \ + __data = __max; \ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ return count; \ } diff --git a/block/genhd.c b/block/genhd.c index f1543a45e73b..8cc719a37b32 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1333,21 +1333,28 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_round_stats(gp->queue, cpu, hd); part_stat_unlock(); part_in_flight(gp->queue, hd, inflight); - seq_printf(seqf, "%4d %7d %s %lu %lu %lu " - "%u %lu %lu %lu %u %u %u %u\n", + seq_printf(seqf, "%4d %7d %s " + "%lu %lu %lu %u " + "%lu %lu %lu %u " + "%u %u %u " + "%lu %lu %lu %u\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), disk_name(gp, hd->partno, buf), - part_stat_read(hd, ios[READ]), - part_stat_read(hd, merges[READ]), - part_stat_read(hd, sectors[READ]), - jiffies_to_msecs(part_stat_read(hd, ticks[READ])), - part_stat_read(hd, ios[WRITE]), - part_stat_read(hd, merges[WRITE]), - part_stat_read(hd, sectors[WRITE]), - jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])), + part_stat_read(hd, ios[STAT_READ]), + part_stat_read(hd, merges[STAT_READ]), + part_stat_read(hd, sectors[STAT_READ]), + jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])), + part_stat_read(hd, ios[STAT_WRITE]), + part_stat_read(hd, merges[STAT_WRITE]), + part_stat_read(hd, sectors[STAT_WRITE]), + jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])), inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), - jiffies_to_msecs(part_stat_read(hd, time_in_queue)) + jiffies_to_msecs(part_stat_read(hd, time_in_queue)), + part_stat_read(hd, ios[STAT_DISCARD]), + part_stat_read(hd, merges[STAT_DISCARD]), + part_stat_read(hd, sectors[STAT_DISCARD]), + jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD])) ); } disk_part_iter_exit(&piter); diff --git a/block/partition-generic.c b/block/partition-generic.c index 3dcfd4ec0e11..5a8975a1201c 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -130,19 +130,24 @@ ssize_t part_stat_show(struct device *dev, return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " - "%8u %8u %8u" + "%8u %8u %8u " + "%8lu %8lu %8llu %8u" "\n", - part_stat_read(p, ios[READ]), - part_stat_read(p, merges[READ]), - (unsigned long long)part_stat_read(p, sectors[READ]), - jiffies_to_msecs(part_stat_read(p, ticks[READ])), - part_stat_read(p, ios[WRITE]), - part_stat_read(p, merges[WRITE]), - (unsigned long long)part_stat_read(p, sectors[WRITE]), - jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), + part_stat_read(p, ios[STAT_READ]), + part_stat_read(p, merges[STAT_READ]), + (unsigned long long)part_stat_read(p, sectors[STAT_READ]), + jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])), + part_stat_read(p, ios[STAT_WRITE]), + part_stat_read(p, merges[STAT_WRITE]), + (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), + jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])), inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), - jiffies_to_msecs(part_stat_read(p, time_in_queue))); + jiffies_to_msecs(part_stat_read(p, time_in_queue)), + part_stat_read(p, ios[STAT_DISCARD]), + part_stat_read(p, merges[STAT_DISCARD]), + (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), + jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD]))); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, diff --git a/block/partitions/aix.c b/block/partitions/aix.c index 007f95eea0e1..903f3ed175d0 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -178,7 +178,7 @@ int aix_partition(struct parsed_partitions *state) u32 vgda_sector = 0; u32 vgda_len = 0; int numlvs = 0; - struct pvd *pvd; + struct pvd *pvd = NULL; struct lv_info { unsigned short pps_per_lv; unsigned short pps_found; @@ -232,10 +232,11 @@ int aix_partition(struct parsed_partitions *state) if (lvip[i].pps_per_lv) foundlvs += 1; } + /* pvd loops depend on n[].name and lvip[].pps_per_lv */ + pvd = alloc_pvd(state, vgda_sector + 17); } put_dev_sector(sect); } - pvd = alloc_pvd(state, vgda_sector + 17); if (pvd) { int numpps = be16_to_cpu(pvd->pp_count); int psn_part1 = be32_to_cpu(pvd->psn_part1); @@ -282,10 +283,14 @@ int aix_partition(struct parsed_partitions *state) next_lp_ix += 1; } for (i = 0; i < state->limit; i += 1) - if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) { + char tmp[sizeof(n[i].name) + 1]; // null char + + snprintf(tmp, sizeof(tmp), "%s", n[i].name); pr_warn("partition %s (%u pp's found) is " "not contiguous\n", - n[i].name, lvip[i].pps_found); + tmp, lvip[i].pps_found); + } kfree(pvd); } kfree(n); diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 0417937dfe99..16766f267559 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) { char buf[64]; int r_objid, r_name, r_id1, r_id2, len; - struct vblk_dgrp *dgrp; BUG_ON (!buffer || !vb); @@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) if (len != get_unaligned_be32(buffer + 0x14)) return false; - dgrp = &vb->vblk.dgrp; - ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); return true; } diff --git a/block/t10-pi.c b/block/t10-pi.c index a98db384048f..62aed77d0bb9 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -184,3 +184,113 @@ const struct blk_integrity_profile t10_pi_type3_ip = { .verify_fn = t10_pi_type3_verify_ip, }; EXPORT_SYMBOL(t10_pi_type3_ip); + +/** + * t10_pi_prepare - prepare PI prior submitting request to device + * @rq: request with PI that should be prepared + * @protection_type: PI type (Type 1/Type 2/Type 3) + * + * For Type 1/Type 2, the virtual start sector is the one that was + * originally submitted by the block layer for the ref_tag usage. Due to + * partitioning, MD/DM cloning, etc. the actual physical start sector is + * likely to be different. Remap protection information to match the + * physical LBA. + * + * Type 3 does not have a reference tag so no remapping is required. + */ +void t10_pi_prepare(struct request *rq, u8 protection_type) +{ + const int tuple_sz = rq->q->integrity.tuple_size; + u32 ref_tag = t10_pi_ref_tag(rq); + struct bio *bio; + + if (protection_type == T10_PI_TYPE3_PROTECTION) + return; + + __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); + u32 virt = bip_get_seed(bip) & 0xffffffff; + struct bio_vec iv; + struct bvec_iter iter; + + /* Already remapped? */ + if (bip->bip_flags & BIP_MAPPED_INTEGRITY) + break; + + bip_for_each_vec(iv, bip, iter) { + void *p, *pmap; + unsigned int j; + + pmap = kmap_atomic(iv.bv_page); + p = pmap + iv.bv_offset; + for (j = 0; j < iv.bv_len; j += tuple_sz) { + struct t10_pi_tuple *pi = p; + + if (be32_to_cpu(pi->ref_tag) == virt) + pi->ref_tag = cpu_to_be32(ref_tag); + virt++; + ref_tag++; + p += tuple_sz; + } + + kunmap_atomic(pmap); + } + + bip->bip_flags |= BIP_MAPPED_INTEGRITY; + } +} +EXPORT_SYMBOL(t10_pi_prepare); + +/** + * t10_pi_complete - prepare PI prior returning request to the block layer + * @rq: request with PI that should be prepared + * @protection_type: PI type (Type 1/Type 2/Type 3) + * @intervals: total elements to prepare + * + * For Type 1/Type 2, the virtual start sector is the one that was + * originally submitted by the block layer for the ref_tag usage. Due to + * partitioning, MD/DM cloning, etc. the actual physical start sector is + * likely to be different. Since the physical start sector was submitted + * to the device, we should remap it back to virtual values expected by the + * block layer. + * + * Type 3 does not have a reference tag so no remapping is required. + */ +void t10_pi_complete(struct request *rq, u8 protection_type, + unsigned int intervals) +{ + const int tuple_sz = rq->q->integrity.tuple_size; + u32 ref_tag = t10_pi_ref_tag(rq); + struct bio *bio; + + if (protection_type == T10_PI_TYPE3_PROTECTION) + return; + + __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); + u32 virt = bip_get_seed(bip) & 0xffffffff; + struct bio_vec iv; + struct bvec_iter iter; + + bip_for_each_vec(iv, bip, iter) { + void *p, *pmap; + unsigned int j; + + pmap = kmap_atomic(iv.bv_page); + p = pmap + iv.bv_offset; + for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { + struct t10_pi_tuple *pi = p; + + if (be32_to_cpu(pi->ref_tag) == ref_tag) + pi->ref_tag = cpu_to_be32(virt); + virt++; + ref_tag++; + intervals--; + p += tuple_sz; + } + + kunmap_atomic(pmap); + } + } +} +EXPORT_SYMBOL(t10_pi_complete); |